1. A simple Maoyan crawler (using regex on strings)
import json
import requests
import re


# Fetch a page and return its HTML
def get_page(url):
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None
# Extract the fields of interest from the page
def parse_page(html):
    # Lead actors
    pattern = re.compile('<p class="star">(.*?)</p>', re.S)
    actor_items = re.findall(pattern, html)
    # Movie titles
    pattern = re.compile('movieId.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', re.S)
    title_items = re.findall(pattern, html)
    # Release dates
    pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
    time_items = re.findall(pattern, html)
    # Ranking
    pattern = re.compile('<i class="board-index.*?">(.*?)</i>', re.S)
    rank_items = re.findall(pattern, html)
    # Poster image links
    # pattern = re.compile('movieId.*?>.*?<img.*?<img.*?src="(.*?)"', re.S)
    # items = re.findall(pattern, html)
    movies = []
    for i in range(len(actor_items)):
        one_movie = {}
        one_movie['title'] = title_items[i]
        one_movie['actor'] = actor_items[i].strip()[3:]   # drop the "主演:" prefix
        one_movie['time'] = time_items[i].strip()[5:]     # drop the "上映时间:" prefix
        one_movie['rank'] = rank_items[i]
        movies.append(one_movie)
    return movies
# Download an image and write it to the ./images directory
def write_img(url):
    arr = url.split('@')
    filename = arr[0].split('/')[-1]
    with open('./images/%s' % filename, 'wb') as f:
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
        }
        response = requests.get(url, headers=headers)
        f.write(response.content)
# Scrape every page, print the results, and append them to a JSON file
def main():
    # Maoyan Top-100 board, 10 movies per page
    urls = ['http://maoyan.com/board/4?offset=0',
            'http://maoyan.com/board/4?offset=10',
            'http://maoyan.com/board/4?offset=20',
            'http://maoyan.com/board/4?offset=30',
            'http://maoyan.com/board/4?offset=40',
            'http://maoyan.com/board/4?offset=50',
            'http://maoyan.com/board/4?offset=60',
            'http://maoyan.com/board/4?offset=70',
            'http://maoyan.com/board/4?offset=80',
            'http://maoyan.com/board/4?offset=90', ]
    for url in urls:
        html = get_page(url)
        movies = parse_page(html)
        movies_json = json.dumps(movies, ensure_ascii=False)
        with open('a.json', 'a', encoding='utf-8') as f:
            f.write(movies_json)
        for item in movies:
            print(item)
            # write_img(item.strip())
        # print(html)
        # print(items)


if __name__ == '__main__':
    main()
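Since each page of the Top-100 board only shifts the offset query parameter by 10, the hard-coded URL list in main() could just as well be generated. A minimal sketch (same endpoint as above, nothing else assumed):

# Build the ten board URLs by stepping offset from 0 to 90 in steps of 10
urls = ['http://maoyan.com/board/4?offset=%d' % offset for offset in range(0, 100, 10)]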
2. A simple Douban crawler (using XPath)
import requests
from lxml import etree


# Fetch the page HTML
def get_one_page():
    url = "https://www.douban.com/group/explore/culture"
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        text = response.content.decode('utf-8')
        return text
    return None
# Parse the page with XPath
def parse_with_xpath(html):
    etree_html = etree.HTML(html)
    # print(etree_html)

    # Match all nodes: //*
    # result = etree_html.xpath('//*')
    # print(result)
    # print(len(result))

    # Match all <a> nodes; text() extracts their text
    # result = etree_html.xpath('//a/text()')
    # print(result)

    # Direct child nodes: /
    # result = etree_html.xpath('//div/p/text()')
    # print(result)

    # All descendant nodes: //
    # result = etree_html.xpath('//div[@class="channel-item"]//h3/a/text()')
    # print(result)

    # Parent node: ..
    # result = etree_html.xpath('//span[@class="pubtime"]/../span/a/text()')
    # print(result)

    # Attribute match: [@class="xxx"]
    # Text match: text(); use //text() to get all text
    result = etree_html.xpath('//div[@class="article"]//text()')
    print(result)

    # Attribute extraction: @href
    # result = etree_html.xpath('//div[@class="article"]/div/div/@class')[0]
    # # result = etree_html.xpath('//div[@class="bd"]/h3/a/@href')
    # print(result)

    # Multi-valued attribute match: contains(@class, 'xx')
    # result = etree_html.xpath('//div[contains(@class, "grid-16-8")]//div[@class="likes"]/text()[1]')
    # print(result)

    # Multiple-attribute match: or, and, mod, //book | //cd, + - * div = != < > <= >=
    # result = etree_html.xpath('//span[@class="pubtime" and contains(text(), "10-18")]/text()')
    # print(result)

    # Select by position: [1], [last()], [position() < 3], [last() - 2]

    # Node axes:
    # //li/ancestor::*                       all ancestor nodes
    # //li/ancestor::div                     the div ancestor(s)
    # //li/attribute::*                      attribute axis: all attribute values of the li node
    # //li/child::a[@href="link1.html"]      child axis: direct child nodes
    # //li/descendant::span                  all span descendants
    # //li/following::*                      every node after the closing tag of the current node
    # //li/following-sibling::*              all siblings that follow the current node
    # result = etree_html.xpath('//li/ancestor::div')
    # print(result)
def main():
    html = get_one_page()
    # print(html)
    parse_with_xpath(html)


if __name__ == '__main__':
    main()
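The positional predicates and node axes listed only as comments above are easiest to see on a tiny snippet. A minimal sketch (the HTML fragment is made up purely for illustration):

from lxml import etree

# A made-up fragment to exercise positional selection and node axes
snippet = '''
<div id="box">
  <ul>
    <li><a href="link1.html">first</a></li>
    <li><a href="link2.html">second</a></li>
    <li><a href="link3.html">third</a></li>
  </ul>
</div>
'''
tree = etree.HTML(snippet)

print(tree.xpath('//li[1]/a/text()'))                           # ['first']
print(tree.xpath('//li[last()]/a/text()'))                      # ['third']
print(tree.xpath('//li[position() < 3]/a/text()'))              # ['first', 'second']
print(tree.xpath('//li/ancestor::div/@id'))                     # ['box']
print(tree.xpath('//li/child::a[@href="link1.html"]/text()'))   # ['first']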
3. Scraping with Beautiful Soup
import requests
from bs4 import BeautifulSoup


# Fetch the page HTML
def get_one_page():
    url = "https://www.zhipin.com/c101270100-p100109/"
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        text = response.content.decode('utf-8')
        return text
    return None
def parse_soup(html):
    soup = BeautifulSoup(html, "lxml")  # build the BeautifulSoup object with the lxml parser
    print(soup.prettify())              # pretty-print the page with indentation
    print(soup.title.string)            # text of the page <title>
    print(soup.head)
    print(soup.p)

    # Name of a node
    print(soup.title.name)

    # Attributes of a node
    print(soup.img.attrs["src"])
    print(soup.p.attrs)
    print(soup.p.attrs["name"])
    print(soup.p["class"])

    # Text contained in a node
    print(soup.p.string)
<p class="c1"><span>asdf<span>asdfasdfasdfasdfadsfad</span></span></p>
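A note on the fragment above (my reading, not spelled out in the original): .string only returns text when a tag has exactly one child, so for mixed or nested content like this it comes back as None, and get_text() is the safer call. A minimal sketch with a made-up fragment:

from bs4 import BeautifulSoup

# Made-up fragment mirroring the one above
fragment = '<p class="c1"><span>asdf<span>more text</span></span></p>'
doc = BeautifulSoup(fragment, 'lxml')

print(doc.p.string)      # None -- the outer <span> has two children, so .string is ambiguous
print(doc.p.get_text())  # 'asdfmore text' -- concatenates all descendant text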
Nested selection
<head>
    <title>this is title</title>
</head>
# Every node in soup is a bs4.element.Tag, so selections can be chained
print(soup.head.title.string)
Associative selection
Some elements have no distinguishing features to locate them directly. In that case, first select a node that can be located, then move from it to its child, parent, or sibling nodes.
<p class="p1"></p>
<p></p>
<p></p>
print(soup.p.contents)            # list of the p node's direct children
print(soup.p.descendants)         # all descendants of the p node (a generator)
print(soup.a.parent)              # parent node
print(soup.a.parents)             # all ancestor nodes (a generator)
print(soup.a.next_sibling)        # next sibling
print(soup.a.previous_sibling)    # previous sibling
print(soup.a.next_siblings)       # all following siblings (a generator)
print(soup.a.previous_siblings)   # all preceding siblings (a generator)
print(list(soup.a.parents)[0].attrs['class'])
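As the note above says, the practical pattern is to grab an easy-to-locate node and walk to its relatives from there. A minimal, self-contained sketch (the HTML fragment is made up for illustration):

from bs4 import BeautifulSoup

# Made-up fragment: start from the easy-to-find <a> and move to its relatives
html = '''
<div class="post">
  <p class="p1">intro</p>
  <p>body <a href="more.html">read more</a></p>
  <p>footer</p>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

a = soup.a
print(a.parent.name)                         # 'p' -- the paragraph containing the link
print(a.parent.find_previous_sibling('p'))   # <p class="p1">intro</p>
print([p.name for p in a.parents])           # ['p', 'div', 'body', 'html', '[document]']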
Method selectors
find_all() and its relatives search by node name, attributes, and text.
<ul><li></li><li>jjj</li></ul>
<ul><li></li><li></li></ul>
print(soup.find_all(name="ul"))
for ul in soup.find_all(name="ul"):
    print(ul.find_all(name="li"))
    for li in ul.find_all(name="li"):
        print(li.string)
soup.find_all(attrs={"id": "list-1"})
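find_all() can filter on attributes and on text as well as on the node name. A minimal sketch with a made-up fragment (the ids and strings are illustrative only):

import re
from bs4 import BeautifulSoup

# Made-up fragment to exercise attribute and text filters
html = '''
<ul id="list-1"><li>apple</li><li>banana</li></ul>
<ul id="list-2"><li>carrot</li></ul>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.find_all(attrs={'id': 'list-1'}))          # only the first <ul>
print(soup.find_all('li', string=re.compile('an')))   # [<li>banana</li>]
print(soup.find('ul', id='list-2').li.string)         # 'carrot'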
CSS selectors
<p id="p1" class="panel"></p><p class=""></p><p></p><p></p>
soup.select('.panel .panel_heading')
soup.select('ul li')
soup.select('#id1 .element')
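select() accepts standard CSS selectors like the ones above. A minimal sketch (the fragment, ids, and class names are made up to match those selectors):

from bs4 import BeautifulSoup

# Made-up fragment matching the selectors shown above
html = '''
<div id="id1" class="panel">
  <div class="panel_heading">Heading</div>
  <ul><li class="element">one</li><li class="element">two</li></ul>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.select('.panel .panel_heading'))         # [<div class="panel_heading">Heading</div>]
print([li.string for li in soup.select('ul li')])   # ['one', 'two']
print(soup.select('#id1 .element')[0].string)       # 'one'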
def main():
    html = get_one_page()
    # print(html)
    parse_soup(html)


if __name__ == '__main__':
    main()