1 Using crawler parsing libraries
XPath parsing library
Using the XPath parsing library requires installing lxml first:
pip3 install lxml
Beautiful Soup parsing library
pip3 install beautifulsoup4
The requests library is also needed:
pip3 install requests
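To quickly confirm that all three libraries installed correctly, a minimal check is to import them and print their versions (these version attributes are the standard ones exposed by the packages):
import lxml.etree
import bs4
import requests

# Print versions to confirm the installs succeeded
print(lxml.etree.LXML_VERSION)
print(bs4.__version__)
print(requests.__version__)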
2 Getting data from JS
Extracting the data that the backend interface passes to the page's JS
import re
import json
import requests

# Fetch the page
def get_page():
    # Target URL
    url = 'http://cd.meituan.com/meishi/b6119/'
    # Spoof the request headers
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    # Request the page and return the decoded body
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None

def main():
    html = get_page()
    print(html)
    # Pull the JSON that the page embeds for its JS out of the HTML
    pattern = re.compile('"poiInfos":(.*?)},"comHeader"', re.S)
    result = re.findall(pattern, html)
    print(result)
    meituan = json.loads(result[0])
    print(len(meituan))
    for item in meituan:
        print(item['title'])

if __name__ == '__main__':
    main()
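The same technique works for any page that embeds JSON for its front-end scripts: match the JSON fragment with a regular expression, then hand it to json.loads. A minimal sketch on an inline HTML string (the page content and the poiInfos data below are made up for illustration):
import re
import json

# Hypothetical page source with JSON embedded in a <script> block
html = '<script>window.__DATA__ = {"poiInfos":[{"title":"Shop A"},{"title":"Shop B"}],"comHeader":{}};</script>'

# Capture the JSON array that follows "poiInfos":
pattern = re.compile(r'"poiInfos":(\[.*?\]),"comHeader"', re.S)
match = pattern.search(html)
if match:
    pois = json.loads(match.group(1))
    for poi in pois:
        print(poi['title'])  # Shop A, Shop B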
3 Using lxml
import requests
from lxml import etree

# Fetch the page HTML
def get_one_page():
    url = "https://www.douban.com/group/explore"
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        text = response.content.decode('utf-8')
        return text
    return None

# Parse the page
def parse_with_xpath(html):
    etree_html = etree.HTML(html)
    print(etree_html)

    channel_result = etree_html.xpath('//div[@class="channel-item"]')
    for channel in channel_result:
        title = channel.xpath('./div[@class="bd"]/h3/a/text()')[0]
        print(title)

    title_result = etree_html.xpath('//div[@class="channel-item"]/div[@class="bd"]/h3/a/text()')
    print(title_result)

    # Match all nodes: //* ; a double slash matches anywhere in the document (from the root down),
    # a single slash selects direct children of the previous node
    result = etree_html.xpath('//*')
    print(result)
    print(len(result))

    # Match all <a> nodes: //a ; get their text with text()
    result = etree_html.xpath('//a/text()')
    print(result)

    # Direct child nodes: /
    result = etree_html.xpath('//div/p/text()')
    print(result)

    # All descendant nodes of an element: //
    result = etree_html.xpath('//div[@class="channel-item"]')
    print(len(result))

    # Union of two selections: |
    result = etree_html.xpath('//div[@class="channel-item"] | //span[@class="pubtime"]/../span/a/text()')
    print(result)

    # Parent node: ..
    result = etree_html.xpath('//span[@class="pubtime"]/../span/a/text()')
    print(result)

    # Attribute match: [@class="xxx"]
    # Text match: text() ; get all descendant text with //text()
    result = etree_html.xpath('//div[@class="article"]//text()')
    print(result)

    # Attribute extraction: @href
    result = etree_html.xpath('//div[@class="article"]/div/div/@class')[0]
    print(result)
    result = etree_html.xpath('//div[@class="bd"]/h3/a/@href')
    print(result)

    # Multi-valued attribute match: contains(@class, "xx")
    result = etree_html.xpath('//div[contains(@class, "grid-16-8")]//div[@class="likes"]/text()[1]')
    print(result)

    # Matching on several attributes and operators: or, and, mod, //book | //cd, + - * div = != < > <= >=
    result = etree_html.xpath('//span[@class="pubtime" and contains(text(), "-12-29")]/text()')
    print(result)

    # Selecting by position: [1] [last()] [position() < 3] [last() - 2]
    # Node axes
    result = etree_html.xpath('//div/child::div[@class="likes"]/following-sibling::*//span[@class="pubtime"]/text()')
    print(result)
    print(len(result))

    # //li/ancestor::*                    all ancestor nodes
    # //li/ancestor::div                  the div ancestor node
    # //li/attribute::*                   attribute axis: all attribute values of the li node
    # //li/child::a[@href="link1.html"]   child axis: direct child nodes
    # //li/descendant::span               all span descendants
    # //li/following::*                   all nodes after the closing tag of the current node
    # //li/following-sibling::*           all following siblings of the current node
    result = etree_html.xpath('//div[@class="channel-item"][1]/following-sibling::*')
    print(result)
    print(len(result))

    result = etree_html.xpath('//div[contains(@class, "channel-group-rec")]//div[@class="title"]/following::*[1]/text()')
    print(result)

def main():
    html = get_one_page()
    print(html)
    parse_with_xpath(html)

if __name__ == '__main__':
    main()
channel_result = etree_html.xpath('//div[@class="channel-item"]') selects the div nodes whose class attribute is channel-item
channel_result = etree_html.xpath('//div[@class="channel-item"]/text()') gets only the text directly inside those nodes (not the text of nested tags)
channel_result = etree_html.xpath('//div[@class="channel-item"]//text()') gets the text of those nodes and of all their descendants
A double slash matches anywhere below the current context; a single slash before text() gets the text directly inside the tag; //text() gets all text, including that of descendant tags.
/ selects the direct children of a tag; // selects all tags beneath it; ./ selects children relative to the current tag; .// selects all descendants relative to the current tag.
result = etree_html.xpath('//div[@class="bd"]/h3/a/@href') extracts an attribute value
| means union: select several result sets in one expression.
Selecting by position: [1] the first node, [last()] the last, [position() < 3] the first two, [last() - 2] the third from the end
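The selectors above can be tried out without any network request. A small self-contained sketch on an invented HTML snippet (the class names and links are made up for illustration):
from lxml import etree

html = '''
<div class="channel-item">
  <div class="bd"><h3><a href="/topic/1">First topic</a></h3></div>
</div>
<div class="channel-item">
  <div class="bd"><h3><a href="/topic/2">Second topic</a></h3></div>
</div>
'''
root = etree.HTML(html)

# // matches anywhere; [@class="..."] filters by attribute
print(root.xpath('//div[@class="channel-item"]'))

# /text() returns only the direct text of the matched <a> nodes
print(root.xpath('//div[@class="bd"]/h3/a/text()'))    # ['First topic', 'Second topic']

# @href extracts an attribute value
print(root.xpath('//div[@class="bd"]/h3/a/@href'))     # ['/topic/1', '/topic/2']

# Selecting by position: [1] is the first matched node
print(root.xpath('(//div[@class="channel-item"])[1]//a/text()'))  # ['First topic']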
4 Using BeautifulSoup
from bs4 import BeautifulSoup
import requests

# Fetch the page HTML
def get_one_page():
    url = "http://sports.sina.com.cn/nba/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        text = response.content.decode('utf-8')
        return text
    return None

def parse_with_bs4(html):
    # html = '<p><div><a></a></div></p>'
    # print(html)
    soup = BeautifulSoup(html, 'lxml')

    # Pretty-print the page with consistent indentation
    print(soup.prettify())

    # Text of the <title> tag inside <head>
    print(soup.title.string)

    # Grab an entire specified tag
    print(soup.head)
    print(type(soup.head))
    print(soup.p)
    print(soup.p.name)
    print(soup.img.attrs["src"])
    print(soup.img.attrs)
    print(soup.img.attrs['src'])
    print(soup.img['src'])

    print(soup.p)
    print(soup.p.contents)     # list of the direct children of the p node
    print(soup.p.descendants)  # generator over all descendants of the p node

    # Ancestors of the <a> tag
    # index [0] is its direct parent, [1] is one level further up
    print(soup.a.parent)        # the direct parent node
    print(soup.a.parents)       # generator over all ancestor nodes
    print(list(soup.a.parents))
    print(list(soup.a.parents)[0].attrs['class'])

    print(soup.head.title.string)

    result = soup.select('.news-list-b .list .item p a')
    for item in result:
        print(item.string)
        print(item['href'])

    result = soup.select('.-live-layout-row.layout_sports_350_650')
    print(result)

    l = soup.select('.ct_t_01 a')
    for item in l:
        print(item.string)
        print(item['href'])
    print(len(l))

    item = soup.select('#syncad_1 p')[0]
    print(item)
    print(item.contents)
    print(len(item.contents))

    item = soup.select('.b_time')[0].string
    print(item)

def main():
    html = get_one_page()
    # print(html)
    parse_with_bs4(html)

if __name__ == '__main__':
    main()
soup = BeautifulSoup(html, 'lxml') builds the soup object.
soup.prettify() formats the parsed HTML with consistent indentation.
soup.title.string gets the text inside <title>; soup.head returns the <head> tag and everything inside it.
soup.p.name gets the tag name of the p node (here "p"); soup.img['src'] gets the src attribute of the img tag.
soup.p.contents is the list of the p node's direct children.
soup.p.descendants iterates over all descendants of the p node; soup.a.parent is the direct parent; soup.a.parents iterates over all ancestors; list(soup.a.parents)[0].attrs['class'] reads the class attribute of the parent ([0] is the direct parent, [1] is one level further up).
result = soup.select('.news-list-b .list .item p a')
for item in result:
    print(item.string)
    print(item['href'])
This selects the a tags nested under the given classes and prints their text and href attributes.
result = soup.select('.-live-layout-row.layout_sports_350_650') selects nodes that carry both classes at once.
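These accessors can also be exercised on an inline snippet with no network access. A small sketch on invented HTML (the class names and link below are made up):
from bs4 import BeautifulSoup

html = '''
<html><head><title>NBA news</title></head>
<body>
  <p class="intro">Intro text <img src="/cover.png"/></p>
  <div class="news-list-b">
    <div class="list"><div class="item"><p><a href="/news/1">Game recap</a></p></div></div>
  </div>
</body></html>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.title.string)    # NBA news
print(soup.img['src'])      # /cover.png
print(soup.p.contents)      # direct children of the first <p>: text plus the <img> tag
print(soup.a.parent.name)   # p (direct parent of the <a> tag)

# CSS selectors via select(): nested classes, then read text and attributes
for item in soup.select('.news-list-b .list .item p a'):
    print(item.string, item['href'])  # Game recap /news/1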
5 Saving the data to a database
5.1 Table creation statements
create database maoyan default character set='utf8';
use maoyan;
create table movie (
    id int primary key auto_increment,
    title varchar(256),
    actor varchar(256),
    detail varchar(1024),
    cover_url varchar(1024)
);
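The schema can also be applied from Python instead of the mysql client. A minimal sketch, assuming the same local MySQL server and root credentials used by the helper functions in 5.2:
import pymysql

# Connect without selecting a database so CREATE DATABASE can run first
db = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', charset='utf8')
cursor = db.cursor()
cursor.execute("create database if not exists maoyan default character set 'utf8'")
cursor.execute("use maoyan")
cursor.execute("""
    create table if not exists movie (
        id int primary key auto_increment,
        title varchar(256),
        actor varchar(256),
        detail varchar(1024),
        cover_url varchar(1024)
    )
""")
db.close()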
5.2 Helper functions for saving data
import pymysql

# Get a database connection
def get_connection():
    host = '127.0.0.1'
    port = 3306
    user = 'root'
    password = '123456'
    database = 'maoyan'
    db = pymysql.connect(host=host, port=port, user=user, password=password,
                         database=database, charset='utf8')
    return db

# Get a database cursor
def get_cursor(db):
    cursor = db.cursor()
    return cursor

# Close the connection
def close_connection(db):
    db.close()

# Insert one record
def insert_record(db, cursor, item):
    # Use a parameterized query so quotes in the scraped data do not break the SQL
    sql = 'insert into movie (title, actor, detail, cover_url) values (%s, %s, %s, %s)'
    print(sql)
    cursor.execute(sql, (item['movie_name'], item['actor'], item['detail'], item['cover']))
    # The insert must be committed before the data is actually stored
    db.commit()
5.3 Saving data from the crawler
from maoyan_db_helper import *

db = get_connection()
cursor = get_cursor(db)
# titles, actors, details and covers are assumed to be parallel lists built by the scraping code above
for i in range(len(details)):
    simple_message = {}
    simple_message['movie_name'] = titles[i]
    simple_message['actor'] = actors[i]
    simple_message['detail'] = details[i]
    simple_message['cover'] = covers[i]
    insert_record(db, cursor, simple_message)
close_connection(db)