今日内容:
1.beautifulsoup
2.mongoDB
1.beautifulsoup
-1 基础知识
'''
什么是bs4?
一个用于解析HTML/XML文档的库,本身不基于re,而是构建在解析器(如lxml、html.parser)之上,可以提供一些强大的解析功能。
提高提取数据的效率
'''
# Import BeautifulSoup from the bs4 package.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Arg 1: the markup to parse; arg 2: the parser backend to use.
soup = BeautifulSoup(html_doc, 'lxml')

separator = '*' * 100
print(soup)
print(separator)
print(type(soup))
print(separator)

# Pretty-print the parsed document.
html = soup.prettify()
print(html)
-2 文档遍历
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
'''
Traversal topics:
1. Direct tag access
2. Tag name
3. Tag attributes
4. Tag text
5. Nested selection
6. Children / descendants
7. Parent / ancestors
8. Siblings
'''
# Direct access: soup.<tag> returns the first matching tag in the document.
print(soup.p)
print(soup.a)
# Tag name.
print(soup.head.name)
# Tag attributes: read the href attribute of the first <a>.
print(soup.a.attrs['href'])
# Tag text content.
print(soup.p.text)
# Nested selection.
print(soup.html.head)
# Children: returns an iterator — convert to a list to print.
print(list(soup.body.children))
# Descendants: returns a generator.
print(list(soup.body.descendants))
# Parent / ancestors (ancestors is a generator).
print(soup.p.parent)
print(list(soup.p.parents))
# Siblings.  BUG FIX: this heading line was missing its leading '#'
# in the original and made the script a SyntaxError.
# Next sibling(s).
print(soup.a.next_sibling)
print(list(soup.a.next_siblings))
# Previous sibling(s).
print(soup.a.previous_sibling)
print(list(soup.a.previous_siblings))
-3 文档搜索
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>tank</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.<hr></hr></p><p class="story">...</p>"""
from bs4 import BeautifulSoup
import re  # hoisted: the original imported re twice, mid-script

soup = BeautifulSoup(html_doc, 'lxml')

# --- String filters ---
# name: first <p> tag.
p_tag = soup.find(name='p')
print(p_tag)
# attrs: first node whose class is "sister".
p = soup.find(attrs={"class":"sister"})
print(p)
# All nodes whose class is "sister".
tag_s = soup.find_all(attrs={"class":"sister"})
print(tag_s)
# All <p> tags.
tag_s_2 = soup.find_all(name='p')
print(tag_s_2)
# text: combine name + attrs + text filters in one query.
a = soup.find(name="a", attrs={"id":"link2"}, text="Lacie")
print(a)

# --- Regex filter ---
# Matches any tag whose name contains 'p' (p, html — via regex search).
p_tag = soup.find(re.compile('p'))
print(p_tag)

# --- List filter ---
# Matches tags whose name equals any list entry / regex.
tags = soup.find_all(['a', 'p', re.compile('html')])
print(tags)

# --- Bool filter ---
# True matches any attribute value: find a <p> that has an id attribute.
p = soup.find(name='p', attrs={"id":True})
print(p)

# --- Function filter ---
# BUG FIX: the two heading lines below were plain text without '#'
# in the original and made the script a SyntaxError.
# Match <a> tags that carry BOTH an id and a class attribute.
# (The original comment said "has id and no class", which contradicted
# both the code and the function name; the comment now matches the code.)
def hava_id_class(tag):
    if tag.name == 'a' and tag.has_attr('id') and tag.has_attr('class'):
        return tag

tag = soup.find(name=hava_id_class)
print(tag)
-4 实战:爬取豌豆荚游戏应用
'''
主页:
图标地址、下载次数、大小、详情页地址
详情页:
游戏名、图标名、好评率、评论数、小编点评、简介、网友评论、1-5张截图链接地址、下载地址
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=666&page=2&ctoken=6ZKhTRrA2zSEgGntNW_gdGwy
page页数总共23
'''
import requests
from bs4 import BeautifulSoup
import re  # hoisted: the original imported re inside the per-card loop


def get_page(url):
    """GET *url* and return the requests.Response object."""
    response = requests.get(url)
    return response


def parse_detail(text):
    """Parse an app detail page: name, rating, comment count, review, download URL."""
    soup = BeautifulSoup(text, 'lxml')
    # App name
    name = soup.find(name="span", attrs={"class": "title"}).text
    print(name)
    # Positive-rating percentage
    love = soup.find(name="span", attrs={"class": "love"}).text
    # Number of comments
    commit_num = soup.find(name="a", attrs={"class": "love"}).text
    # Editor's review
    commit_content = soup.find(name="div", attrs={"class": "con"}).text
    # Download link
    download_url = soup.find(name='a', attrs={"class": "install-btn"}).attrs['href']
    print(download_url)


def parse_index(data):
    """Parse a listing fragment: icon, install count, size, detail URL per app card,
    then fetch and parse each detail page."""
    soup = BeautifulSoup(data, 'lxml')
    # Each app is a <li class="card"> element.
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    for app in app_list:
        # Icon URL
        img = app.find(name='img').attrs['data-original']
        print(img)
        # Install count
        down_num = app.find(name='span', attrs={"class": "install-count"}).text
        # Size.  BUG FIX: search within the current card (app), not the whole
        # soup — soup.find returned the same first match on every iteration.
        # The pattern is also a raw string now so \d is a regex digit class.
        size = app.find(name='span', text=re.compile(r"\d+MB")).text
        # Detail-page URL — same scoping fix as above.
        detail_url = app.find(name='a', attrs={"class": "detail-check-btn"}).attrs['href']
        response = get_page(detail_url)
        parse_detail(response.text)


def main():
    # 23 pages in total (see notes above): range(1, 24).
    # BUG FIX: the original range(1, 23) stopped at page 22.
    for line in range(1, 24):
        # BUG FIX: the original f-string used "(line)" — a literal, not a
        # placeholder — so every request fetched the same (invalid) page.
        url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=666&page={line}&ctoken=6ZKhTRrA2zSEgGntNW_gdGwy"
        response = get_page(url)
        print(response.text)
        print('*' * 100)
        data = response.json()
        # The listing HTML fragment lives under data -> content.
        app_li = data['data']['content']
        # BUG FIX: the original extracted app_li but never parsed it.
        parse_index(app_li)


if __name__ == "__main__":
    main()
2.mongoDB
基础操作
from pymongo import MongoClient
# 1. Connect to the MongoDB server.
# param 1: MongoDB host / IP address
# param 2: MongoDB port number (default 27017)
client = MongoClient('localhost',27017)
# print(client)
# # 2. Get a database handle (client['me_db']; created lazily on first write).
# print(client['me_db'])
# # 3. Get a collection handle.
# print(client['me_db']['people'])
# # 4. Insert one document into the collection.
# # NOTE(review): Collection.insert() is deprecated in pymongo 3.x —
# # prefer insert_one() / insert_many(). Kept as written in the notes.
# data1 = {
# 'name':'k',
# 'age':18,
# 'sex':'male'
# }
# client['me_db']['people'].insert(data1)
# # Insert multiple documents (list form).
# data2 = {
# 'name':'l',
# 'age':20,
# 'sex':'female'
# }
# data3 = {
# 'name':'p',
# 'age':23,
# 'sex':'male'
# }
# client['me_db']['people'].insert([data1,data2,data3])
# 5. Query: find({}) matches every document in the collection.
for data in client['me_db']['people'].find({}):
    print(data)