今日内容:
1.beautifulsoup
2.mongoDB
1.beautifulsoup
-1 基础知识
'''
什么是bs4?
一个用于解析HTML/XML文档的库,本身不基于re,而是构建在解析器(如lxml、html.parser)之上,可以提供一些强大的解析功能。
提高提取数据的效率
'''
# Import BeautifulSoup from the bs4 package.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Arg 1: the markup to parse; arg 2: the parser backend to use.
soup = BeautifulSoup(html_doc, 'lxml')

separator = '*' * 100
print(soup)
print(separator)
print(type(soup))
print(separator)

# Pretty-print the parsed document.
html = soup.prettify()
print(html)
-2 文档遍历
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
'''
Traversal topics:
1. Direct tag access
2. Tag name
3. Tag attributes
4. Tag text
5. Nested selection
6. Children / descendants
7. Parent / ancestors
8. Siblings
'''
# Direct access: soup.<tag> returns the first matching tag in the document.
print(soup.p)
print(soup.a)
# Tag name.
print(soup.head.name)
# Tag attributes: read the href attribute of the first <a>.
print(soup.a.attrs['href'])
# Tag text content.
print(soup.p.text)
# Nested selection.
print(soup.html.head)
# Children: returns an iterator — convert to a list to print.
print(list(soup.body.children))
# Descendants: returns a generator.
print(list(soup.body.descendants))
# Parent / ancestors (ancestors is a generator).
print(soup.p.parent)
print(list(soup.p.parents))
# Siblings.  BUG FIX: this heading line was missing its leading '#'
# in the original and made the script a SyntaxError.
# Next sibling(s).
print(soup.a.next_sibling)
print(list(soup.a.next_siblings))
# Previous sibling(s).
print(soup.a.previous_sibling)
print(list(soup.a.previous_siblings))
-3 文档搜索
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>tank</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.<hr></hr></p><p class="story">...</p>"""
from bs4 import BeautifulSoup
import re  # hoisted: the original imported re twice, mid-script

soup = BeautifulSoup(html_doc, 'lxml')

# --- String filters ---
# name: first <p> tag.
p_tag = soup.find(name='p')
print(p_tag)
# attrs: first node whose class is "sister".
p = soup.find(attrs={"class":"sister"})
print(p)
# All nodes whose class is "sister".
tag_s = soup.find_all(attrs={"class":"sister"})
print(tag_s)
# All <p> tags.
tag_s_2 = soup.find_all(name='p')
print(tag_s_2)
# text: combine name + attrs + text filters in one query.
a = soup.find(name="a", attrs={"id":"link2"}, text="Lacie")
print(a)

# --- Regex filter ---
# Matches any tag whose name contains 'p' (p, html — via regex search).
p_tag = soup.find(re.compile('p'))
print(p_tag)

# --- List filter ---
# Matches tags whose name equals any list entry / regex.
tags = soup.find_all(['a', 'p', re.compile('html')])
print(tags)

# --- Bool filter ---
# True matches any attribute value: find a <p> that has an id attribute.
p = soup.find(name='p', attrs={"id":True})
print(p)

# --- Function filter ---
# BUG FIX: the two heading lines below were plain text without '#'
# in the original and made the script a SyntaxError.
# Match <a> tags that carry BOTH an id and a class attribute.
# (The original comment said "has id and no class", which contradicted
# both the code and the function name; the comment now matches the code.)
def hava_id_class(tag):
    if tag.name == 'a' and tag.has_attr('id') and tag.has_attr('class'):
        return tag

tag = soup.find(name=hava_id_class)
print(tag)
-4 实战:爬取豌豆荚游戏应用
'''
主页:
图标地址、下载次数、大小、详情页地址
详情页:
游戏名、图标名、好评率、评论数、小编点评、简介、网友评论、1-5张截图链接地址、下载地址
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=666&page=2&ctoken=6ZKhTRrA2zSEgGntNW_gdGwy
page页数总共23
'''
import requests
from bs4 import BeautifulSoup
import re  # hoisted: the original imported re inside the per-card loop


def get_page(url):
    """GET *url* and return the requests.Response object."""
    response = requests.get(url)
    return response


def parse_detail(text):
    """Parse an app detail page: name, rating, comment count, review, download URL."""
    soup = BeautifulSoup(text, 'lxml')
    # App name
    name = soup.find(name="span", attrs={"class": "title"}).text
    print(name)
    # Positive-rating percentage
    love = soup.find(name="span", attrs={"class": "love"}).text
    # Number of comments
    commit_num = soup.find(name="a", attrs={"class": "love"}).text
    # Editor's review
    commit_content = soup.find(name="div", attrs={"class": "con"}).text
    # Download link
    download_url = soup.find(name='a', attrs={"class": "install-btn"}).attrs['href']
    print(download_url)


def parse_index(data):
    """Parse a listing fragment: icon, install count, size, detail URL per app card,
    then fetch and parse each detail page."""
    soup = BeautifulSoup(data, 'lxml')
    # Each app is a <li class="card"> element.
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    for app in app_list:
        # Icon URL
        img = app.find(name='img').attrs['data-original']
        print(img)
        # Install count
        down_num = app.find(name='span', attrs={"class": "install-count"}).text
        # Size.  BUG FIX: search within the current card (app), not the whole
        # soup — soup.find returned the same first match on every iteration.
        # The pattern is also a raw string now so \d is a regex digit class.
        size = app.find(name='span', text=re.compile(r"\d+MB")).text
        # Detail-page URL — same scoping fix as above.
        detail_url = app.find(name='a', attrs={"class": "detail-check-btn"}).attrs['href']
        response = get_page(detail_url)
        parse_detail(response.text)


def main():
    # 23 pages in total (see notes above): range(1, 24).
    # BUG FIX: the original range(1, 23) stopped at page 22.
    for line in range(1, 24):
        # BUG FIX: the original f-string used "(line)" — a literal, not a
        # placeholder — so every request fetched the same (invalid) page.
        url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=666&page={line}&ctoken=6ZKhTRrA2zSEgGntNW_gdGwy"
        response = get_page(url)
        print(response.text)
        print('*' * 100)
        data = response.json()
        # The listing HTML fragment lives under data -> content.
        app_li = data['data']['content']
        # BUG FIX: the original extracted app_li but never parsed it.
        parse_index(app_li)


if __name__ == "__main__":
    main()
2.mongoDB
基础操作
from pymongo import MongoClient
# 1. Connect to the MongoDB server.
# param 1: MongoDB host / IP address
# param 2: MongoDB port number (default 27017)
client = MongoClient('localhost',27017)
# print(client)
# # 2. Get a database handle (client['me_db']; created lazily on first write).
# print(client['me_db'])
# # 3. Get a collection handle.
# print(client['me_db']['people'])
# # 4. Insert one document into the collection.
# # NOTE(review): Collection.insert() is deprecated in pymongo 3.x —
# # prefer insert_one() / insert_many(). Kept as written in the notes.
# data1 = {
# 'name':'k',
# 'age':18,
# 'sex':'male'
# }
# client['me_db']['people'].insert(data1)
# # Insert multiple documents (list form).
# data2 = {
# 'name':'l',
# 'age':20,
# 'sex':'female'
# }
# data3 = {
# 'name':'p',
# 'age':23,
# 'sex':'male'
# }
# client['me_db']['people'].insert([data1,data2,data3])
# 5. Query: find({}) matches every document in the collection.
for data in client['me_db']['people'].find({}):
    print(data)