Today's Homework

Program Code
'''
url:
https://www.wandoujia.com/category/6001
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=2sFZJXOEckN_7qBULrSyfvj9
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=2sFZJXOEckN_7qBULrSyfvj9
32 pages in total
'''
import requests
from bs4 import BeautifulSoup as bs
import lxml
import re
from pymongo import MongoClient

# Send a GET request
def get_page(url):
    response = requests.get(url)
    return response

# Parse an app detail page
def parse_detail(data):
    soup = bs(data, 'lxml')
    # App name
    name = soup.find(name='span', attrs={'class': 'title'}).text
    # Positive-rating percentage
    love = soup.find(name='span', attrs={'class': 'love'}).text
    # Number of comments
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text
    # Editor's review
    commit = soup.find(name='div', attrs={'class': 'con'}).text
    # Download link
    download = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']
    # Description
    intro = soup.find(name='div', attrs={'class': 'desc-info'}).text
    # User comment (stars, name, text, time); not every page has one
    star_dict = {'width: 20%': '1 star', 'width: 40%': '2 stars', 'width: 60%': '3 stars',
                 'width: 80%': '4 stars', 'width: 100%': '5 stars'}
    try:
        star = star_dict[soup.find(name='i', attrs={'class': 'score-current'}).attrs['style']]
        user_name = soup.find(name='span', attrs={'class': 'name'}).text
        time = soup.find(name='span', attrs={'class': 'time'}).text
        user_commit = soup.find(name='p', attrs={'class': 'cmt-content'}).text
    except (AttributeError, KeyError):
        star = None
        user_name = None
        time = None
        user_commit = None
    # Screenshot links 1-5
    link = []
    for i in range(0, 5):
        img_tag = soup.find(name='img', attrs={'data-index': '{}'.format(i)})
        if img_tag:
            link.append(img_tag.attrs['src'])
    # Store all fields of one app as a single document
    client['wandoujia']['detail'].insert_one({
        'app_name': name,
        'love': love,
        'commit_num': commit_num,
        'commit': commit,
        'download_link': download,
        'intro': intro,
        'user_name': user_name,
        'time': time,
        'star': star,
        'user_commit': user_commit,
        'pic_link': link,
    })

# Parse the index page
def parse_text(data):
    soup = bs(data, 'lxml')
    # Get the li tags of all apps
    li_data = soup.find_all(name='li', attrs={'class': 'card'})
    for li in li_data:
        # Icon address
        img = li.find(name='img').attrs['data-original']
        # Number of downloads
        download_count = li.find(name='span', attrs={'class': 'install-count'}).text
        # Size
        size = li.find(name='span', text=re.compile(r'\w+B')).text
        # Detail page link
        detail_url = li.find(name='a').attrs['href']
        # Store one index document per app
        client['wandoujia']['index'].insert_one({
            'icon_addr': img,
            'download_count': download_count,
            'size': size,
            'detail_url': detail_url,
        })
        # Request and parse the detail page
        detail_response = get_page(detail_url)
        parse_detail(detail_response.text)

def main():
    for i in range(1, 33):
        url = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=2sFZJXOEckN_7qBULrSyfvj9'.format(i)
        response = get_page(url)
        # Deserialize the JSON response into a dict
        data = response.json()
        app_li = data['data']['content']
        # Parse the index-page HTML fragment
        parse_text(app_li)
    # Close the mongo client
    client.close()

if __name__ == '__main__':
    client = MongoClient('localhost', 27017)
    main()
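To spot-check what the scraper stored, a quick readback sketch (assumes MongoDB is running locally and the scraper above has been run):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
# Print the first stored index document and the first stored detail document
print(client['wandoujia']['index'].find_one())
print(client['wandoujia']['detail'].find_one())
client.close()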
1. Basic Usage
'''
BeautifulSoup4
1. What is bs4?
A parsing library for HTML/XML: it handles searching and navigation itself,
delegating the actual parsing to an underlying parser such as lxml or
Python's built-in html.parser.
'''
from bs4 import BeautifulSoup as bs
import lxml
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Instantiate BeautifulSoup to get a soup object
# Argument 1: the text to parse; argument 2: the parser to use
soup = bs(html_doc, 'lxml')
print(soup)
# Pretty-print the document
html = soup.prettify()
print(html)
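If lxml isn't installed, the parser bundled with Python works as a drop-in for these examples (output formatting can differ slightly between parsers):

# html.parser ships with the standard library, so no extra install is needed
soup2 = bs(html_doc, 'html.parser')
print(soup2.prettify())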
2. Traversing the Document Tree
from bs4 import BeautifulSoup as bs
import lxml
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>hy</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""
soup = bs(html_doc, 'lxml')
'''
1. Direct attribute access
2. Get a tag's name
3. Get a tag's attributes
4. Get a tag's text content
5. Nested selection
6. Children and descendants
7. Parent and ancestors
8. Siblings
'''
'''
1. Direct attribute access
'''
# Find the first p tag
print(soup.p)
print(soup.a)
'''
2. Get a tag's name
'''
print(soup.head.name)
'''
3. Get a tag's attributes
'''
# Get all attributes of the a tag
print(soup.a.attrs)
# Get the href attribute of the a tag
print(soup.a.attrs['href'])
'''
4. Get a tag's text content
'''
print(soup.p.text)
'''
5. Nested selection
'''
print(soup.html.head)
'''
6. Children and descendants
'''
# All children of body; returns an iterator
print(soup.body.children)
# Force conversion to a list
print(list(soup.body.children))
# Descendants of body; returns a generator
print(soup.body.descendants)
# Force conversion to a list
print(list(soup.body.descendants))
'''
7. Parent and ancestors
'''
# Get the parent node of the p tag
print(soup.p.parent)
# Get the ancestors of the p tag; returns a generator
print(soup.p.parents)
# Force conversion to a list
print(list(soup.p.parents))
'''
8. Siblings
'''
# The next sibling of the p tag
print(soup.p.next_sibling)
# All following siblings of the p tag
print(soup.p.next_siblings)
print(list(soup.p.next_siblings))
# The previous sibling of the a tag
print(soup.a.previous_sibling)
# All preceding siblings of the a tag
print(soup.a.previous_siblings)
print(list(soup.a.previous_siblings))
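As a quick check of these traversal attributes, a minimal sketch that collects the tag names of body's direct children from the html_doc above, skipping bare text nodes:

from bs4 import Tag

# .children yields both Tag objects and NavigableString text nodes;
# keep only real tags and read their names
child_names = [child.name for child in soup.body.children if isinstance(child, Tag)]
print(child_names)  # the three p tags: ['p', 'p', 'p']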
3. Searching the Document Tree
'''
Searching the document tree:
    find()      finds the first match
    find_all()  finds all matches
Tag and attribute lookups:
    Tag filters:
        name   match on the tag name
        attrs  match on attributes
        text   match on text
        - String filter
            exact string match
        - Regex filter
            match with the re module
        - List filter
            match anything contained in the list
        - Bool filter
            True matches tags that have the attribute at all
        - Method filter
            for lookups on attributes a tag must have and must not have
    Attribute shortcuts (see the sketch at the end of this section):
        - class_
        - id
'''
from bs4 import BeautifulSoup as bs
import lxml
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>hy</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""
soup = bs(html_doc, 'lxml')
'''
String filter
'''
# name
# Find by tag name
p_tag = soup.find(name='p')
print(p_tag)
# Find all p tags
p_all_tag = soup.find_all(name='p')
print(p_all_tag)
# attrs
# Find the first node whose class is sister
p_class = soup.find(attrs={'class': 'sister'})
print(p_class)
# Find all nodes whose class is sister
p_all_class = soup.find_all(attrs={'class': 'sister'})
print(p_all_class)
# text
# Find by exact text
p_text = soup.find(text='$37')
print(p_text)
# Filters combined
# Find the a tag whose id is link2 and whose text is Lacie
p_all = soup.find(name='a', attrs={'id': 'link2'}, text='Lacie')
print(p_all)
'''
Regex filter
'''
import re
# name
# Find by tag-name pattern
p_tag = soup.find(name=re.compile('p'))
print(p_tag)
'''
List filter
'''
# name
# Find any tag whose name matches an entry in the list
p_tags = soup.find_all(name=['p', 'a', re.compile('html')])
print(p_tags)
'''
Bool filter
'''
# Find a p tag that has an id attribute
p_tag = soup.find(name='p', attrs={'id': True})
print(p_tag)
'''
Method filter
'''
# Match a tags that have an id attribute but no class attribute
def have_id_not_class(tag):
    if tag.name == 'a' and tag.has_attr('id') and not tag.has_attr('class'):
        return tag
# (in this html_doc every a tag has a class, so this prints None)
tag = soup.find(have_id_not_class)
print(tag)
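The class_ and id shortcuts listed at the top of this section can be passed as keyword arguments instead of an attrs dict; a minimal sketch against the same html_doc:

# class_ avoids the clash with Python's class keyword
sisters = soup.find_all(class_='sister')
print(sisters)
# id can be passed directly as a keyword argument
lacie = soup.find(id='link2')
print(lacie)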
4. Scraping Wandoujia App Data
'''
url:
https://www.wandoujia.com/category/6001
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=2sFZJXOEckN_7qBULrSyfvj9
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=2sFZJXOEckN_7qBULrSyfvj9
32 pages in total
'''
import requests
from bs4 import BeautifulSoup as bs
import lxml
import re

# Send a GET request
def get_page(url):
    response = requests.get(url)
    return response

# Parse an app detail page
def parse_detail(data):
    soup = bs(data, 'lxml')
    # App name
    name = soup.find(name='span', attrs={'class': 'title'}).text
    # Positive-rating percentage
    love = soup.find(name='span', attrs={'class': 'love'}).text
    # Number of comments
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text
    # Editor's review
    commit = soup.find(name='div', attrs={'class': 'con'}).text
    # Download link
    download = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']
    print(
        '''
        App name: {}
        Positive rating: {}
        Comment count: {}
        Editor's review: {}
        Download link: {}
        '''
        .format(name, love, commit_num, commit, download)
    )

# Parse the index page
def parse_text(data):
    soup = bs(data, 'lxml')
    # Sample card markup for reference:
    '''
<li data-pn="com.tuyoo.fish.uc" class="card" data-suffix=""><div class="icon-wrap"><a href="https://www.wandoujia.com/apps/com.tuyoo.fish.uc" > <img src="//img.ucdl.pp.uc.cn/upload_files/wdj_web/public/img/grey-128x128.png" data-original="https://android-artworks.25pp.com/fs08/2019/05/30/9/109_c9c161e9f3eca16f072b27cbfe759bab_con_130x130.png" alt="捕鱼大作战" class="icon lazy" width="68" height="68"> </a></div><div class="app-desc"><h2 class="app-title-h2"><a href="https://www.wandoujia.com/apps/com.tuyoo.fish.uc" title="捕鱼大作战" class="name">捕鱼大作战</a></h2><div class="meta"> <span class="install-count">13.9万人安装</span> <span class="dot">・</span> <span title="33.67MB">33.67MB</span></div><div class="comment"> 捕鱼大作战,经典街机新体验 </div></div> <a class="tag-link" href="https://www.wandoujia.com/category/6001?pos=w/cardtag/gamecategory_com.tuyoo.fish.uc">休闲益智</a> <a data-app-id="7471166" data-app-vid="700485088" data-app-name="捕鱼大作战" data-app-pname="com.tuyoo.fish.uc" data-app-vcode="41000" data-app-vname="4.1" data-app-categoryid="6001" data-app-subcategoryid="" data-app-icon="https://android-artworks.25pp.com/fs08/2019/05/30/9/109_c9c161e9f3eca16f072b27cbfe759bab_con_130x130.png" data-app-rtype="1" class="detail-check-btn" href="https://www.wandoujia.com/apps/com.tuyoo.fish.uc">查看 </a> </li>
    '''
    # Get the li tags of all apps
    li_data = soup.find_all(name='li', attrs={'class': 'card'})
    for li in li_data:
        # Icon address
        img = li.find(name='img').attrs['data-original']
        print('Icon address: ' + img)
        # Number of installs
        count = li.find(name='span', attrs={'class': 'install-count'}).text
        print('Installs: ' + count)
        # Size
        size = li.find(name='span', text=re.compile(r'\d+MB')).text
        print('Size: ' + size)
        # Detail page link
        detail_url = li.find(name='a').attrs['href']
        print('Detail page link: ' + detail_url)
        # Request the detail page
        detail_response = get_page(detail_url)
        # Parse the detail page
        parse_detail(detail_response.text)

def main():
    for i in range(1, 33):
        url = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=2sFZJXOEckN_7qBULrSyfvj9'.format(i)
        response = get_page(url)
        # Deserialize the JSON response into a dict
        data = response.json()
        app_li = data['data']['content']
        # Parse the index-page HTML fragment
        parse_text(app_li)

if __name__ == '__main__':
    main()
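If the site starts throttling or rejecting bare requests, sending a browser-like User-Agent header often helps; a minimal sketch of a hardened get_page (the header string is an illustrative value, not from the original notes):

def get_page(url):
    # A hypothetical browser User-Agent string; any real browser UA works
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    return response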
5. MongoDB
MongoDB is a powerful, flexible, and easily scalable general-purpose non-relational (NoSQL) database.

Relational vs. non-relational databases

MongoDB vs. SQL comparison
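The usual correspondence between the two vocabularies (a standard mapping, added here for reference):

    SQL          MongoDB
    database     database
    table        collection
    row          document
    column       field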
'''
I. Installation and startup
    1. Download and install
        - https://www.mongodb.com/download-center#community
    2. Install to D:\MongoDB and add D:\MongoDB\bin to the PATH environment variable
    3. Create directories and files
        - D:\MongoDB\data\db
        - D:\MongoDB\log\mongod.log
    4. Create the folder C:/data/db on the C drive
        - the data storage path
    5. Run mongod to start the server
        Open a terminal (as administrator) and run mongod to start the MongoDB service
    6. Run mongo to enter the MongoDB client (keep the server running)
        Open a new cmd window and run mongo to enter the client
II. Database operations
    1. Switch databases
        SQL:
            use admin;    # switches if it exists, errors otherwise
        MongoDB:
            use tank;     # switches if it exists, creates and switches otherwise
    2. List databases
        SQL:
            show databases;
        MongoDB:
            show dbs;     # only shows databases that contain data
    3. Drop a database
        SQL:
            drop database <name>;
        MongoDB:
            db.dropDatabase();
III. Collection operations (what MySQL calls tables)
    1. Create a collection
        SQL:
            create table t (f1, f2, ...);
        MongoDB:
            # collections are created by referencing them on the current database
            db.student
    2. Insert data
        # insert one document
        db.student.insert({"name": "hy"})
        # insert several documents (pass an array)
        db.student.insert([{"name1": "hy1"}, {"name2": "hy2"}])
    3. Query data
        # find all documents in the student collection
        db.student.find({})
        # find documents whose name is hy
        db.student.find({"name": "hy"})
'''
6. Using pymongo
from pymongo import MongoClient
# 1. Connect to the MongoDB server
# Argument 1: MongoDB host address
# Argument 2: MongoDB port, 27017 by default
client = MongoClient('localhost', 27017)
print(client)
# 2. Access the hy_db database (created on first write if it doesn't exist)
print(client['hy_db'])
# 3. Access a collection
print(client['hy_db']['people'])
# 4. Insert one document into hy_db (insert() is the legacy API)
data1 = {'name': 'hy', 'age': '23', 'sex': 'male'}
client['hy_db']['people'].insert(data1)
# 5. Insert several documents
data1 = {'name': 'hy1', 'age': '23', 'sex': 'male'}
data2 = {'name': 'hy2', 'age': '22', 'sex': 'male'}
data3 = {'name': 'hy3', 'age': '21', 'sex': 'male'}
client['hy_db']['people'].insert([data1, data2, data3])
# Officially recommended replacements:
# insert one document
client['hy_db']['people'].insert_one({'name': 'hy4', 'age': '20', 'sex': 'male'})
# insert several documents
client['hy_db']['people'].insert_many([{'name': 'hy5'}, {'name': 'hy6'}])
# 6. Query data
# Fetch all documents
data_s = client['hy_db']['people'].find()
# Print each document in a loop
for data in data_s:
    print(data)
# Fetch a single document
data = client['hy_db']['people'].find_one()
print(data)
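find() and find_one() also accept a filter document, mirroring the shell's db.student.find({"name": "hy"}); a minimal sketch against the same collection:

# All documents whose name is hy1
for doc in client['hy_db']['people'].find({'name': 'hy1'}):
    print(doc)
# First document whose age is '22'
print(client['hy_db']['people'].find_one({'age': '22'}))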