# 2019-06-20 python day-08
#
# 今日内容 (Today's topics):
#   1. beautifulsoup
#   2. mongoDB
#
# 1. beautifulsoup
# -1 基础知识 (Basics)

'''

    什么是bs4?

        一个灵活方便的网页解析库,可以提供一些强大的解析功能。(注:bs4 并非基于 re 开发,它封装了 lxml、html.parser 等 HTML 解析器)

        提高提取数据的效率

'''

from bs4 import BeautifulSoup  # BeautifulSoup is the parser entry point of bs4

# Sample document used throughout this section.
html_doc = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

# BeautifulSoup(markup, parser):
#   arg 1 — the text to parse
#   arg 2 — the parser backend to use ('lxml' here)
soup = BeautifulSoup(html_doc, 'lxml')

divider = '*' * 100

print(soup)
print(divider)
print(type(soup))
print(divider)

# prettify() re-indents the parsed tree into a readable string.
pretty_html = soup.prettify()
print(pretty_html)

# -2 文档遍历 (Document traversal)

html_doc = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')

'''
Traversal operations demonstrated below:
1. direct attribute access
2. tag name
3. tag attributes
4. tag text
5. nested selection
6. children / descendants
7. parent / ancestors
8. siblings
'''

# Direct access: soup.<tag> returns the first matching tag in the document.
print(soup.p)
print(soup.a)

# Tag name.
print(soup.head.name)

# Tag attributes: the href of the first <a>.
print(soup.a.attrs['href'])

# Tag text content.
print(soup.p.text)

# Nested selection.
print(soup.html.head)

# Direct children (iterator) — materialize with list() for printing.
print(list(soup.body.children))

# All descendants, recursively (generator).
print(list(soup.body.descendants))

# Parent and ancestors (ancestors is a generator).
print(soup.p.parent)
print(list(soup.p.parents))

# Siblings.
# FIX: the original had a bare line `兄弟节点` here (a stray heading, not a
# comment), which raised NameError at runtime; it is now a comment.
print(soup.a.next_sibling)
print(list(soup.a.next_siblings))
print(soup.a.previous_sibling)
print(list(soup.a.previous_siblings))

# -3 文档搜索 (Document search)

html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>tank</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.<hr></hr></p><p class="story">...</p>"""

from bs4 import BeautifulSoup
import re  # FIX: was imported twice further down (once per filter demo); hoisted here

soup = BeautifulSoup(html_doc, 'lxml')

# --- string filters ---

# name: first <p> tag.
p_tag = soup.find(name='p')
print(p_tag)

# attrs: first node whose class is "sister".
p = soup.find(attrs={"class": "sister"})
print(p)

# find_all: every node whose class is "sister".
tag_s = soup.find_all(attrs={"class": "sister"})
print(tag_s)

# find_all by tag name.
tag_s_2 = soup.find_all(name='p')
print(tag_s_2)

# name + attrs + text combined in one query.
a = soup.find(name="a", attrs={"id": "link2"}, text="Lacie")
print(a)

# --- regex filter ---
p_tag = soup.find(re.compile('p'))
print(p_tag)

# --- list filter: match any entry in the list ---
tags = soup.find_all(['a', 'p', re.compile('html')])
print(tags)

# --- bool filter: True matches any attribute value ---
# First <p> tag that has an id attribute at all.
p = soup.find(name='p', attrs={"id": True})
print(p)

# 方法过滤器 (function filter):
# match tags named 'a' that have an `id` attribute and no `class` attribute.

def hava_id_class(tag):
    """Function filter for soup.find: return *tag* when it is an <a>
    element that has an ``id`` attribute and NO ``class`` attribute,
    otherwise return None (implicitly).

    FIX: the original tested ``tag.has_attr('class')`` without ``not``,
    which contradicted its stated spec ("has id, no class") and instead
    matched tags carrying BOTH attributes.
    """
    if tag.name == 'a' and tag.has_attr('id') and not tag.has_attr('class'):
        return tag

# Run the function filter; prints the first matching tag (or None).
matched = soup.find(name=hava_id_class)
print(matched)

# -4 实战: 爬取豌豆荚游戏应用 (Exercise: crawl Wandoujia game apps)

'''

主页:

    图标地址、下载次数、大小、详情页地址

详情页:

    游戏名、图标名、好评率、评论数、小编点评、简介、网友评论、1-5张截图链接地址、下载地址

    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=666&page=2&ctoken=6ZKhTRrA2zSEgGntNW_gdGwy

    page页数总共23

'''

import requests

from bs4 import BeautifulSoup

def get_page(url):
    """Issue a plain GET request for *url* and return the raw
    requests Response object (no status checking here by design)."""
    return requests.get(url)

def parse_detail(text):
    """Parse an app detail page.

    Prints the app name and download URL (as before) and now also
    RETURNS every extracted field as a dict — the original computed
    ``love``, ``commit_num`` and ``commit_content`` and silently
    dropped them.  Returning a value is backward-compatible: the
    existing caller ignores the return.

    :param text: raw HTML of the detail page
    :return: dict with name / love / commit_num / commit_content / download_url
    """
    soup = BeautifulSoup(text, 'lxml')

    # App name.
    name = soup.find(name="span", attrs={"class": "title"}).text
    print(name)

    # Positive-rating text.
    love = soup.find(name="span", attrs={"class": "love"}).text

    # Comment count.
    # NOTE(review): this reuses class "love" on an <a> tag — looks like a
    # copy/paste of the selector above; verify against the live page markup.
    commit_num = soup.find(name="a", attrs={"class": "love"}).text

    # Editor's review blurb.
    commit_content = soup.find(name="div", attrs={"class": "con"}).text

    # Download link.
    download_url = soup.find(name='a', attrs={"class": "install-btn"}).attrs['href']
    print(download_url)

    return {
        'name': name,
        'love': love,
        'commit_num': commit_num,
        'commit_content': commit_content,
        'download_url': download_url,
    }

def parse_index(data):
    """Parse one category index page and crawl every app card on it.

    Each card is an ``<li class="card">`` containing (per the sample
    markup in the original source):
      - an ``<img>`` whose ``data-original`` attribute is the icon URL
      - ``<span class="install-count">`` with the install count
      - a ``<span>`` whose text like "14.03MB" carries the app size
      - ``<a class="detail-check-btn">`` linking to the detail page

    For every card, fetch the detail page and hand it to parse_detail().

    :param data: raw HTML of the index page
    """
    import re  # FIX: was re-imported inside the loop on every iteration

    soup = BeautifulSoup(data, 'lxml')
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    size_pattern = re.compile(r"\d+MB")  # raw string so \d is a regex class

    for app in app_list:
        # Icon URL.
        img = app.find(name='img').attrs['data-original']
        print(img)

        # Install count.
        down_num = app.find(name='span', attrs={"class": "install-count"}).text

        # Size — FIX: the original called soup.find(...) here, searching the
        # WHOLE document, so every card reported the first app's size.
        size = app.find(name='span', text=size_pattern).text

        # Detail page URL — same whole-document bug fixed (soup.find -> app.find).
        detail_url = app.find(name='a', attrs={"class": "detail-check-btn"}).attrs['href']

        response = get_page(detail_url)
        parse_detail(response.text)

def main():
    """Crawl all 23 index pages of category 6001 and parse each one."""
    # FIX: the original f-string used "(line)" instead of "{line}", so the
    # literal text "(line)" was sent as the page number on every request;
    # range(1, 23) also stopped at page 22 though the note says 23 pages.
    for page in range(1, 24):
        url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=666&page={page}&ctoken=6ZKhTRrA2zSEgGntNW_gdGwy"
        response = get_page(url)
        print(response.text)
        print('*' * 100)

        # The JSON payload wraps an HTML fragment under data.content.
        data = response.json()
        app_li = data['data']['content']
        # FIX: the original extracted app_li and dropped it; feed it to the
        # index parser so the crawl actually happens.
        parse_index(app_li)

if __name__ == "__main__":
    main()


# 2. mongoDB
# 基础操作 (Basic operations)

from pymongo import MongoClient

# 1. Connect to the MongoDB server.
#    arg 1 — host / IP address of the mongod instance
#    arg 2 — port (MongoDB's default is 27017)
client = MongoClient('localhost', 27017)

# print(client)

# # 2. Access a database (created lazily on first write).
# print(client['me_db'])

# # 3. Access a collection within that database.
# print(client['me_db']['people'])

# # 4. Insert a single document.
# data1 = {
#    'name':'k',
#    'age':18,
#    'sex':'male'
# }
# client['me_db']['people'].insert(data1)
# # NOTE(review): Collection.insert() is deprecated in pymongo 3+;
# # prefer insert_one() / insert_many().

# # Insert several documents at once.
# data2 = {
#    'name':'l',
#    'age':20,
#    'sex':'female'
# }
# data3 = {
#    'name':'p',
#    'age':23,
#    'sex':'male'
# }
# client['me_db']['people'].insert([data1,data2,data3])

# 5. Query: iterate every document in the collection.
for doc in client['me_db']['people'].find({}):
    print(doc)

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容