一、python中的生成表达式
1. 列表推导式
用途:快速生成一个列表
格式
# 格式 [表达式 for 临时变量 in 可迭代对象 [条件语句]]
与普通for循环创建列表进行对比:
# Build a list with a plain for loop.
li = []  # start from an empty list
# for <loop variable> in <iterable>:
#     <loop body>
for value in range(10):
    # append each value to the list one by one
    li.append(value)
print(li)
用列表推导式创建列表
# The 4-line loop above builds the 0-9 list; a comprehension needs one line.
print([number for number in range(10)])
示例1:
# Generate one random integer in [60, 100] and show it.
from random import randint
print(randint(60, 100))
# Build a list like ['序号:998', '序号:992', '序号:993']
# with ten elements; each number is drawn from [100, 999].
# -- plain for-loop version --
l1 = []
for _ in range(10):
    # `_` tells the reader the loop variable itself is never used
    label = '序号:{}'.format(randint(100, 999))
    l1.append(label)
print(l1)
# -- list-comprehension version, one line --
l2 = ['序号:{}'.format(randint(100, 999)) for _ in range(10)]
# print(l2)
示例2:
# Build a list of 10 random integers in [30, 100].
from random import randint
li = [randint(30, 100) for _ in range(10)]
print(li)
# Given a fixed list li, keep only the even numbers.
li = [92, 53, 84, 54, 82, 92, 95, 38, 52, 42]
# -- explicit loop over the list --
result = []
for number in li:
    if number % 2 == 0:
        result.append(number)
print(result)
# 格式 [表达式 for 临时变量 in 可迭代对象 [条件语句]]
[x for x in li if x%2==0]
练习:使用列表推导式生成一个含有二十个元素的随机数列表[],再筛选出所有的奇数。
from random import randint
# Twenty random integers in [0, 100], then print only the odd ones.
li = [randint(0, 100) for _ in range(20)]
print([value for value in li if value % 2 == 1])
2. 三目运算符
用途:把简单的 if/else 分支写成一行的条件表达式
格式
# 格式: 返回值 if 满足条件的表达式 else 不满足时要执行的事情
示例1:
li = ['dada']
# verbose if/else form
if not li:
    li = 'aa'
else:
    li = li[0]
print(li)
# conditional-expression form (one line):
# <value-if-true> if <condition> else <value-if-false>
s = 'aa' if not li else li[0]
print(s)
二、爬取当当图书信息(优化后)
import requests
from lxml import html
# 安装pandas
# pip install pandas
# 导入pandas
import pandas as pd
def spider(isbn):
    """Scrape dangdang.com search results for a book.

    :param isbn: the ISBN (a unique international book number, e.g.
                 9787115428028); any search keyword also works.

    Side effects: prints each seller's shop name and the collected
    records, and writes '当当图书信息.csv' with title/link/price/store,
    sorted by price descending.
    """
    # Example raw search URL kept for reference:
    # http://search.dangdang.com/?key=python%B4%D3%C8%EB%C3%C5...&act=input
    url = "http://search.dangdang.com/?key={}&act=input".format(isbn)
    print(url)
    # Pretend to be a browser so the site serves the normal page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    html_data = requests.get(url, headers=headers).text
    # Pull the interesting parts out with XPath.
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('有{}家商铺售卖此书'.format(len(ul_list)))
    # One dict per seller: [{}, {}, ...]
    book_info_list = []
    for li in ul_list:
        title = li.xpath('a/@title')
        link = li.xpath('a/@href')
        price = li.xpath('p[@class="price"]/span[@class="search_now_price"]/text()')
        # Skip malformed entries instead of crashing on an empty
        # XPath result (the old code indexed [0] unconditionally).
        if not (title and link and price):
            continue
        # Strip the currency symbol entirely; replacing it with a
        # space (as before) left a leading blank in the stored price.
        price = price[0].replace('¥', '').strip()
        # Shop name; XPath alternatives:
        #   //tag[@attr=value]/.../text()   or   .../@attr
        store = li.xpath('p[4]/a/@title')
        # An empty store list means the seller is dangdang itself.
        store = '当当自营' if len(store) == 0 else store[0]
        print(store)
        book_info_list.append({
            'title': title[0],
            'link': link[0],
            'price': price,
            'store': store
        })
    # Most expensive first.
    book_info_list.sort(key=lambda x: float(x['price']), reverse=True)
    for book in book_info_list:
        print(book)
    # Persist as CSV (comma-separated values).
    df = pd.DataFrame(book_info_list)
    df.to_csv('当当图书信息.csv')
# Ask the user for the book number, then run the crawler on it.
spider(input('请输入您要查询的书号'))
三、爬取豆瓣即将上映电影信息(优化后)
相比上一版本,增加了下载电影海报图片的功能
from xpinyin import Pinyin
import requests
from lxml import html
import pandas as pd
# pip install xpinyin
def spider(city):
    """Scrape douban.com "coming soon" movies for a city.

    :param city: Chinese city name, e.g. '北京'; converted to pinyin
                 to build the request URL.

    Side effects: prints every movie, downloads each poster into
    ./douban_img/ (creating the directory if needed) and writes
    '<city_pinyin>douban_movie_info.csv'.
    """
    import os  # local import: only needed to create the image directory
    # splitter separates pinyin syllables; the default is '-', we want none.
    city_pinyin = Pinyin().get_pinyin(city, splitter='')
    url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
    print('您要爬取的目标站点是', url)
    print('爬虫进行中,请稍后...')
    # Browser-like headers so the request is not rejected as a bot
    # (also fixes the 'Mozillsa' typo the old User-Agent carried).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    response = requests.get(url, headers=headers)
    html_data = response.text
    selector = html.fromstring(html_data)
    # One div per upcoming movie under #showing-soon; xpath returns a list.
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('您好,{}市共查询到{}部即将上映的电影'.format(city, len(div_list)))
    movie_info_list = []
    for div in div_list:
        # Every field may be absent, so fall back to a placeholder
        # instead of crashing on an empty XPath result.
        movie_name = div.xpath('div[1]/h3/a/text()')
        movie_name = '没有查询到数据' if len(movie_name) == 0 else movie_name[0]
        # release date
        date = div.xpath('div[1]/ul/li[1]/text()')
        date = '没有查询到数据' if len(date) == 0 else date[0]
        # genre; named movie_type to avoid shadowing the builtin `type`
        movie_type = div.xpath('div[1]/ul/li[2]/text()')
        movie_type = '没有查询到数据' if len(movie_type) == 0 else movie_type[0]
        # country of origin
        country = div.xpath('div[1]/ul/li[3]/text()')
        country = '没有查询到数据' if len(country) == 0 else country[0]
        # "want to see" count; default to 0 when missing so the old
        # int('没有查询到数据') ValueError can no longer happen
        want_see = div.xpath('div[1]/ul/li[4]/span/text()')
        want_see = int(want_see[0].replace('人想看', '')) if want_see else 0
        # poster link; empty string when missing so we skip the download
        img_link = div.xpath('a/img/@src')
        img_link = '' if len(img_link) == 0 else img_link[0]
        # Collect one dict per movie: [{}, {}, {}]
        movie_info_list.append({
            "movie_name": movie_name,
            "date": date,
            "type": movie_type,
            "country": country,
            "want_see": want_see,
            "img_link": img_link
        })
    # Ascending by popularity, as before.
    movie_info_list.sort(key=lambda x: x['want_see'])
    # The old code crashed with FileNotFoundError when ./douban_img
    # did not exist; create it up front.
    os.makedirs('./douban_img', exist_ok=True)
    for movie in movie_info_list:
        print(movie)
        # Download the poster only when a link was actually found.
        if movie['img_link']:
            with open('./douban_img/{}.jpg'.format(movie['movie_name']), 'wb') as f:
                f.write(requests.get(movie['img_link']).content)
    pd.DataFrame(movie_info_list).to_csv('{}douban_movie_info.csv'.format(city_pinyin))
# Prompt for the city whose upcoming-movie info should be shown,
# then run the crawler on it.
spider(input('请输入您要查看即将上映电影信息的城市:'))
四、数据可视化
常用的可视化工具:echarts(百度开源)、matplotlib(本课使用)等
安装并导入matplotlib、numpy库
# matplotlib # 用于绘图的库
# 安装
# pip install matplotlib、numpy
# 导入
from matplotlib import pyplot as plt
# 导入numpy
import numpy as np
设置支持中文字体
# Use the SimHei font so Chinese characters render in labels and titles.
plt.rcParams["font.sans-serif"] = ['SimHei']
# Fall back to the ASCII minus sign; SimHei lacks the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
1. 绘制 正弦曲线(线形图)
# Line plot of sine and cosine: sample 100 evenly spaced points
# on [0, 2*pi] and draw curves through them.
x = np.linspace(0, 2*np.pi, num=100)  # linspace gives evenly spaced values
y = np.sin(x)
cosy = np.cos(x)
# sine curve: red dotted line (- solid, -- dashed, : dotted),
# circle markers (o filled circle; * and + also work), 80% opacity
plt.plot(x, y, color='r', linestyle=':', marker='o',
         markerfacecolor='r', alpha=0.8, label='代表正弦曲线')
# cosine curve on the same axes: green solid line, star markers
plt.plot(x, cosy, color='g', linestyle='-', marker='*',
         markerfacecolor='r', alpha=0.6, label='cos(x)')
plt.xlabel('time(s)')
plt.ylabel('电压(v)')
plt.title('电压随时间变化曲线')
plt.legend()  # show the legend box
plt.show()
输出线形图:
2. 绘制条形图
from random import randint
# six brand names: 口红1 .. 口红6
x = ['口红{}'.format(index) for index in range(1, 7)]
print(x)
# one random price per brand
y = [randint(200, 1000) for _ in range(6)]
print(y)
plt.bar(x, y)
plt.grid()  # draw a background grid
plt.xlabel('口红品牌')
plt.ylabel('口红价格(元)')
plt.show()
输出条形图:
五、作业
1. 三国人物分析top10绘制条形图
import jieba
from matplotlib import pyplot as plt
# SimHei font so the Chinese bar-chart labels render correctly.
plt.rcParams["font.sans-serif"] = ['SimHei']
# ASCII minus sign; SimHei lacks the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
import numpy
# Read the whole novel into one string.
with open('novel/threekingdom.txt', 'r', encoding='UTF-8') as f:
    data = f.read()
# Cut the text into words (jieba.lcut returns a list).
words_list = jieba.lcut(data)
# Frequent words that are NOT character names; a set de-duplicates
# and gives O(1) membership tests.
excludes = {"将军", "却说", "二人", "不可", "荆州", "不能", "如此", "丞相",
            "商议", "如何", "主公", "军士", "军马", "左右", "次日", "引兵",
            "大喜", "天下", "东吴", "于是", "今日", "不敢", "魏兵", "陛下",
            "都督", "人马", "不知", "孔明曰", "玄德曰", "刘备", "云长"}
# word -> occurrence count, e.g. {"夏侯渊": 34, ...}
counts = {}
for word in words_list:
    # Single characters are almost never names; skip them.
    if len(word) <= 1:
        continue
    # .get with a default of 0 handles a word's first occurrence.
    counts[word] = counts.get(word, 0) + 1
# Merge aliases that refer to the same person.  Using .get(..., 0)
# instead of direct indexing avoids a KeyError if an alias never
# occurs; the merged aliases are listed in `excludes` and dropped below.
counts['孔明'] = counts.get('孔明', 0) + counts.get('孔明曰', 0)
counts['玄德'] = counts.get('玄德', 0) + counts.get('玄德曰', 0) + counts.get('刘备', 0)
counts['关公'] = counts.get('关公', 0) + counts.get('云长', 0)
# Drop the non-name words; pop(..., None) tolerates missing keys,
# where the old `del counts[word]` raised KeyError.
for word in excludes:
    counts.pop(word, None)
# Turn the dict into a list of (word, count) pairs and sort by
# frequency, most frequent first.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
print(items)
# Bar chart of the 10 most frequent names; slicing also works if
# fewer than 10 distinct words survived the filtering.
x = []
y = []
for role, count in items[:10]:
    x.append(role)
    y.append(count)
plt.bar(x, y)
plt.xlabel('人物')
plt.ylabel('频次')
plt.title('三国人物出现次数top10')
plt.grid()
plt.show()
输出:
2. 豆瓣中最想看的即将上映电影top5条形图
from xpinyin import Pinyin
import requests
from lxml import html
from matplotlib import pyplot as plt
import numpy
import pandas as pd
# pip install xpinyin
def spider(city):
    """Scrape douban.com "coming soon" movies for a city and draw a
    bar chart of the five most-wanted ones.

    :param city: Chinese city name, e.g. '北京'; converted to pinyin
                 to build the request URL.

    Side effects: prints progress and the scraped list, then shows a
    matplotlib bar chart of the top-5 "want to see" counts.
    """
    # splitter separates pinyin syllables; the default is '-', we want none.
    city_pinyin = Pinyin().get_pinyin(city, splitter='')
    url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
    print('您要爬取的目标站点是', url)
    print('爬虫进行中,请稍后...')
    # Browser-like headers so the request is not rejected as a bot
    # (also fixes the 'Mozillsa' typo the old User-Agent carried).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    response = requests.get(url, headers=headers)
    html_data = response.text
    selector = html.fromstring(html_data)
    # One div per upcoming movie under #showing-soon; xpath returns a list.
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('您好,{}市共查询到{}部即将上映的电影'.format(city, len(div_list)))
    movie_info_list = []
    for div in div_list:
        # Every field may be absent, so fall back to a placeholder
        # instead of crashing on an empty XPath result.
        movie_name = div.xpath('div[1]/h3/a/text()')
        movie_name = '没有查询到数据' if len(movie_name) == 0 else movie_name[0]
        # release date
        date = div.xpath('div[1]/ul/li[1]/text()')
        date = '没有查询到数据' if len(date) == 0 else date[0]
        # genre; named movie_type to avoid shadowing the builtin `type`
        movie_type = div.xpath('div[1]/ul/li[2]/text()')
        movie_type = '没有查询到数据' if len(movie_type) == 0 else movie_type[0]
        # country of origin
        country = div.xpath('div[1]/ul/li[3]/text()')
        country = '没有查询到数据' if len(country) == 0 else country[0]
        # "want to see" count; default to 0 when missing so the old
        # int('没有查询到数据') ValueError can no longer happen
        want_see = div.xpath('div[1]/ul/li[4]/span/text()')
        want_see = int(want_see[0].replace('人想看', '')) if want_see else 0
        # poster image link
        img_link = div.xpath('a/img/@src')
        img_link = '没有查询到数据' if len(img_link) == 0 else img_link[0]
        # Collect one dict per movie: [{}, {}, {}]
        movie_info_list.append({
            "movie_name": movie_name,
            "date": date,
            "type": movie_type,
            "country": country,
            "want_see": want_see,
            "img_link": img_link
        })
    # Most wanted first.
    movie_info_list.sort(key=lambda x: x['want_see'], reverse=True)
    print(movie_info_list)
    # Chart settings so Chinese labels render.
    plt.rcParams["font.sans-serif"] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # Top five by want_see.  Slicing also handles fewer than 5 movies,
    # where the old `x[i] for i in range(5)` raised IndexError.
    top5 = movie_info_list[:5]
    x = [movie['movie_name'] for movie in top5]
    y = [movie['want_see'] for movie in top5]
    plt.bar(x, y)
    plt.xlabel('电影名称')
    plt.ylabel('想看人数')
    plt.title('豆瓣即将上映电影想看人数TOP5')
    plt.grid()
    plt.show()
# 再屏幕中输入‘请输入您要查看即将上映电影信息的城市’
# Prompt for the city whose upcoming-movie info should be shown,
# then run the crawler on it.
spider(input('请输入您要查看即将上映电影信息的城市:'))
输出: