网址:https://movie.douban.com/top250
分析url:将首页url改写成 https://movie.douban.com/top250?start=0&filter= 的形式同样可以访问,其中start每翻一页递增25,即可以据此构造各页的url
前5页
# Listing-page URLs for the first 5 pages: start = 0, 25, 50, 75, 100.
urls = ['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0, 125, 25)]
爬取每个电影详情页链接,详情页有关于电影的更多信息
def get_url(url):
    """Scrape one Top 250 listing page and process every movie linked from it.

    Downloads ``url``, extracts the detail-page link of each movie on the
    page and hands the links to ``get_info`` one by one. A network failure is
    printed and the page is skipped.

    Args:
        url: Address of a listing page, e.g.
            ``https://movie.douban.com/top250?start=0&filter=``.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Best effort, like get_info: report the failure and move on.
        print(e)
        return
    page = etree.HTML(response.text)
    movie_urls = page.xpath('//div[@class="item"]/div[@class="pic"]/a[1]/@href')
    for movie_url in movie_urls:
        get_info(movie_url)
在MySQL中选择一个库,并创建用于存储数据的表(数据库命令)
-- One row per scraped movie; every scraped field is stored as text.
-- utf8mb4 is MySQL's full UTF-8 encoding; plain `utf8` only covers
-- characters of up to 3 bytes.
create table dbmoives(
    id int auto_increment primary key,  -- row id assigned by MySQL
    name text,                          -- movie title
    year text,                          -- year taken from the title heading
    director text,                      -- first listed director
    actor text,                         -- first listed actor
    style text,                         -- genres joined with '-'
    country text,                       -- production country/region
    language text,                      -- languages joined with '-'
    release_time text,                  -- release date
    time text,                          -- running time
    other_name text,                    -- alternative titles
    score text                          -- rating
) charset=utf8mb4;
表结构
然后获取电影的详细参数并写入数据库mysql
def _first(items, default=''):
    """Return the first element of ``items``, or ``default`` when it is empty."""
    return items[0] if items else default


def get_info(url):
    """Scrape one movie detail page and insert its fields into ``dbmoives``.

    Every extracted value is printed and then written as one row through the
    module-level ``cursor``. A page without a title is skipped; any other
    field that is absent from the page is stored as an empty string. Errors
    are printed rather than raised so one bad page does not stop the crawl.

    Args:
        url: Address of the movie's detail page.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        source = response.text
        page = etree.HTML(source)

        heading = page.xpath('//div[@id="wrapper"]//h1/span/text()')
        name = heading[0]  # required: an IndexError here skips the movie
        # The second heading span carries the year; [1:5] drops its first
        # character and keeps the next four (presumably "(YYYY)").
        year = _first(heading[1:])[1:5]
        director = _first(page.xpath('//div[@id="info"]/span[1]/span[2]/a/text()'))
        # Only the first listed actor is stored.
        actor = _first(page.xpath('//div[@id="info"]//span[@class="actor"]//a/text()'))
        genres = '-'.join(page.xpath('//div[@id="info"]//span[@property="v:genre"]/text()'))
        country = _first(re.findall('<span class="pl">制片国家/地区:</span> (.*?)<br/>', source, re.S))
        language = _first(re.findall(' <span class="pl">语言:</span> (.*?)<br/>', source, re.S)).replace(' / ', '-')
        release_time = _first(re.findall('<span class="pl">上映日期:</span> <span property="v:initialReleaseDate" content=".*?">(.*?)</span>', source, re.S))
        runtime = _first(re.findall('<span class="pl">片长:</span> <span property="v:runtime" content=".*?">(.*?)</span>.*?<br/>', source, re.S))
        other_name = _first(re.findall('<span class="pl">又名:</span> (.*?)<br/>', source, re.S))
        score = _first(page.xpath('//div[@id="interest_sectl"]//strong/text()'))

        # Same column order as the dbmoives table (after the id column).
        row = tuple(str(value) for value in (
            name, year, director, actor, genres, country,
            language, release_time, runtime, other_name, score,
        ))
        for value in row:
            print(value)

        # Let the driver quote the values: scraped text may contain quotes,
        # which would break (or inject into) a hand-assembled statement.
        # The leading 0 is the value supplied for the auto_increment id.
        cursor.execute(
            'insert into dbmoives values(0,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
            row,
        )
    except Exception as e:
        print(e)
完整代码
import requests
from lxml import etree
import re
import pymysql
# Connection to the local MySQL database `jt` used for all inserts.
conn = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    db='jt',
    user='root',
    passwd='123456',
    # Set the charset explicitly so Chinese text does not depend on the
    # driver's default. MySQL charset names have no hyphen ('utf8mb4',
    # not 'utf-8').
    charset='utf8mb4',
)
cursor = conn.cursor()

# Browser-like User-Agent sent with every HTTP request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
def get_url(url):
    """Scrape one Top 250 listing page and process every movie linked from it.

    Downloads ``url``, extracts the detail-page link of each movie on the
    page and hands the links to ``get_info`` one by one. A network failure is
    printed and the page is skipped.

    Args:
        url: Address of a listing page, e.g.
            ``https://movie.douban.com/top250?start=0&filter=``.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Best effort, like get_info: report the failure and move on.
        print(e)
        return
    page = etree.HTML(response.text)
    movie_urls = page.xpath('//div[@class="item"]/div[@class="pic"]/a[1]/@href')
    for movie_url in movie_urls:
        get_info(movie_url)
def _first(items, default=''):
    """Return the first element of ``items``, or ``default`` when it is empty."""
    return items[0] if items else default


def get_info(url):
    """Scrape one movie detail page and insert its fields into ``dbmoives``.

    Every extracted value is printed and then written as one row through the
    module-level ``cursor``. A page without a title is skipped; any other
    field that is absent from the page is stored as an empty string. Errors
    are printed rather than raised so one bad page does not stop the crawl.

    Args:
        url: Address of the movie's detail page.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        source = response.text
        page = etree.HTML(source)

        heading = page.xpath('//div[@id="wrapper"]//h1/span/text()')
        name = heading[0]  # required: an IndexError here skips the movie
        # The second heading span carries the year; [1:5] drops its first
        # character and keeps the next four (presumably "(YYYY)").
        year = _first(heading[1:])[1:5]
        director = _first(page.xpath('//div[@id="info"]/span[1]/span[2]/a/text()'))
        # Only the first listed actor is stored.
        actor = _first(page.xpath('//div[@id="info"]//span[@class="actor"]//a/text()'))
        genres = '-'.join(page.xpath('//div[@id="info"]//span[@property="v:genre"]/text()'))
        country = _first(re.findall('<span class="pl">制片国家/地区:</span> (.*?)<br/>', source, re.S))
        language = _first(re.findall(' <span class="pl">语言:</span> (.*?)<br/>', source, re.S)).replace(' / ', '-')
        release_time = _first(re.findall('<span class="pl">上映日期:</span> <span property="v:initialReleaseDate" content=".*?">(.*?)</span>', source, re.S))
        runtime = _first(re.findall('<span class="pl">片长:</span> <span property="v:runtime" content=".*?">(.*?)</span>.*?<br/>', source, re.S))
        other_name = _first(re.findall('<span class="pl">又名:</span> (.*?)<br/>', source, re.S))
        score = _first(page.xpath('//div[@id="interest_sectl"]//strong/text()'))

        # Same column order as the dbmoives table (after the id column).
        row = tuple(str(value) for value in (
            name, year, director, actor, genres, country,
            language, release_time, runtime, other_name, score,
        ))
        for value in row:
            print(value)

        # Let the driver quote the values: scraped text may contain quotes,
        # which would break (or inject into) a hand-assembled statement.
        # The leading 0 is the value supplied for the auto_increment id.
        cursor.execute(
            'insert into dbmoives values(0,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
            row,
        )
    except Exception as e:
        print(e)
if __name__ == "__main__":
    # start advances in steps of 25; 0..125 yields 6 listing pages.
    # NOTE(review): the walkthrough text mentions the first 5 pages
    # (range(0, 125, 25)) - confirm which count is intended.
    urls = ['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0, 150, 25)]
    # For a quick single-movie check, call
    # get_info('https://movie.douban.com/subject/1292722/') instead.
    try:
        for url in urls:
            get_url(url)
            # Commit page by page so finished pages survive a later failure.
            conn.commit()
    finally:
        # Always release the database handles, even if scraping aborts.
        cursor.close()
        conn.close()
效果截图