Task 1
Target URL: http://www.doupoxs.com/doupocangqiong
Target content: the body text of every chapter of Doupo Cangqiong (斗破苍穹)
Approach: regular expressions & BeautifulSoup (get_info2(url), optional alternative)
import requests
from bs4 import BeautifulSoup
import re
import time

base_url = "http://www.doupoxs.com/doupocangqiong"
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36'}
r = requests.get(base_url, headers=headers)
soup = BeautifulSoup(r.text, "lxml")

def get_url():
    # Collect every chapter link from the table-of-contents page
    detail_urls = soup.select("div.book_list a")
    url_list = []
    for url in detail_urls:
        detail_url = url.get("href")   # href is site-relative
        all_url = "http://www.doupoxs.com" + detail_url
        url_list.append(all_url)
    return url_list
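Before crawling every chapter, it is worth confirming that the selector actually finds the links; a minimal check using the function above:

# Sanity check: print the first three chapter URLs the selector finds
for chapter_url in get_url()[:3]:
    print(chapter_url)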
## Regular-expression version
def get_info(url):
    r = requests.get(url, headers=headers)
    # Every paragraph of the chapter body is wrapped in a <p> tag;
    # re.S lets '.' match the newlines inside a paragraph
    dpcq_texts = re.findall('<p>(.*?)</p>', r.content.decode('utf-8'), re.S)
    for dpcq_text in dpcq_texts:
        f.write(dpcq_text + '\n')
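The re.S flag matters here: a paragraph's text can contain newlines, and without re.S the '.' in the pattern stops at a line break. A self-contained illustration (the sample string is made up):

import re

html = "<p>first line\nsecond line</p>"
print(re.findall('<p>(.*?)</p>', html))        # [] -- '.' does not cross '\n'
print(re.findall('<p>(.*?)</p>', html, re.S))  # ['first line\nsecond line']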
## BeautifulSoup version
def get_info2(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    infos = soup.select("div.read_chapterDetail p")
    for info in infos:
        dpcq_text = info.text   # .text strips any markup nested in the paragraph
        f.write(dpcq_text + '\n')
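Unlike the regex version, .text drops any markup nested inside a paragraph, which is why get_info2 produces cleaner output. A small illustration (the snippet is made up):

from bs4 import BeautifulSoup

snippet = '<p>before <span style="color:blue">inside</span> after</p>'
print(BeautifulSoup(snippet, "lxml").select("p")[0].text)   # before inside after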
if __name__ == "__main__":
    url_list = get_url()
    f = open("F:/doupo.txt", 'a+', encoding='utf-8')   # utf-8 avoids encode errors on Windows
    for url in url_list:
        get_info(url)
        time.sleep(1)   # pause between requests to go easy on the server
    f.close()
Partial output (note that the regex version keeps any raw HTML embedded in a paragraph, such as the site's ad line below; the BeautifulSoup version's .text would strip those tags):
天才一秒记住本站网站 www.doupoxs.com 中间是<span style="color:blue">斗破 拼音+小说 首字母</span> 连起来就是斗破小说,喜欢我就记住我吧!
第一章陨落的天才
“斗之力,三段!”
望着测验魔石碑上面闪亮得甚至有些刺眼的五个大字,少年面无表情,唇角有着一抹自嘲,紧握的手掌,因为大力,而导致略微尖锐的指甲深深的刺进了掌心之中,带来一阵阵钻心的疼痛…
“萧炎,斗之力,三段!级别:低级!”测验魔石碑之旁,一位中年男子,看了一眼碑上所显示出来的信息,语气漠然的将之公布了出来…
中年男子话刚刚脱口,便是不出意外的在人头汹涌的广场上带起了一阵嘲讽的骚动。
“三段?嘿嘿,果然不出我所料,这个“天才”这一年又是在原地踏步!”
“哎,这废物真是把家族的脸都给丢光了。”
“要不是族长是他的父亲,这种废物,早就被驱赶出家族,任其自生自灭了,哪还有机会待在家族中白吃白喝。”
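As a variant, the saving loop can scope the file handle with a context manager rather than relying on the module-level f; a sketch under the same assumptions as get_info (save_chapters is a name introduced here, and it reuses the headers defined above):

def save_chapters(url_list, path="F:/doupo.txt"):
    # Same scrape as get_info, but the file is opened and closed in one place
    with open(path, 'a+', encoding='utf-8') as out:
        for url in url_list:
            r = requests.get(url, headers=headers)
            for para in re.findall('<p>(.*?)</p>', r.content.decode('utf-8'), re.S):
                out.write(para + '\n')
            time.sleep(1)   # pause between requests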
Task 2
Target URL: https://www.qiushibaike.com/text/
Target content: user ID, user level, user gender, the joke text, laugh count, comment count
Approach: regular expressions & BeautifulSoup (get_info2(url), optional alternative)
import requests
from bs4 import BeautifulSoup
import re
import time

def get_sex(sex_info):
    # Qiushibaike encodes gender as a CSS class, manIcon or womenIcon
    if sex_info == "manIcon":
        return "男"    # male
    else:
        return "女"    # female
## Regular-expression version
def get_info(url):
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36'}
    r = requests.get(url, headers=headers)
    print(r.status_code)
    # re.findall returns an empty list rather than raising when nothing matches,
    # so no try/except is needed around these calls
    ids = re.findall('<h2>(.*?)</h2>', r.text, re.S)
    levels = re.findall('<div class="articleGender .*?">(.*?)</div>', r.text, re.S)
    sex_infos = re.findall('<div class="articleGender (.*?)"', r.text, re.S)
    contents = re.findall('<div class="content">.*?<span>(.*?)</span>', r.text, re.S)
    laughs = re.findall('<span class="stats-vote">.*?<i class="number">(.*?)</i>', r.text, re.S)
    comments = re.findall('<span class="stats-comments">.*?<i class="number">(.*?)</i>', r.text, re.S)
    for id, level, sex_info, content, laugh, comment in zip(ids, levels, sex_infos, contents, laughs, comments):
        info = {
            'id': id,
            'level': level,
            'sex': get_sex(sex_info),
            'content': content,
            'laugh': laugh,
            'comment': comment
        }
        info_lists.append(info)
        # print(info)
        # Write each record as it is collected, so every record is saved exactly once
        try:
            f.write(info['id'] + '\n')
            f.write(info['level'] + '\n')
            f.write(info['sex'] + '\n')
            f.write(info['content'] + '\n')
            f.write(info['laugh'] + '\n')
            f.write(info['comment'] + '\n\n')
        except UnicodeEncodeError:
            # Skip characters the target file encoding cannot represent;
            # not needed if you only print the records instead of writing them
            pass
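One caveat with the regex approach: zip() stops at the shortest input, so if one of the six lists comes up short (say, an anonymous post with no articleGender div), later records are silently dropped or shifted out of alignment. A minimal illustration with made-up data:

ids = ['user1', 'user2', 'user3']
levels = ['32', '18']              # suppose one post had no level div
for id_, level in zip(ids, levels):
    print(id_, level)              # user3 is silently dropped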
## BeautifulSoup version: each post is handled as a unit, so missing fields
## cannot misalign the records the way zip() can in the regex version
def get_info2(url):
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36'}
    r = requests.get(url, headers=headers)
    print(r.status_code)
    soup = BeautifulSoup(r.text, "lxml")
    infos = soup.select("div.article")   # one div.article per post
    for info in infos:
        id = info.select("h2")[0].text.strip()
        try:
            # Anonymous posts carry no articleGender div; select()[0] then raises IndexError
            level = info.select("div.articleGender")[0].text
            sex_info = info.select("div.articleGender")[0].get("class")[1]
            if sex_info == "womenIcon":
                sex = "女"    # female
            elif sex_info == "manIcon":
                sex = "男"    # male
            else:
                sex = "未知"  # unknown
        except IndexError:
            level = ""
            sex = "未知"
        # Strip zero-width and stray characters that appear in some posts
        content = info.select("div.content")[0].text.strip().replace('\u200b', '').replace('\xba', '')
        laugh = info.select("span.stats-vote i")[0].text
        comment = info.select("span.stats-comments i")[0].text.replace('\u2718', '')
        f.write(id + ' ' + level + ' ' + sex + ' ' + content + ' ' + laugh + ' ' + comment + '\n')
        # print(id, level, sex, content, laugh, comment)
if __name__ == "__main__":
    url_list = ["https://www.qiushibaike.com/text/page/{}/".format(i) for i in range(1, 14)]
    info_lists = []
    f = open("F:/qiushibaike.txt", 'a+')
    for url in url_list:
        get_info(url)
        time.sleep(1)
    f.close()
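Since get_info already accumulates the records as dictionaries in info_lists, they could just as easily be saved as CSV; a sketch using the standard csv module (the function name and output path are assumptions):

import csv

def save_csv(info_lists, path="F:/qiushibaike.csv"):
    # One CSV row per scraped post, with a header row
    fields = ['id', 'level', 'sex', 'content', 'laugh', 'comment']
    with open(path, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=fields)
        writer.writeheader()
        writer.writerows(info_lists)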