豆瓣日记
先分析脚本结构,再用BeautifulSoup处理
#-*- coding:utf-8 -*-
import re
import requests
import numpy
from bs4 import BeautifulSoup
# Download one page of a Douban user's notes and save every note as a
# numbered UTF-8 text file ("<n><title>.txt") in the current directory.

PAGE_START = 30  # pagination offset; change to 40, 50, ... for later pages
LIST_URL = ("https://www.douban.com/people/petitespot/notes"
            "?start=%d&type=note" % PAGE_START)
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely without one


def _safe_filename(text):
    """Replace characters that are not allowed in file names with '_'."""
    return re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', text)


def _note_text(body):
    """Return the text of a note body element as a unicode string.

    When the body contains <p> elements, their texts are joined with
    newlines.  Otherwise the raw markup of the element is returned with
    every "<br/>" turned into a newline.
    """
    paragraphs = body.find_all('p')
    if paragraphs:
        # get_text() also covers paragraphs with nested tags, where
        # ``.string`` is None and would end up as the literal "None".
        return u'\n'.join(p.get_text() for p in paragraphs)
    return body.decode().replace('<br/>', '\n')


counter = 1  # numeric prefix of the output files, bumped per saved note
listing = requests.get(LIST_URL, timeout=REQUEST_TIMEOUT)
listing.raise_for_status()
# Name the parser explicitly so the result does not depend on which
# parser libraries happen to be installed.
index_soup = BeautifulSoup(listing.text, 'html.parser')
for header in index_soup.find_all("div", attrs={"class": "note-header-container"}):
    # Link to the individual note page.
    anchor = header.find(class_='j a_unfolder_n')
    link = anchor.get('href') if anchor is not None else None
    if not link:
        continue  # no usable link in this header; nothing to download

    try:
        page = requests.get(link, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as exc:
        # One failing note should not abort the whole run.
        print("skipping %s: %s" % (link, exc))
        continue

    note_soup = BeautifulSoup(page.text, 'html.parser')
    body = note_soup.find("div", attrs={"id": "link-report"})
    if body is None:
        continue  # page without the expected content container

    # The note title is read from the og:title <meta> tag.
    meta = note_soup.find("meta", attrs={"property": "og:title"})
    title = meta.get('content', u'') if meta is not None else u''

    filename = str(counter) + _safe_filename(title) + '.txt'
    with open(filename, 'wb') as out:
        out.write(_note_text(body).encode('utf-8'))
    counter += 1
新浪博客
新浪的编码特别奇怪:按网上常见的 .decode().encode() 方式处理之后仍然是乱码,
最后还是机缘巧合才在下面两篇文章里找到了解决方案。
http://blog.chinaunix.net/uid-13869856-id-5747417.html
https://segmentfault.com/q/1010000000665231
#-*- coding:utf-8 -*-
import re
import requests
import numpy
import urllib
import urllib2
import chardet
import sys
import chardet
from bs4 import BeautifulSoup
# Download the articles linked from one Sina blog article-list page and save
# the body of each one as "<position><title>.html" in the current directory.

LIST_URL = 'http://blog.sina.com.cn/s/articlelist_1270505543_0_1.html'
SKIP_COUNT = 36  # leading list entries to skip (presumably already saved -- confirm)
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely without one


def _decode_response(response):
    """Return the body of a requests response as a unicode string.

    The charset is chosen in this order: the first charset declared inside
    the document itself, an explicit charset from the HTTP headers
    (requests reports 'ISO-8859-1' when the headers name none, so that
    value is not trusted), and finally the charset detected from the raw
    bytes.  Undecodable bytes are replaced rather than raising.
    """
    declared = requests.utils.get_encodings_from_content(response.text)
    if declared:
        encoding = declared[0]
    elif response.encoding and response.encoding != 'ISO-8859-1':
        encoding = response.encoding
    else:
        encoding = response.apparent_encoding or 'utf-8'
    try:
        return response.content.decode(encoding, 'replace')
    except LookupError:
        # The page declared a charset name Python does not know.
        return response.content.decode('utf-8', 'replace')


def _safe_filename(text):
    """Replace characters that are not allowed in file names with '_'."""
    return re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', text)


listing = requests.get(LIST_URL, timeout=REQUEST_TIMEOUT)
listing.raise_for_status()
# html5lib is used on purpose: per the original author's note, other
# parsers lose information on these pages.
index_soup = BeautifulSoup(_decode_response(listing), 'html5lib')
entries = index_soup.find_all("p", attrs={"class": "atc_main SG_dot"})
# The 1-based position in the list doubles as the file-name prefix.
for position, entry in enumerate(entries, 1):
    if position <= SKIP_COUNT:
        continue

    anchor = entry.find(target='_blank')
    link = anchor.get('href') if anchor is not None else None
    if not link:
        continue  # list entry without an article link

    try:
        page = requests.get(link, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as exc:
        # One failing article should not abort the whole run.
        print("skipping %s: %s" % (link, exc))
        continue

    article_soup = BeautifulSoup(_decode_response(page), 'html5lib')
    body = article_soup.find("div", attrs={"id": "sina_keyword_ad_area2"})
    if body is None:
        continue  # page without the expected article container

    title = article_soup.title.get_text() if article_soup.title else u''
    filename = str(position) + _safe_filename(title) + '.html'
    # The article body is kept as HTML markup, not converted to plain text.
    with open(filename, 'wb') as out:
        out.write(body.decode().encode('utf-8'))
由于新浪的正文中出现大量html结构,只好用html格式存储,没有转换成txt文本。