使用 urllib、BeautifulSoup 和 pymysql
抓取页代码
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re
import pymysql
# Module-level accumulator: parser() appends one record dict per parsed entry.
datas = []
def getHtml(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address handed straight to ``urlopen``.

    Returns:
        The decoded page text, or ``None`` when the response's status code
        is anything other than 200.
    """
    # The context manager closes the response even if reading or decoding
    # raises; the handle was previously never closed.
    with urlopen(url) as page:
        if page.getcode() != 200:
            return None
        return page.read().decode(encoding='utf-8')
def _extract_summary(summ):
    """Build the ``{'title': ..., 'tags': ...}`` record for one summary element."""
    title = summ.find('h2', class_="title").find('a').get_text()
    tags = {
        tag.find('a', class_='tag').get_text()
        for tag in summ.find_all('li', class_="tagPopup")
    }
    # A new dict is created on every call. Reusing a single dict across
    # iterations would keep mutating the same object, so every entry stored
    # in ``datas`` would end up pointing at the same (last) record.
    return {'title': title, 'tags': tags}


def parser(html_doc):
    """Parse one listing page, record its entries and store them in MySQL.

    Each ``div.summary`` element yields a record with the title text and the
    set of tag names. Records are appended to the module-level ``datas`` list
    and inserted into the ``segment`` table, one committed insert per record.
    Insert errors are printed and do not stop the remaining records.

    Args:
        html_doc: HTML text of the page. ``None`` (what ``getHtml`` returns
            for a non-200 response) is ignored.
    """
    if html_doc is None:
        return

    soup = BeautifulSoup(html_doc, 'html.parser')
    records = [
        _extract_summary(summ)
        for summ in soup.find_all('div', class_="summary")
    ]
    datas.extend(records)
    if not records:
        # Nothing to store, so do not open a connection at all.
        return

    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration outside the source file.
    config = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'root',
        'password': 'root',
        'db': 'python_test',
        'charset': 'UTF8',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    sql = 'insert into segment(title,tags) values(%s,%s)'

    # config is a dict, so ** is needed to pass it as keyword arguments.
    # One connection serves the whole page instead of one per record.
    const = pymysql.Connect(**config)
    try:
        with const.cursor() as cursor:
            for res_data in records:
                try:
                    # The tag set is stored as its str() form; the statistics
                    # script reads that representation back.
                    cursor.execute(
                        sql,
                        (res_data['title'], str(res_data['tags'])),
                    )
                    # Autocommit is not enabled, so commit explicitly to
                    # persist the executed statement.
                    const.commit()
                except Exception as e:
                    # Best effort, as before: report the failure, keep going.
                    print(e)
    finally:
        const.close()
# Crawler entry point: fetch listing pages 1..99 and store their entries.
if __name__ == '__main__':
    url = 'https://segmentfault.com/t/javascript?type=newest&page='
    # Same bounds as the former ``while count < 100`` loop starting at 1.
    for count in range(1, 100):
        html_dom = getHtml(url + str(count))
        print('正在执行第'+str(count)+'页的内容抓取')
        if html_dom is None:
            # getHtml returns None for a non-200 response; there is no
            # document to hand to parser, so skip this page.
            print('第'+str(count)+'页获取失败，已跳过')
            continue
        parser(html_dom)
    print('程序执行完毕')
统计页代码
import ast
import html
from collections import Counter

import pymysql
# MySQL connection settings; get_tags() unpacks them into pymysql.connect().
# DictCursor makes each fetched row a dict keyed by column name.
config = dict(
    host='127.0.0.1',
    port=3306,
    user='root',
    password='root',
    db='python_test',
    charset='UTF8',
    cursorclass=pymysql.cursors.DictCursor,
)
def _parse_tag_set(raw):
    """Decode a ``tags`` column value stored as the ``str()`` of a set."""
    # An empty set is written as "set()", which literal_eval only accepts on
    # Python 3.9+; handle it explicitly so older interpreters agree.
    if raw.strip() == 'set()':
        return set()
    # SECURITY: this used to be eval(). The column holds text scraped from
    # web pages, and eval() would execute any expression stored there;
    # literal_eval only accepts plain literals.
    return ast.literal_eval(raw)


def _render_report(pairs):
    """Return the HTML document showing ``(tag, count)`` pairs as a table."""
    parts = [
        '<html>',
        # Declare the encoding the file is written in so the non-ASCII
        # header and tag names display correctly.
        '<head><meta charset="utf-8"></head>',
        '<body>',
        '<table border="1" cellspacing="0" cellpadding="0">',
        '<tr>',
        '<td>tag</td>',
        '<td>统计</td>',
        '</tr>',
    ]
    for pair in pairs:
        parts.append('<tr>')
        # Escape cell text: tag names come from scraped pages.
        parts.extend(
            "<td> %s </td>" % html.escape(str(cell)) for cell in pair
        )
        parts.append('</tr>')
    parts.extend(['</table>', '</body>', '</html>'])
    return ''.join(parts)


def get_tags():
    """Count tag occurrences in ``segment`` and write an HTML report.

    Reads the ``tags`` column of every row, tallies the individual tags and
    writes them, most frequent first, as a two-column table to
    ``output1.html``. Any error is printed rather than raised, and the
    database connection is always closed.
    """
    connect = pymysql.connect(**config)
    try:
        with connect.cursor() as cursor:
            cursor.execute("select tags from segment")
            result = cursor.fetchall()

        tags_box = Counter()
        for row in result:
            tags_box.update(_parse_tag_set(row['tags']))

        # Build the whole document first so a parsing error cannot leave a
        # half-written file behind; ``with`` closes the file on any exit.
        report = _render_report(tags_box.most_common())
        with open('output1.html', 'w', encoding='utf-8') as fout:
            fout.write(report)
    except Exception as e:
        print(e)
    finally:
        connect.close()
# Statistics entry point: generate the tag report only when run as a script.
if __name__ == "__main__":
    get_tags()