Scraping Jianshu's Recommended Authors and Storing Their Info in a Database
The code is below (I'm a beginner, so please forgive the rough edges).
The topics it touches on include:
1. Python basics
2. HTML basics
3. HTTP basics
4. SQL basics
5. The requests library
6. The urllib2 library (see the note right after this list; the final program uses requests for its HTTP calls)
7. The BeautifulSoup library
8. The MySQLdb library
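Item 6 lists urllib2, although the program below only uses requests for HTTP. For reference, the same call to the recommended-users endpoint could be written with urllib2 (Python 2) roughly as follows; this is just a sketch for comparison, not part of the final program:

# Sketch only, not part of the final program: the recommended-users request via urllib2.
import json
import urllib2

api = 'https://www.jianshu.com/users/recommended?seen_ids=&count=10&only_unfollowed=true'
response = urllib2.urlopen(api, timeout=10)        # plain GET with a 10-second timeout
data = json.loads(response.read())                 # the endpoint returns JSON
print [user['slug'] for user in data['users']]     # the same 'users'/'slug' fields the code below relies on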
Result: {"average articles": 148, "average fans": 14732, "average following": 110, "average words written": 257475, "average likes": 4871} (a sketch for deriving these figures from the table is at the end of the post)
# coding:utf-8
import requests
import MySQLdb
import json
import time
from bs4 import BeautifulSoup
# Connect to the database and return the connection plus a cursor
def connect_db():
    db = MySQLdb.connect(host='localhost', user='root', passwd='ar281835e67_', db='python', charset='utf8')
    cursor = db.cursor()
    return db, cursor
# Create the table that will hold the author data
def create_table(db, cur):
    sql = """create table author4(
        id int auto_increment not null,
        username varchar(55) not null,
        summary varchar(500),
        carenum int,
        fansnum int,
        articlenum int,
        writenum int,
        likenum int,
        primary key(id))"""
    try:
        cur.execute(sql)
        db.commit()
        print 'Table created successfully'
    except Exception:
        db.rollback()
        print 'Failed to create the table'
# Insert one author's data into the table; num is the running count used for logging
def insert_table(db, cur, username, summary, carenum, fansnum, articlenum, writenum, likenum, num):
    param = [username, summary, carenum, fansnum, articlenum, writenum, likenum]
    sql = "insert into author4(username,summary,carenum,fansnum,articlenum,writenum,likenum) values(%s,%s,%s,%s,%s,%s,%s)"
    try:
        cur.execute(sql, param)
        db.commit()
        print 'Successfully inserted %s records' % num
    except Exception:
        db.rollback()
        print 'Failed to insert the data'
# Build the list of author page URLs to crawl; count is how many recommended users to fetch
def url_collect(count):
    # Recommended-users API; the count parameter controls how many users it returns
    url = 'https://www.jianshu.com/users/recommended?seen_ids=&count=%d&only_unfollowed=true' % count
    try:
        response = requests.get(url, timeout=10)
        res = json.loads(response.content)      # JSON response with a 'users' list
        userList = res['users']
        slugList = []
        for user in userList:
            slugList.append(user['slug'])       # each slug identifies one author's homepage
        print 'Successfully fetched the user slugs'
        userUrl = 'https://www.jianshu.com/u/'
        userUrlList = []
        for slug in slugList:
            userUrlList.append(userUrl + str(slug))
        print 'Successfully built the list of URLs to crawl'
        return userUrlList
    except Exception:
        print 'Failed to fetch data from the recommended-authors API'
# Fetch the user dicts for the URLs in [firstNum, lastNum) and append them to userList
# (used to resume crawling after a failed request in get_userInfo)
def get_userDict(firstNum, lastNum, userUrllist, userList):
    for url in userUrllist[firstNum:lastNum]:
        userInfo = {}
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        username = soup.find('a', attrs={'class': 'name'}).text
        summary = soup.find('div', attrs={'class': 'js-intro'}).text
        # the five .meta-block p elements are: following, followers, articles, words written, likes received
        otherInfo = soup.select('.meta-block p')
        carenum = otherInfo[0].text
        fansnum = otherInfo[1].text
        articlenum = otherInfo[2].text
        writenum = otherInfo[3].text
        likenum = otherInfo[4].text
        userInfo['username'] = username
        userInfo['summary'] = summary
        userInfo['carenum'] = carenum
        userInfo['fansnum'] = fansnum
        userInfo['articlenum'] = articlenum
        userInfo['writenum'] = writenum
        userInfo['likenum'] = likenum
        userList.append(userInfo)
        firstNum = firstNum + 1
        print 'Successfully collected %s user dicts' % firstNum
        time.sleep(5)   # pause between requests to avoid hammering the site
    return userList
# Crawl the author homepages and return a list of user dicts; count is how many recommended users to fetch
def get_userInfo(count):
    userUrllist = url_collect(count)
    userNum = len(userUrllist)
    userList = []
    num = 0
    try:
        print 'Starting to request the recommended author homepages'
        for url in userUrllist[num:userNum]:
            userInfo = {}
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            username = soup.find('a', attrs={'class': 'name'}).text
            summary = soup.find('div', attrs={'class': 'js-intro'}).text
            otherInfo = soup.select('.meta-block p')
            carenum = otherInfo[0].text
            fansnum = otherInfo[1].text
            articlenum = otherInfo[2].text
            writenum = otherInfo[3].text
            likenum = otherInfo[4].text
            userInfo['username'] = username
            userInfo['summary'] = summary
            userInfo['carenum'] = carenum
            userInfo['fansnum'] = fansnum
            userInfo['articlenum'] = articlenum
            userInfo['writenum'] = writenum
            userInfo['likenum'] = likenum
            userList.append(userInfo)
            num = num + 1
            print 'Successfully collected %s user dicts' % num
            time.sleep(5)
        print 'Successfully collected all user info'
        return userList
    except Exception:
        print "Failed to fetch an author's homepage, trying again"
        print 'Trying to fetch user dict number %s' % (num + 1)
        # resume from the URL that failed (index num), keeping what was collected so far
        return get_userDict(num, userNum, userUrllist, userList)
# Top-level crawler: fetch the data and write it into the database
def spider(db, cur, count):
    userList = get_userInfo(count)   # userList is a list of dicts: [{user1}, {user2}, {user3}, ...]
    num = 0
    for user in userList:
        username = user['username']
        summary = user['summary']
        carenum = user['carenum']
        fansnum = user['fansnum']
        articlenum = user['articlenum']
        writenum = user['writenum']
        likenum = user['likenum']
        num = num + 1
        insert_table(db, cur, username, summary, carenum, fansnum, articlenum, writenum, likenum, num)
    print 'Crawling finished'
db_src, cur_src = connect_db()
create_table(db_src, cur_src)
spider(db_src, cur_src, 10)
The console output of a run is shown in the screenshot below; every step is printed as it happens.
(screenshot: console output of the run)
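The averages quoted in the result line at the top are not printed by the program itself. A minimal sketch of how they could be derived from the populated author4 table, reusing the connect_db() helper above, is shown here; print_averages and the db_avg/cur_avg names are just illustrative:

# Sketch only: derive the averages quoted at the top from the author4 table.
# Assumes the table has already been filled by a run of spider().
def print_averages(cur):
    cur.execute("""select avg(carenum), avg(fansnum), avg(articlenum),
                          avg(writenum), avg(likenum) from author4""")
    care, fans, articles, words, likes = cur.fetchone()
    print {'average articles': int(articles), 'average fans': int(fans),
           'average following': int(care), 'average words written': int(words),
           'average likes': int(likes)}

db_avg, cur_avg = connect_db()
print_averages(cur_avg)
db_avg.close()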