import requests,csv
from lxml import etree
# Get the total page count, then crawl every listing page.
def get_number(url):
    """Fetch the topic's landing page, derive the number of listing pages
    from the article count, crawl each page, and write the collected
    per-author statistics to CSV.

    Relies on module-level globals defined in ``__main__``:
    ``numbers_url`` (paginated URL template), ``pd`` (author -> list of
    titles), and ``writer`` (csv writer).

    :param url: URL of the topic's landing page.
    """
    res = requests.get(url).text
    select = etree.HTML(res)
    # NOTE(review): assumes the "info" div text is "<count> ... · ..." and
    # the segment before '·' holds the article count — confirm against markup.
    text = select.xpath('//div[@class="info"]/text()')[0]
    count_text = text.split('·')[0]  # keep the part before the '·' separator
    number = int(OnlyCharNum(count_text))
    # 10 articles per listing page; range() below is exclusive, hence +1/+2.
    page = number // 10 + 2 if number % 10 > 0 else number // 10 + 1
    for i in range(1, page):
        number_url = numbers_url.format(i)
        get_article(number_url)
    print(pd)
    for k, v in pd.items():
        writer.writerow((k, len(v), v))
def get_article(url):
    """Fetch one listing page and record each (author, title) pair into the
    module-level ``pd`` dict (author -> list of submitted titles).

    :param url: URL of a paginated topic listing page.
    """
    res = requests.get(url).text
    select = etree.HTML(res)
    names = select.xpath('//a[@class="blue-link"]/text()')
    titles = select.xpath('//a[@class="title"]/text()')
    # NOTE(review): assumes the two node lists line up one-to-one per
    # article — confirm against the page markup.
    for name, title in zip(names, titles):
        pd.setdefault(name, []).append(title)
# Keep only the digits of a string.
def OnlyCharNum(s, oth=''):
    """Return only the decimal digit characters of *s*, as a string.

    Bug fix: the original lower-cased a copy (``s2``) but stripped
    characters from the original ``s``, so uppercase letters were never
    removed and ``int()`` on the result could raise ValueError.

    :param s: input string, e.g. an article-count label.
    :param oth: unused; kept for backward compatibility with callers.
    :return: the digits of *s* concatenated in order (may be empty).
    """
    return ''.join(c for c in s if c in '0123456789')
if __name__ == '__main__':
    # author -> list of submitted titles, filled by get_article().
    pd = {}
    # newline='' stops the csv module writing blank rows on Windows;
    # the with-block guarantees the file is flushed and closed (the
    # original leaked the handle).
    with open('zhuanti.csv', 'w+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(('简书名', '次数', '提交的title集合'))
        baseurl = 'http://www.jianshu.com/c/1b31f26b6af0'
        numbers_url = 'http://www.jianshu.com/c/1b31f26b6af0?order_by=added_at&page={}'
        get_number(baseurl)
# --- Scraped-article residue below (not code); kept as comments so the file stays runnable ---
# 【Python爬虫】爬取专题所有文章和提交次数
# 最后编辑于 :
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
# 推荐阅读更多精彩内容
# - 不止一次让我感谢一个人,那就是程程老师!从零基础开始学起Python爬虫过程中,从完全无知,到逐渐了解了专业的一些...