#!/usr/bin/python
# coding=utf-8
import requests
import re
# Extract article IDs from a web page.
def getArticleIDStringFromWebAtUrlString(urlString):
    """Return the article IDs linked from the page at ``urlString``.

    Delegates to ``getIDStringFromWebAtUrlString`` with a pattern that
    captures the part after ``/p/`` in each ``<a class="title" ...>`` link.
    """
    articleLinkPattern = '<a class="title" target="_blank" href="/p/(.*?)">'
    return getIDStringFromWebAtUrlString(urlString, articleLinkPattern)
# Extract topic IDs from a web page.
def getTopicIDStringFromWebAtUrlString(urlString):
    """Return the topic IDs linked from the page at ``urlString``.

    Delegates to ``getIDStringFromWebAtUrlString`` with a pattern that
    captures the part after ``/c/`` in each ``<a class="collection-tag" ...>``
    link.
    """
    topicLinkPattern = '<a class="collection-tag" target="_blank" href="/c/(.*?)">'
    return getIDStringFromWebAtUrlString(urlString, topicLinkPattern)
# Collect the strings a regular expression captures in a web page.
def getIDStringFromWebAtUrlString(urlString, regString):
    """Download ``urlString`` and return what ``regString`` captures in it.

    The page is fetched with ``requests.get`` and its text is searched with
    ``re.findall`` using ``re.S`` (so ``.`` also matches newlines).  The
    return value is whatever ``re.findall`` yields: a list with one entry
    per match.
    """
    pageText = requests.get(urlString).text
    return re.findall(regString, pageText, re.S)
# Extract the note ID from an article page.
def getComentsIDStringFromArticleUrlString(urlString):
    """Return the first ``data-note-id`` value found on an article page.

    Despite the function name, the value extracted is the page's note ID
    (the ``data-note-id`` attribute), not the ID of a comment.

    Args:
        urlString: URL of the article page to download.

    Returns:
        The text captured by the first ``data-note-id="...">`` occurrence.

    Raises:
        IndexError: if the page contains no ``data-note-id`` attribute.  The
            exception type is the same one that indexing an empty match list
            produced; only the message is more descriptive.
    """
    page = requests.get(urlString)
    # Only the first occurrence is used, so stop at the first match instead
    # of collecting every match on the page.
    firstMatch = re.search('data-note-id="(.*?)">', page.text, re.S)
    if firstMatch is None:
        raise IndexError('no data-note-id attribute found at %s' % urlString)
    return firstMatch.group(1)
# Extract the article ID from an article URL.
def getArticleIDStringFromArticleUrlString(urlString):
    """Strip the article URL prefix from ``urlString``.

    When ``http://www.jianshu.com/p/`` appears anywhere in ``urlString``,
    every occurrence of it is removed and the remainder is returned.
    Otherwise the literal message ``'this is not a articleUrlString'`` is
    returned; no exception is raised.
    """
    articlePrefix = 'http://www.jianshu.com/p/'
    if articlePrefix not in urlString:
        return 'this is not a articleUrlString'
    return urlString.replace(articlePrefix, '')
# Post a comment on an article: article URL, comment text.
def commentArticle(articleUrlString, commentString):
    """Post ``commentString`` as a comment on the article at ``articleUrlString``.

    The article page is fetched first to obtain its note ID, then the
    comment is POSTed as JSON (``{"content": commentString}``) to
    ``http://www.jianshu.com/notes/<note id>/comments``.  The response's
    status code and reason are printed; nothing is returned.
    """
    # NOTE(review): the Cookie header value (with its remember_user_token and
    # _session_id entries) is hard-coded in source; consider loading it from
    # configuration instead.
    sessionCookie = 'remember_user_token=W1sxOTI1NjJdLCIkMmEkMTAkeUdIRkRGdFN4L0RpZldKV3lKalR6ZSIsIjE1MDMzMjY2NTMuODY3NTI3Il0%3D--7255eb9eb5c5465a41781ef8e5772b10e1b2d72e; _ga=GA1.2.714896647.1497170140; _gid=GA1.2.462532388.1503150885; Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1503323642,1503323726,1503325746,1503326272; Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1503326790; _session_id=ZkhpL1IzRzA4bEs2dTZRRlhSTlVaQ3JFMWQ0enc4akxtSjh1dlUrSlpCUGpJZGltZG42UDdDMmtodllJbkovZFZ1eEkxVlNjZDUzVG1RdVc1cysyQUZTVUxFZFBUUC9zL3dIZjNVdFJVeERsZWpET2NRK0J3dHFnN21xRkE3QlFuYlBPU1RZc09xTXFJRlBFNGwrU1U3Q0gzQnpTRnNmZXVvNmR1dzJOZ2J0MU5KMkQ4cnd5OTlpYlIyZXRyb21rdjBMeE9xS1lPWGRSL2t0UXlyeDN2bml2TWNwMDF4akpGSkVQaW5iSXBDdWJCNUxrZEJ3dGJDZUhnOU50a2wyV2N2b3g0dXkvd0M5VWkzNTh6eUtPRUdTZUJuUHo2REg1dXhqeGpNOExubm40enU5U3BPNG1xdnlOK0dxYTJFeU5GZVQ3OTlFTW1GdndUS043T0N6QlpGczY4VFJCYW5QTDlxUzRMOGJYN3RyeU05N2cvS0lDWDNidVJZNkgwbmF3aHVMY083TWgrc2RsRHRYVDBjYjIwZVJTdXRHc21MWnk4Z0I3VFdMWHNmOD0tLU8zaHdXcEFUZjNaLzVINWpIdUZnK0E9PQ%3D%3D--4ff5e23579bb725201c91a9bfa6d7bb5975bd076'
    articleID = getArticleIDStringFromArticleUrlString(articleUrlString)
    noteID = getComentsIDStringFromArticleUrlString(articleUrlString)
    commentUrl = 'http://www.jianshu.com/notes/%s/comments' % noteID
    # Request headers sent with the comment POST.
    requestHeaders = {
        'Host': 'www.jianshu.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Accept': 'application/json',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Content-Type': 'application/json',
        'Cookie': sessionCookie,
        'Referer': 'http://www.jianshu.com/p/%s' % articleID,
        'Origin': 'http://www.jianshu.com',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
    }
    payload = {"content": commentString}
    response = requests.post(url=commentUrl, headers=requestHeaders, json=payload)
    # A single pre-formatted argument prints the same "<code> <reason>" line
    # whether print is a statement or a function.
    print('%s %s' % (response.status_code, response.reason))
# Like the articles in a topic.
def likeTopic(topicUrlString):
    """Like each article linked from the topic page at ``topicUrlString``.

    Article IDs are scraped from the topic page and handed to ``likeArticle``
    one at a time, in the order they were found.
    """
    # Liking an article is implemented much like commenting on one;
    # likeArticle is given the bare article ID here.
    for articleID in getArticleIDStringFromWebAtUrlString(topicUrlString):
        likeArticle(articleID)
# Like a single article.
def likeArticle(articleUrlString):
    """Like the article identified by ``articleUrlString``.

    ``articleUrlString`` may be a full article URL or a bare article ID: when
    it does not contain ``http://www.jianshu.com/p/``, that prefix is
    prepended.  The article page is fetched to obtain its note ID, then an
    empty JSON object is POSTed to
    ``http://www.jianshu.com/notes/<note id>/like``.  The response's status
    code and reason are printed; nothing is returned.
    """
    # NOTE(review): the Cookie header value (with its remember_user_token and
    # _session_id entries) is hard-coded in source; consider loading it from
    # configuration instead.
    sessionCookie = 'remember_user_token=W1sxOTI1NjJdLCIkMmEkMTAkeUdIRkRGdFN4L0RpZldKV3lKalR6ZSIsIjE1MDMzMjY2NTMuODY3NTI3Il0%3D--7255eb9eb5c5465a41781ef8e5772b10e1b2d72e; _ga=GA1.2.714896647.1497170140; _gid=GA1.2.462532388.1503150885; Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1503323642,1503323726,1503325746,1503326272; Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1503326790; _session_id=ZkhpL1IzRzA4bEs2dTZRRlhSTlVaQ3JFMWQ0enc4akxtSjh1dlUrSlpCUGpJZGltZG42UDdDMmtodllJbkovZFZ1eEkxVlNjZDUzVG1RdVc1cysyQUZTVUxFZFBUUC9zL3dIZjNVdFJVeERsZWpET2NRK0J3dHFnN21xRkE3QlFuYlBPU1RZc09xTXFJRlBFNGwrU1U3Q0gzQnpTRnNmZXVvNmR1dzJOZ2J0MU5KMkQ4cnd5OTlpYlIyZXRyb21rdjBMeE9xS1lPWGRSL2t0UXlyeDN2bml2TWNwMDF4akpGSkVQaW5iSXBDdWJCNUxrZEJ3dGJDZUhnOU50a2wyV2N2b3g0dXkvd0M5VWkzNTh6eUtPRUdTZUJuUHo2REg1dXhqeGpNOExubm40enU5U3BPNG1xdnlOK0dxYTJFeU5GZVQ3OTlFTW1GdndUS043T0N6QlpGczY4VFJCYW5QTDlxUzRMOGJYN3RyeU05N2cvS0lDWDNidVJZNkgwbmF3aHVMY083TWgrc2RsRHRYVDBjYjIwZVJTdXRHc21MWnk4Z0I3VFdMWHNmOD0tLU8zaHdXcEFUZjNaLzVINWpIdUZnK0E9PQ%3D%3D--4ff5e23579bb725201c91a9bfa6d7bb5975bd076'
    articlePrefix = 'http://www.jianshu.com/p/'
    # Accept a bare article ID by turning it into a full article URL.
    if articlePrefix in articleUrlString:
        articleUrl = articleUrlString
    else:
        articleUrl = articlePrefix + articleUrlString
    noteID = getComentsIDStringFromArticleUrlString(articleUrl)
    likeUrl = 'http://www.jianshu.com/notes/%s/like' % noteID
    requestHeaders = {
        'Host': 'www.jianshu.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Accept': 'application/json',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Content-Type': 'application/json',
        'Cookie': sessionCookie,
        'Referer': articleUrl,
        'Origin': 'http://www.jianshu.com',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
    }
    payload = {}
    response = requests.post(url=likeUrl, headers=requestHeaders, json=payload)
    # A single pre-formatted argument prints the same "<code> <reason>" line
    # whether print is a statement or a function.
    print('%s %s' % (response.status_code, response.reason))
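# NOTE: with the approach above we run into a problem: we cannot get all of
# a user's articles or all of a topic's articles.  Workaround: save the web
# page locally, then scrape the saved page source.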
# Parse article IDs out of a locally saved HTML file.
def getArticalIDStringFromLocalHtmlFile(localHtmlFilePath):
    """Return the article IDs linked from a saved HTML page.

    Args:
        localHtmlFilePath: Path of the HTML file.  A leading ``~`` is
            expanded to the user's home directory, so ``~/Desktop/x.htm``
            works as well as an absolute path.

    Returns:
        A list holding, for every ``<a class="title" ...>`` link in the file,
        the part of its ``href`` after ``http://www.jianshu.com/p/``, in the
        order the links appear.
    """
    import os  # local import: only this function needs it

    # expanduser leaves paths without a leading "~" untouched.
    with open(os.path.expanduser(localHtmlFilePath), "r") as htmlFile:
        pageText = htmlFile.read()
    return re.findall(
        '<a class="title" target="_blank" href="http://www.jianshu.com/p/(.*?)">',
        pageText,
        re.S
    )
# Script entry: like every article listed in a locally saved page.  The path
# is spelled out in full rather than relying on "~" expansion.
articalIDStrings = getArticalIDStringFromLocalHtmlFile('/Users/mac/Desktop/test.htm')
for articalIDString in articalIDStrings:
    likeArticle(articalIDString)