import requests, random, datetime, re, os, time, base64, pymssql
from lxmlimport etree
from fontTools.ttLibimport TTFont
from ioimport BytesIO
# Make sure the directory that receives the crawler's log files exists.
# exist_ok=True replaces the separate os.path.exists() check, which could
# race with another process creating the directory in between.
os.makedirs("allUrl", exist_ok=True)
def readfile(path):
    """Read a text file and return its lines with surrounding whitespace removed.

    The file is decoded as UTF-8 and undecodable bytes are dropped
    (``errors='ignore'``).

    Args:
        path: Path of the file to read.

    Returns:
        A list containing one stripped string per line of the file.
    """
    # The context manager closes the handle deterministically; the previous
    # version never closed the file it opened.
    with open(path, encoding='utf-8', errors='ignore') as fp:
        return [line.strip() for line in fp]
def savefile(savepath, content):
    """Append ``content`` followed by a CRLF terminator to ``savepath``.

    The file is opened in append mode as UTF-8 with newline translation
    disabled, so the ``"\\r\\n"`` terminator is written exactly as given.

    Args:
        savepath: Path of the file to append to (created if missing).
        content: Text to write; must be a ``str``.
    """
    # ``with`` guarantees the handle is flushed and closed even when the
    # write raises, which the manual flush()/close() sequence did not.
    with open(savepath, 'a+', encoding='utf8', newline="", errors='ignore') as fp:
        fp.write(content + "\r\n")
def make_font_file(base64_string: str):
    """Decode a base64 string, dump the bytes to ``text.otf`` and return them.

    Args:
        base64_string: Base64 text of the font payload.

    Returns:
        The decoded bytes (the same bytes that were written to ``text.otf``
        in the current working directory).
    """
    encoded = base64_string.encode()
    font_bytes = base64.decodebytes(encoded)
    with open('text.otf', 'wb') as out:
        out.write(font_bytes)
    return font_bytes
def get_num(string, html):
    """Translate each character of ``string`` into a number via the page font.

    The code-point -> name mapping is obtained from ``base_64(html)``. For
    every character, the last two characters of its mapped name are parsed
    as an integer and reduced by one.

    Args:
        string: Characters to translate (each must be present in the mapping).
        html: Page source that embeds the base64 font.

    Returns:
        A list of ints, one per character of ``string``.
    """
    cmap = base_64(html)
    return [int(cmap[ord(ch)][-2:]) - 1 for ch in string]
def base_64(html):
    """Extract the base64 font embedded in ``html`` and return its cmap dict.

    Side effects: the decoded font is written to ``text.otf`` (by
    ``make_font_file``) and an XML dump of it to ``text.xml``.

    Args:
        html: Page source containing a
            ``'data:application/font-ttf;charset=utf-8;base64,...'`` literal.

    Returns:
        The ``cmap`` dictionary of the font's first cmap subtable
        (code point -> glyph name).
    """
    pattern = re.compile(r"'data:application/font-ttf;charset=utf-8;base64,(.*?)'", re.I)
    base64_str = "".join(pattern.findall(html))
    # Decode and load the font once. The previous version decoded, wrote and
    # parsed the same payload twice and fetched an unused glyph order.
    font = TTFont(BytesIO(make_font_file(base64_str)))
    font.saveXML("text.xml")
    # Direct access to the first subtable; the old
    # ``.ttFont.tables['cmap'].tables[0]`` detour led back to this same object.
    return font['cmap'].tables[0].cmap
def sqlInfo(data, table):
    """Insert one record into a SQL Server table.

    Args:
        data: Mapping of column name -> value; keys become the column list
            and values are bound as query parameters.
        table: Name of the target table.

    A failing INSERT is printed, appended to ``./allUrl/ErUrl.log`` and
    rolled back; it is not re-raised. Errors raised while connecting or
    creating the cursor propagate to the caller.
    """
    conn = pymssql.connect(host="172.30.100.148", user="", password="", database="LiangZB", charset='utf8')
    try:
        cur = conn.cursor()
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        # Values are bound as parameters, but identifiers cannot be: the
        # table and column names are interpolated and must come from trusted
        # code, never from scraped input.
        sql = "INSERT INTO {0} ({1}) VALUES ({2})".format(table, keys, values)
        try:
            cur.execute(sql, tuple(data.values()))
            conn.commit()
        except Exception as ex:
            print("错误在这>>>>>", ex, "<<<<<错误在这")
            savefile("./allUrl/ErUrl.log", str(ex))
            conn.rollback()
        finally:
            cur.close()
    finally:
        # Always release the connection; previously it stayed open whenever
        # the INSERT failed.
        conn.close()
def getUA():
    """Build the request headers, choosing a random User-Agent each call.

    Returns:
        A dict with the keys ``cookie``, ``referer``,
        ``upgrade-insecure-requests`` and ``user-agent`` (in that order).
    """
    agent_pool = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    )
    # Fixed session cookie captured from a browser; sent verbatim.
    cookie = "als=0; wmda_uuid=c5544afe7d0808fc59edf66a50d80a4c; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; lps=https%3A%2F%2Fsu.zu.anjuke.com%2F%3Ffrom%3Dnavigation%7C; sessid=822172B8-0C58-5884-73B2-B62464C991EE; ajk_member_captcha=7ee0c936789ed4b80f136496306be275; __xsptplus8=8.23.1563787771.1563787774.2%232%7Csp0.baidu.com%7C%7C%7C%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%7C%23%230rqgP09ab8EK59L-eUjI82DRSDo0GvqE%23; _ga=GA1.2.1380953827.1562922627; _gid=GA1.2.1974631730.1563787776; 58tj_uuid=3fd91ac1-3f8d-42c9-9372-37c8f91fb5a0; new_uv=19; twe=2; ctid=26; aQQ_ajkguid=CA200195-1F12-F331-0163-0E0C21DC28DB; wmda_session_id_6289197098934=1563844517440-3ce63403-a0fd-d14f; xzfzqtoken=jfw1KwG52vwOvUY%2B7FXZkEz77v59E%2BTTnz6i1wVuH13eI6X6c03vXAKSyKTxfDxXin35brBb%2F%2FeSODvMgkQULA%3D%3D"
    request_headers = {}
    request_headers["cookie"] = cookie
    request_headers["referer"] = "https://zhengzhou.anjuke.com/"
    request_headers["upgrade-insecure-requests"] = "1"
    request_headers["user-agent"] = random.choice(agent_pool)
    return request_headers
def response(url, num_retries=3, timeout=30):
    """Fetch ``url`` and return the decoded page text, or ``None`` on failure.

    Args:
        url: Address to request.
        num_retries: How many more times to retry after a non-2xx status.
        timeout: Seconds passed to ``requests.get``. The previous hard-coded
            ``timeout=None`` let a stalled server block the crawler forever.

    Returns:
        The page text, decoded as UTF-8 when the detected encoding is
        ``utf-8``/``UTF-8``/``Windows-1254`` and as GBK otherwise (bad bytes
        are ignored). ``None`` when the status is still an error after all
        retries, or on ``ConnectionError`` (which is not retried).

    Raises:
        Any other ``requests`` exception (for example a read timeout)
        propagates to the caller.
    """
    try:
        res = requests.get(url, headers=getUA(), allow_redirects=True, timeout=timeout)
        # Small random pause between requests.
        time.sleep(random.uniform(0.5, 0.8))
        res.raise_for_status()  # raises HTTPError for 4xx/5xx responses
        res.encoding = res.apparent_encoding
        if res.encoding in ("utf-8", "UTF-8", "Windows-1254"):
            codec = "utf-8"
        else:
            codec = "GBK"
        return res.content.decode(codec, "ignore")
    except requests.HTTPError:
        # Bad status: retry with a decremented budget until it runs out.
        if num_retries > 0:
            return response(url, num_retries - 1, timeout)
        return None
    except requests.exceptions.ConnectionError:
        # Raised e.g. when the host does not exist; retrying is pointless.
        return None
def parse(html, regu):
    """Parse ``html`` with lxml and return the result of the XPath ``regu``."""
    document = etree.HTML(html)
    return document.xpath(regu)
def _xpath_text(html, xpath):
    """Return the stripped concatenation of the strings ``xpath`` selects in ``html``."""
    return "".join(parse(html, xpath)).strip()


def _decode_obfuscated(text, html):
    """Replace the characters of ``text`` that appear in ``type_list`` with digits.

    Characters listed in the module-level ``type_list`` are translated with
    ``get_num``; every other character is copied unchanged.
    """
    hidden = "".join(ch for ch in text if ch in type_list)
    if not hidden:
        return text
    # A single get_num call translates all hidden characters, so the embedded
    # font is parsed once per field instead of once per character.
    digits = iter(get_num(hidden, html))
    return "".join(str(next(digits)) if ch in type_list else ch for ch in text)


def getHTMLText(url, ss):
    """Scrape one listing detail page and insert the record into ``AnJuKe``.

    The page is fetched with ``response``, its fields are extracted with
    XPath/regex, the community page linked from the listing is fetched for
    the address and build year, and the resulting dict is handed to
    ``sqlInfo``. The whole sequence is attempted up to three times; after the
    third failure the URL and error are appended to
    ``./allUrl/anJKeUrl.log``.

    Args:
        url: URL of the listing detail page.
        ss: Running counter supplied by the caller; only used in messages.
    """
    for attempt in range(1, 4):
        try:
            html = response(url)
            AnJuKeItem = {}
            AnJuKeItem["城市"] = _xpath_text(html, '//div[@class="city-view"]/text()')
            AnJuKeItem["编号"] = url.split("?")[0].split("fangyuan/")[1]
            AnJuKeItem["租赁方式"] = _xpath_text(html, '//ul[@class="title-label cf"]/li[1]/text()')
            AnJuKeItem["项目名称"] = _xpath_text(html, '//div[@class="lbox"]/ul[1]/li[8]/a[1]/text()')
            AnJuKeItem["区域"] = _xpath_text(html, '//div[@class="lbox"]/ul[1]/li[8]/a[2]/text()')
            AnJuKeItem["板块"] = _xpath_text(html, '//div[@class="lbox"]/ul[1]/li[8]/a[3]/text()')
            AnJuKeItem["地铁"] = _xpath_text(html, '//ul[@class="title-label cf"]/li[3]/text()')

            # Layout, area and rent contain characters from type_list that
            # must be mapped back to digits through the page's embedded font.
            horseType_str = _xpath_text(html, '//div[@class="lbox"]/ul[1]/li[2]/span[2]//text()')
            AnJuKeItem["房型"] = _decode_obfuscated(horseType_str, html)
            area_str = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[3]/span[2]//text()')).replace("平方米", "").strip()
            AnJuKeItem["面积"] = _decode_obfuscated(area_str, html)
            rent_str = _xpath_text(html, '//div[@class="lbox"]/ul[1]/li[1]/span[1]/em//text()')
            AnJuKeItem["租金"] = _decode_obfuscated(rent_str, html)

            AnJuKeItem["装修"] = _xpath_text(html, '//div[@class="lbox"]/ul[1]/li[6]/span[2]//text()')
            AnJuKeItem["朝向"] = _xpath_text(html, '//div[@class="lbox"]/ul[1]/li[4]/span[2]//text()')
            AnJuKeItem["楼层"] = _xpath_text(html, '//div[@class="lbox"]/ul[1]/li[5]/span[2]//text()')
            AnJuKeItem["类型"] = _xpath_text(html, '//div[@class="lbox"]/ul[1]/li[7]/span[2]//text()')

            # The first two lines of the map-initialisation script carry the
            # two coordinates as "name: value," pairs.
            pattern = re.compile(r'var instance = new anjuke.Ajax.MapInPV\("map-canvas", {\n(.*?)</script>', re.I | re.DOTALL | re.M | re.S)
            baiduLine_list = "".join(pattern.findall(html)).strip().split("\n")
            AnJuKeItem["百度经度"] = baiduLine_list[0].split(":")[1].replace(",", "").strip()
            AnJuKeItem["百度纬度"] = baiduLine_list[1].split(":")[1].replace(",", "").strip()

            # Publication date: 年/月/日 become "/" and its digits are
            # de-obfuscated like the fields above.
            tellTime_str = re.sub(r"年|月|日", "/", _xpath_text(html, '//div[@class="right-info"]/b/text()'))
            AnJuKeItem["发布时间"] = _decode_obfuscated(tellTime_str, html)

            to_day = datetime.datetime.now()
            AnJuKeItem["采集时间"] = "{}/{}/{}".format(to_day.year, to_day.month, to_day.day)
            AnJuKeItem["品牌"] = _xpath_text(html, '//div[@class="broker-line"]/a/@title')
            AnJuKeItem["采集网站"] = "安居客"

            # Address and build year live on the community page.
            addressUrl = _xpath_text(html, '//div[@class="lbox"]/ul[1]/li[8]/a[1]/@href')
            res1 = response(addressUrl)
            try:
                AnJuKeItem["地址"] = "".join(parse(res1, '//div[@class="comm-title"]/h1/span/text()')).split("-")[1].strip()
                AnJuKeItem["年代"] = _xpath_text(res1, '//*[@id="basic-infos-box"]/dl/dd[5]/text()')
            except Exception as ex:
                cuoWu = _xpath_text(res1, '//*[@id="list-content"]/div[1]/span/text()')
                if "为您找到" in cuoWu:
                    # The link led to a search-result page: no community data.
                    AnJuKeItem["地址"] = ""
                    AnJuKeItem["年代"] = ""
                else:
                    print("三级页面已出现验证码!!!%s" % addressUrl)
                    print(Exception, ":", ex)
                    # Block until the operator has dealt with the captcha.
                    # NOTE(review): neither "地址" nor "年代" is set on this
                    # path, so the summary print below raises KeyError and the
                    # page is retried — confirm that this is intended.
                    print(input("请破解验证码:"))
            print("*" * 100)
            print("二级页面URL:", url)
            print(ss, "\t", AnJuKeItem["城市"], AnJuKeItem["编号"], AnJuKeItem["项目名称"], AnJuKeItem["房型"], AnJuKeItem["装修"], AnJuKeItem["地址"], AnJuKeItem["年代"])
            print("*" * 60)
            sqlInfo(AnJuKeItem, "AnJuKe")
            return
        except Exception as ex:
            if attempt == 3:
                print(ss, url + "\t" + "2级URL报错,存入 anJKeUrl.log日志。。。")
                print(Exception, ":", ex)
                savefile("./allUrl/anJKeUrl.log", url + "|#|" + str(ex) + "2级")
                print("*" * 60)
if __name__ == '__main__':
    print("I'm Working ...")
    print('*****\t当前时间为:{}'.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    st = datetime.datetime.now()
    # Obfuscated characters that stand for digits on listing pages; read as a
    # module-level global by getHTMLText.
    type_list = ['驋', '龒', '龤', '閏', '麣', '鸺', '龥', '齤', '餼', '鑶']
    # Sub-domain prefix -> city name.
    city_dict = {"nb":"宁波", "nc":"南昌", "km":"昆明", "nn":"南宁", "gy":"贵阳", "nt":"南通", "su":"苏州", "zz":"郑州", "yz":"扬州", "wlmq":"乌鲁木齐"}
    ss = 0  # running count of listing URLs handed to getHTMLText
    for key, value in city_dict.items():
        # Listing pages 1..50 of each city.
        for i in range(1, 51):
            url = "https://{}.zu.anjuke.com/fangyuan/p{}/".format(key, i)
            # Up to three tries per listing page; stop at the first success.
            for attempts in range(1, 4):
                try:
                    html = response(url)
                    genUrl_list = parse(html, '//*[@id="list-content"]/div/div[1]/h3/a/@href')
                    print("本网站 {} 第 {} 页共有 {} 个房源URL!".format(value, i, len(genUrl_list)))
                    for startUrl in list(genUrl_list):
                        # Drop everything from the first "&" onwards.
                        startUrl = startUrl.split("&")[0]
                        ss += 1
                        getHTMLText(startUrl, ss)
                    break
                except Exception as ex:
                    if attempts == 3:
                        print(url + "\t" + "1级URL报错,存入anJkUrl.log日志。。。")
                        print(Exception, ":", ex)
                        savefile("./allUrl/anJKeUrl.log", url + "|#|" + str(ex) + "1级")
                        print("*" * 100)
    print("程序执行结束!")
    print('当前时间为:{}'.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    et = datetime.datetime.now()
    print('[info]耗时: %s' % (et - st))
import requests, random,json, re, time, execjs, urllib3
from ioimport BytesIO
from ioimport StringIO
from PILimport Image
# Silence urllib3's warnings. The captcha requests below are sent with
# verify=False — presumably this is meant to hide the resulting
# InsecureRequestWarning output.
urllib3.disable_warnings()
class AJK_Slide_Captcha():
    """Client for the slide captcha shown by anjuke.com / verifycode.58.com.

    ``run`` chains the individual steps: obtain a session id, request the
    captcha, download the background image, locate the slider target, pick a
    recorded drag trace, fetch a fingerprint token, encrypt the answer with
    ``jiami.js`` and submit it.
    """

    # Seconds allowed per HTTP request. The calls previously used
    # timeout=None and could hang forever on a stalled connection.
    request_timeout = 30

    def __init__(self):
        self.headers = {
            "Referer": "https://www.anjuke.com/captcha-verify/?callback=shield&from=antispam",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }

    @staticmethod
    def _parse_jsonp(text):
        """Parse a ``callback(...)`` JSONP response body into a Python object.

        Only the wrapper is removed. The earlier approach deleted every ")"
        in the text, which corrupted payloads containing a parenthesis.
        """
        body = text.strip()
        prefix = "callback("
        if body.startswith(prefix):
            body = body[len(prefix):].rstrip().rstrip(";").rstrip()
            if body.endswith(")"):
                body = body[:-1]
        return json.loads(body)

    def get_sessionId(self, captcha_url):
        """Fetch the captcha page and return the value of its ``sessionId`` field."""
        resp = requests.get(captcha_url, headers=self.headers, verify=False, timeout=self.request_timeout)
        sessionId = re.search('name="sessionId".*?value="(.*?)"', resp.content.decode()).group(1)
        return sessionId

    def get_responseId_bgImgUrl(self, sessionId):
        """Request a captcha for ``sessionId``.

        Returns:
            ``(responseId, bgImgUrl)`` taken from the ``data`` object of the
            JSONP response.
        """
        resp = requests.get("https://verifycode.58.com/captcha/getV3", headers=self.headers, verify=False, timeout=self.request_timeout,
                            params={
                                "callback": "callback",
                                "showType": "embed",
                                "sessionId": sessionId,
                                "_": str(int(time.time() * 1000))
                            })
        captchaData = self._parse_jsonp(resp.text)
        responseId = captchaData["data"]["responseId"]
        bgImgUrl = captchaData["data"]["bgImgUrl"]
        return (responseId, bgImgUrl)

    def get_image(self, bgImgUrl):
        """Download the captcha background and return it as a PIL image."""
        resp = requests.get("https://verifycode.58.com" + bgImgUrl, headers=self.headers, verify=False, timeout=self.request_timeout)
        # resp.content is raw bytes; wrap it in a file-like object for PIL.
        return Image.open(BytesIO(resp.content))

    def get_position(self, image):
        """Locate the slider target in the captcha background.

        The image is resized to 284x160 and converted to greyscale. Columns
        from x=55 are scanned for a point ``(x, y)`` where, for ten
        consecutive rows ``y..y+9``, the pixel at ``(x, y)`` is at least 40
        levels brighter than the pixel one column to the right, and the
        pixels of column ``x`` itself are at least 150.

        Returns:
            ``x - 7`` for the first match in column-major scan order, or
            ``None`` when nothing matches.
            NOTE(review): the reason for the 7 px offset is not visible here.
        """
        image = image.resize((284, 160)).convert('L')
        bright_min = 150    # minimum brightness of the edge column
        contrast_min = 40   # minimum drop towards the next column
        run = 10            # number of consecutive rows that must qualify
        for x in range(55, image.size[0] - 20):
            for y in range(0, image.size[1] - 20):
                is_edge = all(
                    image.getpixel((x, y)) - image.getpixel((x + 1, y + k)) >= contrast_min
                    and image.getpixel((x, y + k)) >= bright_min
                    for k in range(run)
                )
                if is_edge:
                    return x - 7
        return None

    def get_trace(self, xPos, traceTxtPath):
        """Pick a recorded drag trace whose travel distance is within 1 of ``xPos``.

        Each non-blank line of ``traceTxtPath`` is a quoted trace. Its travel
        distance is the first number after the opening quote subtracted from
        the first number of the last ``a,b,c|`` triple before the closing
        quote.

        Returns:
            ``(distance, trace)`` for one randomly chosen matching line, with
            the surrounding quotes removed from ``trace``.

        Raises:
            IndexError: no recorded trace matches ``xPos``.
        """
        candidates = []
        # Read-only access is enough; 'r+' needlessly required write permission.
        with open(traceTxtPath, 'r') as fp:
            for line in fp:
                if line.strip() == '':
                    continue
                # Raw strings avoid the invalid-escape warnings of the old patterns.
                start = int(re.search(r'"(\d+)', line).group(1))
                end = int(re.search(r'(\d+)\,\d+\,\d+\|"', line).group(1))
                if end - start in (xPos, xPos + 1, xPos - 1):
                    candidates.append((end - start, line.strip().strip('"')))
        if not candidates:
            raise IndexError("no recorded trace within 1px of xPos=%r in %s" % (xPos, traceTxtPath))
        lastXpos, trace = random.choice(candidates)
        # NOTE(review): this used to be followed by a loop that tried to nudge
        # random numbers inside the trace, but it stored the result in an
        # unused variable, so the trace was always returned unchanged, and it
        # could spin forever when no number matched. The dead loop is removed;
        # the returned values are the same. Re-introduce jitter deliberately
        # if the service requires it.
        return (lastXpos, trace)

    def get_fpToken(self):
        """Fetch a fingerprint token from cdata.58.com and return it."""
        res2 = requests.get("https://cdata.58.com/fpToken", headers=self.headers, timeout=self.request_timeout, verify=False,
                            params={
                                "callback": "callback",
                            })
        html2 = res2.content.decode("utf-8", "ignore")
        fpToken = html2.split('"token":"')[1].replace('"})', '').strip()
        return fpToken

    def get_jiami_data(self, responseId, fpToken, lastXpos, trace):
        """Encrypt the answer by calling ``getSlideAnswer`` from ``./jiami.js``."""
        # Close the script file once it has been read.
        with open("./jiami.js", "r") as js_file:
            jsCode = execjs.compile(js_file.read())
        jiami_data = jsCode.call("getSlideAnswer", responseId, fpToken, lastXpos, trace)
        return jiami_data

    def slove(self, jiami_data, responseId, sessionId):
        """Submit the encrypted answer and return the raw response text."""
        resp = requests.get("https://verifycode.58.com/captcha/getV3", headers=self.headers, timeout=self.request_timeout, verify=False,
                            params={
                                "data": jiami_data,
                                "responseId": responseId,
                                "sessionId": sessionId,
                                "_": str(int(time.time() * 1000))
                            })
        return resp.text

    def run(self):
        """Run all steps in order, printing the intermediate result of each."""
        # Step 1: read the sessionId from the captcha page.
        sessionId = self.get_sessionId('https://www.anjuke.com/captcha-verify/?callback=shield')
        print('step1: sessionId->', sessionId)
        # Step 2: get responseId and bgImgUrl.
        (responseId, bgImgUrl) = self.get_responseId_bgImgUrl(sessionId)
        print('step2: responseId->', responseId)
        # Step 3: download the background image.
        image = self.get_image(bgImgUrl)
        print('step3: image->', image)
        # Step 4: calculate the slider position.
        position = self.get_position(image)
        print('step4: position->', position)
        # Step 5: choose a drag trace.
        (lastXpos, trace) = self.get_trace(position, traceTxtPath='CaptchaTrace.txt')
        print('step5: lastXpos->', lastXpos, "==", 'trace->', trace,)
        # Step 6: get the fingerprint token.
        fpToken = self.get_fpToken()
        print('step6: fpToken->', fpToken)
        # Step 7: encrypt the answer.
        jiami_data = self.get_jiami_data(responseId, fpToken, lastXpos, trace)
        print('step7: jiami_data->', jiami_data)
        # Step 8: submit the answer.
        responseText = self.slove(jiami_data, responseId, sessionId)
        print('\nstep8: 最后请求结果->', responseText)
if __name__ == '__main__':
    # Script entry point: build the solver and run all steps once.
    captcha_solver = AJK_Slide_Captcha()
    captcha_solver.run()
from urllib.parseimport quote
from lxmlimport etree
import requests, os
def pdf_Dict(monUrl, ss):
    """Download every attachment linked from one shclearing.com index page.

    For each entry of the index page, the detail page is fetched, a
    ``./<year>/<yearmonth>/<date>/`` directory is created from the date shown
    on it, and every attachment named in the page's ``fileNames`` /
    ``descNames`` script variables is downloaded into that directory through
    the site's ``download.jsp`` endpoint.

    Args:
        monUrl: URL of the index page.
        ss: 1-based page counter supplied by the caller; used in messages only.
    """
    headers = {
        "Cookie": "Hm_lvt_d885bd65f967ea9372fc7200bc83fa81=1568943378; Hm_lpvt_d885bd65f967ea9372fc7200bc83fa81=1568944562",
        "Host": "www.shclearing.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
    }
    genUrl = "http://www.shclearing.com/xxpl/fxpl/"
    # Every request now has a finite timeout; previously only the index
    # request did and the others could block forever.
    page_timeout = 10
    download_timeout = 60

    resp = requests.get(monUrl, headers=headers, timeout=page_timeout)
    htmp = resp.content.decode("utf-8", "ignore")
    titleUrl_list = etree.HTML(htmp).xpath('//ul[@class="list"]/li/a/@href')
    for sss, titleUrl in enumerate(titleUrl_list, 1):
        # Index links are relative ("./..."); rebuild the absolute URL.
        pdfUrl = genUrl + titleUrl.split("./")[1]
        res = requests.get(pdfUrl, headers=headers, timeout=page_timeout)
        html = res.content.decode("utf-8", "ignore")
        page = etree.HTML(html)  # parse once, query twice

        # Build the output directory from the date shown on the page.
        dataTime = "".join(page.xpath('//*[@id="content"]/span/text()')).replace("日期:", "").strip()
        yearTime = dataTime.split("-")[0]
        monTime = "".join(dataTime.split("-")[:2]).strip()
        pdfurlPath = "./" + yearTime + "/" + monTime + "/" + dataTime + "/"
        os.makedirs(pdfurlPath, exist_ok=True)

        # POST parameters: the attachment script holds two ";;"-separated
        # lists, server-side file names and download names.
        scriptStr = "".join(page.xpath('//*[@id="content"]/div[@class="attachments"]//text()')).strip()
        fileNames_list = scriptStr.split("var fileNames = '")[1].split("';")[0].replace("./", "").strip().split(";;")
        descNames_list = scriptStr.split("var descNames = '")[1].split("';")[0].strip().split(";;")
        pdfDict = dict(zip(fileNames_list, descNames_list))
        for FileName, DownName in pdfDict.items():
            print("第 {} 页 {} 个URL!post参数:".format(ss, sss), FileName + '\t' + DownName)
            res1 = requests.post(
                "http://www.shclearing.com/wcm/shch/pages/client/download/download.jsp",
                data={
                    "FileName": FileName,
                    "DownName": quote(DownName)
                },
                headers=headers,
                timeout=download_timeout
            )
            # DownName comes from the remote page; keep only its final path
            # component so it cannot point outside the target directory.
            target = os.path.join(pdfurlPath, os.path.basename(DownName))
            with open(target, 'wb') as f:  # the response body is the binary file
                f.write(res1.content)
if __name__ == '__main__':
    # The first five index pages.
    monUrl_list = [
        "http://www.shclearing.com/xxpl/fxpl/index.html",
        "http://www.shclearing.com/xxpl/fxpl/index_1.html",
        "http://www.shclearing.com/xxpl/fxpl/index_2.html",
        "http://www.shclearing.com/xxpl/fxpl/index_3.html",
        "http://www.shclearing.com/xxpl/fxpl/index_4.html",
    ]
    # ss is the 1-based page number passed through for progress messages.
    for ss, monUrl in enumerate(monUrl_list, 1):
        pdf_Dict(monUrl, ss)