下载简书交友的图片.网站 https://www.jianshu.com/c/bd38bd199ec6
import urllib.request
import urllib.parse
import re
import os
import random
def get_road(url0):
    """Fetch a listing page and return the article links found on it.

    Args:
        url0: URL of the listing page to download.

    Returns:
        A list with the text captured from every
        ``<a class="title" target="_blank" href="...">`` anchor, in the
        order the anchors occur in the page.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    req = urllib.request.Request(url0)
    # Send a browser-like User-Agent header with the request.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    pattern = re.compile(r'<a class="title" target="_blank" href="(.*?)"')
    return pattern.findall(html)
def get_jiaoyou_url(result, s0):
    """Return the entries of *result* prefixed with *s0*.

    Delegates to geturl(), which also prints each resulting URL and the
    total count.
    """
    return geturl(result, s0)
def gethtml(ur):
    """Download the page at *ur* and return its body decoded as UTF-8.

    Args:
        ur: URL of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    req = urllib.request.Request(ur)
    # Send a browser-like User-Agent header with the request.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")
def getpath(html):
    """Extract PNG image addresses from an article page.

    Returns, in order of occurrence, the text captured between
    ``<img data-original-src="`` and the nearest following ``.png"``
    (the ``.png`` suffix is part of each returned value).
    """
    return re.findall(r'<img data-original-src="(.*?\.png)"', html)
def geturl(url, s):
    """Prefix every entry of *url* with *s*, print the results, and return them.

    Args:
        url: iterable of URL fragments; each one is converted with ``str()``.
        s: string placed in front of every fragment.

    Returns:
        A new list containing ``s + str(fragment)`` for each fragment.
    """
    urls = [s + str(fragment) for fragment in url]
    # Iterate the list directly instead of indexing through range(len(...)).
    for full_url in urls:
        print(full_url)
    print("url_length=", len(urls))
    return urls
def download(urls, save_dir='/home/dflx/下载/jiaoyou_photo/'):
    """Download every image in *urls* into *save_dir* as ``<counter>.png``.

    File names are taken from the module-level counter ``x``, which is
    incremented once per saved file.

    Args:
        urls: list of image URLs to fetch.
        save_dir: directory the files are written to; it is created when
            missing. Defaults to the path the script originally hard-coded.

    Returns:
        The number of files saved by this call (``0`` when *urls* is empty).
    """
    global x
    print("++++++++++++++++")
    print(urls)
    print("length=", len(urls))
    if not urls:
        print("not download")
        return 0
    # urlretrieve does not create missing directories.
    os.makedirs(save_dir, exist_ok=True)
    for url in urls:
        filename = os.path.join(save_dir, str(x) + '.png')
        urllib.request.urlretrieve(url, filename)
        x += 1
        # NOTE(review): indentation was lost in the source; the running
        # count is assumed to be printed after every file - confirm.
        print(x)
    return len(urls)
def download_one(url):
    """Download the PNG images referenced by a single article page.

    Example article URL: https://www.jianshu.com/p/407dac18983c
    """
    # Each extracted address is prefixed with 'https:' before downloading.
    image_urls = geturl(getpath(gethtml(url)), 'https:')
    download(image_urls)
def download_all(urls):
    """Call download_one() for each article URL in *urls*, printing progress."""
    print(len(urls))
    print('---------------')
    for article_url in urls:
        print(article_url)
        download_one(article_url)
    # NOTE(review): indentation was lost in the source; this marker is
    # assumed to be printed once after the loop - confirm.
    print("********")
def page(url, start, end):
    """Build the paginated URLs ``url + str(i)`` for ``start <= i < end``.

    Example base URL:
    https://www.jianshu.com/c/bd38bd199ec6?order_by=added_at&page=

    Args:
        url: base URL the page number is appended to.
        start: first page number (inclusive).
        end: last page number (exclusive).

    Returns:
        The list of generated URLs; empty when ``start >= end``.
    """
    print("$$$$$$$$$")
    lturl = [url + str(i) for i in range(start, end)]
    # NOTE(review): indentation was lost in the source; the list is assumed
    # to be printed once after it is complete - confirm.
    print(lturl)
    return lturl
# Running counter used by download() to number the saved image files.
x = 0


def main():
    """Crawl the Jianshu collection listing pages and download their images.

    For every listing page (page numbers 0..998) the article links are
    extracted, prefixed with the site root and passed to download_all().

    To try a single article instead, call for example
    download_one('https://www.jianshu.com/p/189d1b8101e6').
    """
    urpath = "https://www.jianshu.com/c/bd38bd199ec6?order_by=added_at&page="
    for url in page(urpath, 0, 999):
        print("the end url")
        print(url)
        result = get_road(url)
        allurls = get_jiaoyou_url(result, 'https://www.jianshu.com')
        download_all(allurls)


# NOTE(review): the source had `def main():` immediately followed by this
# guard with the crawl code after it; reconstructed as a main() body plus a
# guarded call - confirm this matches the intended layout.
if __name__ == '__main__':
    main()
下载的图片共约500MB,应该大致遍历了所有的文章。
爬取 http://www.mm29.com/ ,下载图片
import urllib.request
import urllib.parse
import re
import os
import random
def get_road(url0):
    """Fetch the page at *url0* and return the tag names linked from it.

    Args:
        url0: URL of the page to download.

    Returns:
        A list with the text captured after ``http://www.mm29.com/tag/`` in
        every matching ``<a href="...">`` attribute, in order of occurrence.
        The list is also printed.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    req = urllib.request.Request(url0)
    # Send a browser-like User-Agent header with the request.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    pattern = re.compile(r'<a href="http://www.mm29.com/tag/(.*?)"')
    result = pattern.findall(html)
    print(result)
    return result
def get_jiaoyou_url(result, s0):
    """Return the entries of *result* prefixed with *s0*.

    Delegates to geturl(), which also prints each resulting URL and the
    total count.
    """
    return geturl(result, s0)
def gethtml(ur):
    """Download the page at *ur* and return its body decoded as UTF-8.

    Args:
        ur: URL of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    req = urllib.request.Request(ur)
    # Send a browser-like User-Agent header with the request.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")
def getpath(html):
    """Extract the image keys of all lazily loaded pictures in *html*.

    A key is the part of the ``data-original`` address between
    ``http://img.mm29.com/images/`` and the trailing ``/800.jpg``; it ends
    in ``.jpg`` itself.

    Args:
        html: page source to scan.

    Returns:
        The list of captured keys in order of occurrence. The list is also
        printed.
    """
    # Dots are escaped so they only match a literal '.', not any character.
    reg = (r'<img class="scrollLoading" '
           r'data-original="http://img\.mm29\.com/images/(.*?\.jpg)/800\.jpg"')
    urls = re.findall(reg, html)
    print("////////////////////////")
    print(urls)
    return urls
def geturl(url, s0):
    """Prefix every entry of *url* with *s0*, print the results, and return them.

    Args:
        url: iterable of URL fragments; each one is converted with ``str()``.
        s0: string placed in front of every fragment.

    Returns:
        A new list containing ``s0 + str(fragment)`` for each fragment.
    """
    urls = [s0 + str(fragment) for fragment in url]
    # Iterate the list directly instead of indexing through range(len(...)).
    for full_url in urls:
        print(full_url)
    print("url_length=", len(urls))
    return urls
def get_imag_url(url, s1, s2):
    """Wrap every entry of *url* in the prefix *s1* and the suffix *s2*.

    Args:
        url: iterable of URL fragments; each one is converted with ``str()``.
        s1: string placed in front of every fragment.
        s2: string appended after every fragment.

    Returns:
        A new list containing ``s1 + str(fragment) + s2`` for each fragment.
        Every resulting URL and the total count are printed.
    """
    # One pass builds the final URL; the original made two passes.
    urls = [s1 + str(fragment) + s2 for fragment in url]
    for full_url in urls:
        print(full_url)
    print("url_length=", len(urls))
    return urls
def download(urls, save_dir='/home/dflx/下载/mm_picture/'):
    """Download every image in *urls* into *save_dir* as ``<counter>.jpg``.

    File names are taken from the module-level counter ``x``, which is
    incremented once per saved file.

    Args:
        urls: list of image URLs to fetch.
        save_dir: directory the files are written to; it is created when
            missing. Defaults to the path the script originally hard-coded.

    Returns:
        The number of files saved by this call (``0`` when *urls* is empty).
    """
    global x
    print("++++++++++++++++")
    print(urls)
    print("length=", len(urls))
    if not urls:
        print("not download")
        return 0
    # urlretrieve does not create missing directories.
    os.makedirs(save_dir, exist_ok=True)
    for url in urls:
        filename = os.path.join(save_dir, str(x) + '.jpg')
        urllib.request.urlretrieve(url, filename)
        x += 1
        # NOTE(review): indentation was lost in the source; the running
        # count is assumed to be printed after every file - confirm.
        print(x)
    return len(urls)
def download_one(mm_url):
    """Download all pictures found on the page at *mm_url*.

    The image keys extracted by getpath() are expanded to full addresses
    with get_imag_url() and then handed to download().
    """
    print("*****++++++")
    print(mm_url)
    image_keys = getpath(gethtml(mm_url))
    download(get_imag_url(image_keys, "http://img.mm29.com/images/", "/800.jpg"))
def download_all(urls):
    """Call download_one() for each page URL in *urls*, printing progress."""
    print("for count=", len(urls))
    print('---------------')
    for page_url in urls:
        print(page_url)
        download_one(page_url)
    # NOTE(review): indentation was lost in the source; this marker is
    # assumed to be printed once after the loop - confirm.
    print("********")
def page(url, start, end):
    """Build the paginated URLs ``url + '/' + str(i)`` for ``start <= i < end``.

    Args:
        url: base URL; a slash and the page number are appended to it.
        start: first page number (inclusive).
        end: last page number (exclusive).

    Returns:
        The list of generated URLs; empty when ``start >= end``.
    """
    print("$$$$$$$$$")
    lturl = [url + '/' + str(i) for i in range(start, end)]
    # NOTE(review): indentation was lost in the source; the list is assumed
    # to be printed once after it is complete - confirm.
    print(lturl)
    return lturl
# Running counter used by download() to number the saved image files.
x = 0


def main():
    """Crawl up to 23 tag sections of mm29.com and download their pictures.

    The tag names are read from the home page, turned into tag URLs, and
    pages 0..15 of each tag are passed to download_all().
    """
    url0 = "http://www.mm29.com/"
    s0 = "http://www.mm29.com/tag/"
    result = get_road(url0)
    mm_url = get_jiaoyou_url(result, s0)
    # The original indexed mm_url[22] and range(23) unconditionally, which
    # raises IndexError when fewer than 23 tags are found; the same cap is
    # kept here but applied with a slice.
    max_tags = 23
    if len(mm_url) >= max_tags:
        print("88888", mm_url[max_tags - 1])
    for tag_url in mm_url[:max_tags]:
        url = page(tag_url, 0, 16)
        download_all(url)
    # NOTE(review): nothing in the visible source calls main(); confirm how
    # the script is meant to be started.
```
下载了5000多张,大约也是500MB。网站应该有反爬措施,我离开之后就被禁止访问了。