#!/usr/bin/python
import threading
from time import ctime,sleep
import pycurl
import urllib2
import sys,os
import StringIO
from lxml import etree
import datetime
import hashlib
# Wall-clock timestamp taken when the script starts; paired with `endtime`
# at the bottom of the file to report total elapsed seconds.
starttime = datetime.datetime.now()
def testf():
    """Fetch http://www.weituanpin.com/ with pycurl and print the raw HTML."""
    buf = StringIO.StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, "http://www.weituanpin.com/")
    # pycurl streams the response body through WRITEFUNCTION; collect it
    # in an in-memory buffer instead of letting it go to stdout.
    curl.setopt(pycurl.WRITEFUNCTION, buf.write)
    curl.perform()
    print(buf.getvalue())
def urllibget(i, j):
    """Worker entry point: download URL *i* and feed the HTML to show_pach.

    *j* is the worker's sequence number; it is not used in the current code
    path (it was only consumed by a now-disabled write_file call).
    """
    page = urllib2.urlopen(i).read()
    show_pach(page, i)
def show_pach(html, url):
    """Parse *html*, find every <img src=...> and download each image.

    Image sources that do not contain "http" are treated as relative and
    resolved against the page URL *url* with urlparse.urljoin.

    BUG FIX: the original used plain string concatenation (`url + src`),
    which produces broken URLs for root-relative paths ("/img/x.jpg" ->
    ".../\/img/x.jpg") and protocol-relative ones ("//cdn/x.jpg");
    urljoin handles all of these correctly.
    """
    import urlparse  # stdlib (Python 2); function-local so this edit is self-contained
    tree = etree.HTML(html)
    # "//img[@src]" searches from the document root, so going through the
    # /html node first (as the original did) is redundant.
    for node in tree.xpath("//img[@src]"):
        src = node.attrib["src"]
        if src.find("http") == -1:
            src = urlparse.urljoin(url, src)
        download_img(src)
def write_file(html, file):
    """Append *html* to file/<file>.txt and print a completion message.

    NOTE: the second parameter shadows the Python 2 builtin `file`; the
    name is kept so existing callers are unaffected.
    """
    # `with` guarantees the handle is closed even if write() raises;
    # the original leaked the handle on error.
    with open("file/" + file + ".txt", "a") as fsock:
        fsock.write(html)
    print(file + " is OK\n")
def download_img(url):
    """Download *url* with urllib2 and save it as file/nimabi<md5(url)>.jpg.

    The md5 of the URL gives a stable, filesystem-safe file name.
    """
    data = urllib2.urlopen(url).read()
    # hashlib.md5(url) is equivalent to md5() + update(url); the original
    # also called hexdigest() once and threw the result away (no-op, removed).
    name = hashlib.md5(url).hexdigest()
    # `with` closes the output file even if write() raises.
    with open("file/nimabi" + name + ".jpg", "wb") as fk:
        fk.write(data)
    print(url)
# Target URLs for a threaded fetch test; the same handful of sites is
# repeated many times, presumably to generate load.
# NOTE(review): `a` is never referenced in this chunk — the loop below
# iterates `c` instead. Confirm whether this tuple is dead code.
a = ('http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.weituanpin.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.jianshu.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.jianshu.com/',
'http://www.xiachufang.com/',
'http://www.163.com/')
c = ("http://2mn.tv","http://2mn.tv")
# Spawn one worker thread per URL, capped at 3 workers (the cap is currently
# unreachable because `c` holds only two URLs, but it is kept for
# compatibility if the tuple grows).
# BUG FIX: the original never joined the threads, so the elapsed-time report
# at the bottom of the script could print before the downloads finished.
workers = []
for j, i in enumerate(c, 1):
    t = threading.Thread(target=urllibget, args=(i, j))
    t.start()
    workers.append(t)
    if j == 3:
        break
for t in workers:
    t.join()
#https via pycurl -- disable SSL verification so the script does not time out
def testf_https():
    """Fetch an HTTPS 1688.com product page with pycurl and scrape its images.

    SSL peer/host verification is switched off so the transfer does not fail
    on certificate problems.
    """
    page_url = "https://detail.1688.com/offer/528970869962.html?spm=a312h.7841636.1998813769.d_pic_14.Cm06wt&tracelog=p4p"
    sink = StringIO.StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, page_url)
    curl.setopt(pycurl.WRITEFUNCTION, sink.write)
    curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    curl.perform()
    show_pach_https(sink.getvalue(), page_url)
def show_pach_https(html, url):
    """Walk every <img> element in *html*, print and download each src.

    A src that does not contain "http" is assumed to be protocol-relative
    ("//cdn/...") and gets an "http:" scheme prepended.
    (*url* is accepted for signature parity with show_pach but unused.)
    """
    doc = etree.HTML(html)
    body = doc.xpath(u"/html/body")[0]
    for img in body.xpath("//img"):
        src = img.attrib["src"]
        if "http" not in src:
            src = "http:" + src
        print(src)
        download_img_https(src)
def download_img_https(url):
    """Download *url* with pycurl (SSL verification disabled) and save it
    as file/nimabi<md5(url)>.jpg.

    The md5 of the URL gives a stable, filesystem-safe file name.
    """
    sink = StringIO.StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEFUNCTION, sink.write)
    curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    curl.perform()
    # hashlib.md5(url) is equivalent to md5() + update(url); the original
    # also called hexdigest() once and threw the result away (no-op, removed).
    name = hashlib.md5(url).hexdigest()
    # `with` closes the output file even if write() raises.
    with open("file/nimabi" + name + ".jpg", "wb") as fk:
        fk.write(sink.getvalue())
    print(url)
# Report total elapsed wall-clock seconds since the script started
# (see `starttime` at the top of the file).
endtime = datetime.datetime.now()
print (endtime - starttime).seconds
# --- NOTE(review): the lines below are leftover text scraped from the page
# --- this script was copied from (article footer / recommendations); they are
# --- not Python and broke the file. Commented out so the module parses.
# python pycurl lxml threading 抓取数据
# 最后编辑于 :
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
# 推荐阅读更多精彩内容
# - 目标 多线程数据抓取-58同城转转网的二手产品 实作 1. 建立一个项目 新建一个项目58tongcheng1 ...
# - 作者:xlzd 链接:https://zhuanlan.zhihu.com/p/20435541 来源:知乎 著作...
# - 闲来无事,看看了Python,发现这东西挺爽的,废话少说,就是干 准备搭建环境因为是MAC电脑,所以自动安装了Py...
# - Summer 2015 Report #026 - 05/07/15 Jianjian Wu 1. Plan fo...