爬点小黄图-1-简单无需登录无禁爬虫网页

你知道wanimal么？你知道他拍的小黄图么？
还是很文艺的，喜欢他拍的tits，breast，so beautiful！！！

哈哈，事情并没有想象的那么顺利。今天解决的主要是正则匹配和python的一点知识。真正的障碍在于：wanimal的网页可以爬，但图片服务器不允许直接抓取，也可能是被墙了，所以图片暂时还不能保存到本地，做一个悲伤的表情。这个问题可能要放到下次讲伪装浏览器的时候再解决了。

事实证明是被屏蔽了，开个全局VPN就OK了。

# encoding=utf-8
import urllib;
import urllib.request;
import re;
from collections import deque;

class DownLoadPic:
    """Crawl the paginated wanimal1983.org listing and record ``.jpg`` links.

    Starting from an entry URL, each page is fetched, every ``<img src=...>``
    tag is scanned for an ``http:...jpg`` link, and each link seen for the
    first time is written to a link file (one per line) plus a human-readable
    log.  Images are NOT downloaded by this version; only their links are
    saved.

    All crawl state (queue, visited set, counters, file handles) is kept per
    instance, so several crawlers can coexist without sharing a queue.

    Args:
        url_tmp: Entry page URL; it is the first URL put on the queue.
        cnt_limit: The crawl loop runs while the page counter (which starts
            at 1) is ``<= cnt_limit``.
        log_path: Text file receiving image names/links and failure notes.
        address_path: Text file receiving one image link per line.
    """

    # Next-page URLs are built by appending the page counter to this prefix.
    __PAGE_URL_PREFIX = 'http://wanimal1983.org/page/'
    # Compiled once for the whole class instead of once per crawled page.
    __IMG_TAG = re.compile(r'<img src=.+?>')
    __IMG_LINK = re.compile(r'http:.+?\.jpg')
    __IMG_NAME = re.compile(r'[^/]+\.jpg')
    # Class-level fallbacks so close() is safe even if __init__ failed early.
    __file = None
    __adress = None

    def __init__(self, url_tmp, cnt_limit=99999999, *,
                 log_path='e:/forLook.txt', address_path='e:/adress.txt'):
        self.__url_init = url_tmp
        self.__m_deque = deque([url_tmp])  # pages still to be fetched
        self.__m_visited = set()           # page and image URLs already seen
        self.__m_cnt = 1                   # number of the page being fetched
        self.__page_limit = cnt_limit
        self.__file = open(log_path, 'w', encoding='utf-8')
        self.__adress = open(address_path, 'w', encoding='utf-8')

    def close(self):
        """Close both output files; calling it more than once is harmless."""
        for handle in (self.__file, self.__adress):
            if handle is not None:
                handle.close()

    def __del__(self):
        # Kept so that ``del crawler`` still releases the files, as before.
        self.close()

    def __fetch_page(self, url):
        """Return the UTF-8 decoded body of *url*, or None if it is skipped.

        A page is skipped when it cannot be opened or read, when its
        Content-Type header is missing or does not contain ``html``, or when
        the body is not valid UTF-8.  Open and decode failures are logged.
        """
        try:
            with urllib.request.urlopen(url) as response:
                # A missing header is treated as "not HTML" rather than
                # crashing on ``'html' not in None``.
                content_type = response.headers.get('Content-Type') or ''
                if 'html' not in content_type:
                    return None
                raw = response.read()
        except Exception:
            # Deliberately broad: the crawl is best-effort.  Unlike a bare
            # ``except:`` this still lets Ctrl-C (KeyboardInterrupt) through.
            self.__file.write("网页打开失败--->" + url + "\n")
            return None
        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            self.__file.write("网页解码失败--->" + url + "\n")
            return None

    def DLP(self):
        """Run the crawl until the queue is empty or the page limit is hit.

        The next page is only queued after the current page was fetched
        successfully, so a page that fails to load ends the crawl once the
        queue drains.
        """
        while self.__m_deque and self.__m_cnt <= self.__page_limit:
            cur_url = self.__m_deque.popleft()
            self.__m_visited.add(cur_url)
            print("已经抓取 ", self.__m_cnt, " 页 +++", "当前网页--->", cur_url, "\n")
            self.__m_cnt += 1

            page_data = self.__fetch_page(cur_url)
            if page_data is None:
                continue

            # The counter was already advanced, so it names the next page.
            next_url = self.__PAGE_URL_PREFIX + str(self.__m_cnt)
            self.__m_deque.append(next_url)
            self.__m_visited.add(next_url)

            for tag in self.__IMG_TAG.findall(page_data):
                found = self.__IMG_LINK.search(tag)
                if found is None:
                    continue
                link = found.group(0)
                if link in self.__m_visited:
                    continue  # the same picture was already recorded
                self.__m_visited.add(link)
                name = self.__IMG_NAME.search(link).group(0)
                self.__file.write("图片名字-->" + name + "\n")
                self.__file.write("图片链接-->" + link + "\n")
                print("正在保存图片", name, end="==========\n")
                # Only the link is stored; the picture itself is not fetched.
                self.__adress.write(link + "\n")

# Run the crawler: start at the site's front page with a page limit of 150.
crawler = DownLoadPic("http://wanimal1983.org/", 150)
crawler.DLP()
# Drop the only reference so the instance can be finalized.
del crawler

所以下面这种写法算是更标准的格式：

# encoding=utf-8
import urllib;
import urllib.request;
import re;
from collections import deque;

class DownLoadPic:
    """Crawl the paginated wanimal1983.org listing and download ``.jpg`` files.

    Starting from an entry URL, each page is fetched, every ``<img src=...>``
    tag is scanned for an ``http:...jpg`` link, and each link seen for the
    first time is downloaded into ``save_dir`` under its own file name.
    Names, links and failures are written to a text log.

    Args:
        url_tmp: Entry page URL; it is the first URL put on the queue.
        cnt_start: Initial value of the page counter.  The counter is
            incremented before the next-page URL is built, so the first
            queued follow-up page is ``cnt_start + 1``.
        cnt_limit: The crawl loop runs while the counter is ``<= cnt_limit``.
        log_path: Text file receiving image names/links and failure notes.
        save_dir: Directory that downloaded pictures are written into.
    """

    # Next-page URLs are built by appending the page counter to this prefix.
    __PAGE_URL_PREFIX = 'http://wanimal1983.org/page/'
    # Compiled once for the whole class instead of once per crawled page.
    __IMG_TAG = re.compile(r'<img src=.+?>')
    __IMG_LINK = re.compile(r'http:.+?\.jpg')
    __IMG_NAME = re.compile(r'[^/]+\.jpg')
    # Class-level fallback so close() is safe even if __init__ failed early.
    __file = None

    def __init__(self, url_tmp, cnt_start=0, cnt_limit=99999999, *,
                 log_path='e:/forLook.txt', save_dir='e:/'):
        self.__url_init = url_tmp
        self.__m_deque = deque([url_tmp])  # pages still to be fetched
        # This set was never created before, which made DLP() fail with
        # AttributeError on its very first iteration.
        self.__m_visited = set()
        self.__m_cnt = cnt_start
        self.__page_limit = cnt_limit
        # File names are appended directly, so make sure a separator is there.
        if not save_dir.endswith(('/', '\\')):
            save_dir += '/'
        self.__save_dir = save_dir
        self.__file = open(log_path, 'w', encoding='utf-8')

    def close(self):
        """Close the log file; calling it more than once is harmless."""
        if self.__file is not None:
            self.__file.close()

    def __del__(self):
        # Kept so the log is still released when the instance is discarded.
        self.close()

    def __fetch_page(self, url):
        """Return the UTF-8 decoded body of *url*, or None if it is skipped.

        A page is skipped when it cannot be opened or read, when its
        Content-Type header is missing or does not contain ``html``, or when
        the body is not valid UTF-8.  Open and decode failures are logged.
        """
        try:
            with urllib.request.urlopen(url) as response:
                # A missing header is treated as "not HTML" rather than
                # crashing on ``'html' not in None``.
                content_type = response.headers.get('Content-Type') or ''
                if 'html' not in content_type:
                    return None
                raw = response.read()
        except Exception:
            # Deliberately broad: the crawl is best-effort.  Unlike a bare
            # ``except:`` this still lets Ctrl-C (KeyboardInterrupt) through.
            self.__file.write("网页打开失败--->" + url + "\n")
            return None
        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            self.__file.write("网页解码失败--->" + url + "\n")
            return None

    def __save_image(self, link, name):
        """Download *link* and store it as *name* inside the save directory.

        The picture is fetched completely before the target file is opened,
        so a failed download leaves neither an open handle nor an empty file
        behind.  Failures are logged and do not stop the crawl.
        """
        try:
            with urllib.request.urlopen(link) as response:
                payload = response.read()
        except Exception:
            # Best-effort, same policy as page fetching.
            self.__file.write("图片下载失败--->" + link + "\n")
            return
        try:
            with open(self.__save_dir + name, "wb") as pic_file:
                pic_file.write(payload)
        except OSError:
            self.__file.write("图片保存失败--->" + link + "\n")

    def DLP(self):
        """Run the crawl until the queue is empty or the page limit is hit.

        The next page is only queued after the current page was fetched
        successfully, so a page that fails to load ends the crawl once the
        queue drains.
        """
        while self.__m_deque and self.__m_cnt <= self.__page_limit:
            cur_url = self.__m_deque.popleft()
            self.__m_visited.add(cur_url)
            print("正在抓取 ", self.__m_cnt, " 页 +++", "当前网页--->", cur_url, "\n")
            self.__m_cnt += 1

            page_data = self.__fetch_page(cur_url)
            if page_data is None:
                continue

            # The counter was already advanced, so it names the next page.
            next_url = self.__PAGE_URL_PREFIX + str(self.__m_cnt)
            self.__m_deque.append(next_url)
            self.__m_visited.add(next_url)

            for tag in self.__IMG_TAG.findall(page_data):
                found = self.__IMG_LINK.search(tag)
                if found is None:
                    continue
                link = found.group(0)
                if link in self.__m_visited:
                    continue  # the same picture was already handled
                self.__m_visited.add(link)
                name = self.__IMG_NAME.search(link).group(0)
                self.__file.write("图片名字-->" + name + "\n")
                self.__file.write("图片链接-->" + link + "\n")
                print("正在保存图片", name, end="==========\n")
                self.__save_image(link, name)

# Script entry point: crawl only when this file is executed directly.
if __name__ == '__main__':
    crawler = DownLoadPic("http://wanimal1983.org/", 150)
    crawler.DLP()

爬点小黄图-1-简单无需登录无禁爬虫网页

推荐阅读更多精彩内容