爬虫编写中常用的cv代码


title: 爬虫编写中常用的cv代码
tags: ['scrapy','python','time','request']
date: 2021-06-20
categories: "搬砖"


CV 用得好,天天下班早(想得美)

时间格式

# Forum/BBS pages often show relative dates; the absolute timestamp usually
# sits in the <span @title> attribute, with the visible text as a fallback.
pubtime_string = response.xpath("//div[@class='time']/span/@title").get()
if pubtime_string is None:
    pubtime_string = response.xpath("string(//div[@class='time'])").get()
# Pull "YYYY年MM月DD日[spaces]HH:MM" out of the text and normalize it to
# "YYYY-MM-DD HH:MM".  Capturing date and time separately tolerates any
# amount of whitespace between them (the old single-pattern findall allowed
# \s* but strptime then demanded exactly one space and crashed otherwise).
# BUG FIX: the original additionally ran re.search for "YYYY-MM-DD HH:MM"
# on this value, which could never match the 年月日 form just extracted and
# raised AttributeError on .group(); that line is removed.
m = re.search(r"(\d{4}年\d{2}月\d{2}日)\s*(\d{2}:\d{2})", str(pubtime_string))
pubtime = datetime.datetime.strptime(
    m.group(1) + " " + m.group(2), u"%Y年%m月%d日 %H:%M"
).strftime("%Y-%m-%d %H:%M")

item['pubtime'] = pubtime

# 其他格式的时间格式化成需要的格式
pubtime = datetime.datetime.strptime(pubtime, "%Y年%m月%d日 %H:%M").strftime("%Y-%m-%d %H:%M")
datetime.datetime.strptime(pubtime, "%Y-%m-%d %H:%M").strftime("%Y-%m-%d %H:%M")

# 只有日期补全格式
 + time.strftime(' %H:%M')
    
# 去除空格和换行符    
pubtime = response.xpath("//div[@class='noticepubtime bshare-custom']/text()").get().replace('\r','').replace('\n','').strip()

时间戳格式转化

        timeStamp = data['createTime']
        item['pubtime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime(float(timeStamp)))

需要代理的爬虫制式

 # 爬虫文件
    def __init__(self, start_url=None):
        """Build post_params from an optional '|'-separated job list.

        Each job line is "<url> <channel> <channel_code>" separated by
        whitespace; when no start_url is given, self.start_urls is used.
        """
        super(JuronggovSpider, self).__init__()
        job_lines = start_url.split('|') if start_url else self.start_urls
        for line in job_lines:
            fields = line.split()
            self.post_params.append({
                "url": fields[0],
                'job': line,
                "channel": fields[1],
                "channel_code": fields[2],
            })
        # Pre-fetch a proxy pool once; start_requests picks one at random.
        self.proxy_list = self.configure.get_proxy_from_api("common", 100, 'zm')

    def start_requests(self):
        """Issue one POST per configured channel, each through a random proxy."""
        for channel in self.post_params:
            post_body = "wlwzTypeId=%s" % channel['channel_code']
            chosen_proxy = random.choice(self.proxy_list) if self.proxy_list else None
            request_meta = {
                "channel": channel["channel"],
                "proxy": chosen_proxy,
                "start_hash": self.configure.interface.get_start_hash(channel["job"]),
            }
            yield Request(url=channel['url'], method='post', body=post_body,
                          headers=self.headers, meta=request_meta,
                          callback=self.parse_link)
            
# 后续在meta中传递
self.proxy_list = self.configure.get_proxy_from_api("common", 100, 'zm')
proxy = random.choice(self.proxy_list) if self.proxy_list else None
,"proxy":proxy
,"proxy":response.meta['proxy']
# special_config
class SpecialConfig():
    """Per-spider configuration: interface client plus a static proxy pool."""

    interface = base_config.InterfaceServe()

    @staticmethod
    def get_proxy_from_api(href=None, tp=None, response=None):
        # Hard-coded pool pasted from one live run of the real proxy API
        # (see the note below: "运行一次,将代理ip复制到这边").  Arguments are
        # accepted only for signature compatibility and are ignored.
        return ['http://121.207.92.119:50269', 'http://117.94.182.206:39792', 'http://183.154.51.253:40689']

# 运行一次,将代理ip复制到这边

商情type修正

    @staticmethod
    def get_type(title):
        if u"中标" in title or u"结果" in title or u"成交" in title or u"流标" in title or u"废标" in title or u"合同" in title:
            flag = "RN"
        elif u"变更" in title or u"答疑" in title or u"澄清" in title or u"更正" in title:
            flag = "CN"
        elif u"预审" in title:
            flag = "PF"
        else:
            flag = "PN"
        return flag
    
    # 使用
    item['type'] = self.get_type(item['title'],item['type'])
    # 加强版
    @staticmethod
    def get_type(title,type):
        if u"中标" in title or u"结果" in title or u"成交" in title or u"流标" in title or u"废标" in title or u"合同" in title:
            flag = "RN"
        elif u"变更" in title or u"答疑" in title or u"澄清" in title or u"更正" in title:
            flag = "CN"
        elif u"预审" in title:
            flag = "PF"
        elif u'招标' in title or u'采购' in title or u'谈判' in title or u'询价' in title or u'单一来源' in title\
                or u'最高限价' in title or u'磋商' in title:
            flag = "PN"
        else:
            flag = type
        return flag

3天前 2天前 昨天 3小时前 49分钟前 2021年5月12日

    @staticmethod
    def format_time(time_string):
        curr_time = datetime.datetime.strptime(time.strftime("%Y-%m-%d %H:%M"), "%Y-%m-%d %H:%M")
        pubtime = curr_time
        print time_string
        if u"3天前" in time_string:
            delta = datetime.timedelta(days=3)
            pubtime = curr_time - delta
        elif u"2天前" in time_string:
            delta = datetime.timedelta(days=2)
            pubtime = curr_time - delta
        elif u"昨天" in time_string:
            delta = datetime.timedelta(days=1)
            pubtime = curr_time - delta
        elif u"年" in time_string:
            pubtime = re.findall("\d{4}年\d{2}月\d{2}日",str(time_string))[0]
            pubtime = datetime.datetime.strptime(pubtime, "%Y年%m月%d日").strftime("%Y-%m-%d") + time.strftime(' %H:%M')
            return pubtime
        else:
            if u"分钟" in time_string:
                minute = re.findall("\d*",time_string)[0]
                delta = datetime.timedelta(minutes=int(minute))
                pubtime = curr_time - delta
            elif u"小时" in time_string:
                minute = re.findall("\d*",time_string)[0]
                delta = datetime.timedelta(hours=int(minute))
                pubtime = curr_time - delta
        return pubtime.strftime("%Y-%m-%d %H:%M")

从标题筛选出商情信息

            if u"采购" in title or u"成交" in title or u"询价" in title or u"中标" in title or u"招标" in title \
                    or u"延期" in title or u"变更" in title or u"答疑" in title or u"澄清" in title or u"更正" in title \
                     or u"流标" in title or u"废标" in title or u"合同" in title or u"预审" in title \
                    or u"磋商" in title or u"单一来源" in title or u"最高限价" in title or u"中标候选人" in title:

需要带着Cookie访问的

    def parse_link(self, response):
        """Collect Set-Cookie values from the listing response and carry them
        into every detail-page request.
        """
        cookie = {}
        # NOTE(review): under py3 scrapy, getlist returns bytes — this snippet
        # targets py2 where they are str.  Confirm before reuse.
        for raw in response.headers.getlist('Set-Cookie'):
            # Only the leading "name=value" pair matters; attributes after ';'
            # (Path, Expires, ...) are dropped.  BUG FIX: split on the FIRST
            # '=' so values that themselves contain '=' (e.g. base64 padding)
            # are kept intact instead of being truncated.
            name, _, value = raw.split(';')[0].partition('=')
            cookie[name] = value
        # Extra cookie the site requires on detail pages.
        cookie['ItDoor'] = 'wdxl'
        for link in response.xpath("//li[@class='news-name']/a"):
            href = link.xpath("./@href").get()
            final = urlparse.urljoin(response.url, href)
            yield Request(url=final, method='get', headers=self.headers,
                          cookies=cookie, callback=self.parse_item)

判断是否是商情信息

            if u"中标" in title or u"结果" in title or u"成交" in title or u"流标" in title or u"废标" in title or u"合同" in title or\
                u"变更" in title or u"答疑" in title or u"澄清" in title or u"更正" in title or u"预审" in title or u"标" in title or\
                u"招标" in title or u"采购" in title or u"谈判" in title or u"询价" in title or u"磋商" in title or u"单一来源" in title or\
                u"最高限价" in title or u"资格" in title:
                    
                    
                    
                    
SQ_list=['中标','结果','成交','流标','废标','合同','变更','答疑','澄清','更正','预审','标','招标','采购','谈判','询价','磋商','单一来源','最高限价','资格']
        
  @staticmethod
    def is_need(title):
        if u"采购" in str(title) or u"成交" in str(title) or u"询价" in str(title) or u"中标" in str(title) or u"招标" in str(title) \
                or u"延期" in str(title) or u"变更" in str(title) or u"答疑" in str(title) or u"澄清" in str(title) or u"更正" in str(title) \
                or u"流标" in str(title) or u"废标" in str(title) or u"合同" in str(title) or u"预审" in str(title) or u"标" in str(title)\
                or u"磋商" in str(title) or u"单一来源" in str(title) or u"最高限价" in str(title) or u"中标候选人" in str(title) :
            return True
        else:
            return False

拼接网页

 @staticmethod
    def get_content(jsonbody):
        content = u"""
<div>
    <tr>
    <td>来信人:</td>
    <td>{}</td>
    <td>来信日期:</td>
    <td>{}</td>
  </tr><br>
    <tr>
    <td>类型:</td>
    <td>{}</td>
    <td>编号:</td>
    <td>{}</td>
  </tr><br>
    <tr>
    <td>来信内容:</td>
    <td>{}</td>
  </tr><br>
    <tr>
    <td>办理进程:</td>
    <td>{}</td>
  </tr><br>
    <tr>
    <td>答复单位:</td>
    <td>{}</td>
  </tr><br>
    <tr>
    <td>答复日期:</td>
    <td>{}</td>
  </tr><br>
    <tr>
    <td>答复内容:</td>
    <td>{}</td>
  </tr><br>
</div>
            """.format(jsonbody['fromName'],jsonbody['createTime'][:10],jsonbody['objectiveType'],
                       jsonbody['serialNumber'],jsonbody['content'],jsonbody['status'],
                       jsonbody['replyContents'][0]['allName'],
                       jsonbody['replyContents'][0]['replyTime'][:10],
                       jsonbody['replyContents'][0]['replyContent'],)
        return content

翻页


        # Pagination fragment: keep requesting the next page while the last
        # row on the current page is still dated today.
        # NOTE(review): bare `print` statements are Python 2 syntax — this
        # snippet targets py2.
        last_date = data_list[-1]['publishtime'][:10]
        # Date (YYYY-MM-DD) of the last entry on the current page.
        last_date = datetime.datetime.strptime(last_date, "%Y-%m-%d")
        # Today's date, truncated to midnight so subtraction yields whole days.
        curr_time = datetime.datetime.strptime(time.strftime("%Y-%m-%d"),"%Y-%m-%d")
        # Day gap used to decide whether to keep paging.
        day = (curr_time - last_date).days
        print last_date
        print curr_time
        print day
        if day < 1:
            # Still on today's data: build and request the next page URL by
            # bumping the numeric part of "...<n>.shtml".
            next_page = response.meta['page'] + 1
            next_page_url = re.sub("\d+.shtml",str(next_page) + ".shtml",response.url)
            yield Request(url=next_page_url, method='get', headers=self.headers, dont_filter=True,
                          meta={"channel": response.meta["channel"], "start_hash": response.meta['start_hash'],
                                "page": next_page},
                          callback=self.parse_link)
            # Paging hint (debug, intentionally left commented out)
            # print "翻页,打开第%s页" % str(next_page)

时间戳转换格式

  • 时间戳转标准

    def time_format(timestamp):
        """Convert a millisecond epoch timestamp to local 'YYYY-MM-DD HH:MM'."""
        seconds = float(int(timestamp) / 1000)
        moment = datetime.datetime.fromtimestamp(seconds)
        return moment.strftime("%Y-%m-%d %H:%M")
    # 如果是精确到毫秒的,pubtime要除1000
    datetime.datetime.fromtimestamp(float(int(pubtime))).strftime("%Y-%m-%d %H:%M")
    

header 格式化

#!/usr/bin/python
# -*- coding: utf-8 -*-
import hashlib
import json
aaaaa ="""
Host: szdmobile.suzhou.gov.cn
X-API-TIMESTAMP: 1626249118516wjDZfe
User-Agent: su zhou dao/1.6.0 (iPhone; iOS 14.6; Scale/2.00)
cityName: 
deviceName: six
deviceCode: iPhone12,8
X-API-SIGNATURE: OTQzY2E3ZGVkMDBhN2ViOTQ3YjkwMWI0NDZmYzQ1MTVmMzdmNGY2OQ==
appVersion: 1.6.0
latitude: 31.305387
accessToken: 65705a0cb0244f1fa3ad7766540a73c1
system: ios
version: 14.6
manufacturer: Apple
deviceId: E2E6A424-5EB5-4C44-A896-2788EB37136F
sign: 2xsSTEb8o/w=
Connection: keep-alive
X-AUTH-TYPE: sha1
longitude: 120.591694
Accept-Language: zh-CN
network: WIFI
Accept: */*
Accept-Encoding: gzip, deflate, br
X-API-KEY: eecca5b6365d9607ee5a9d336962c534
X-API-VERSION: 1.6.0
registrationID: 141fe1da9e7db3c637a"""


def format_header(header_str):
    """Parse a raw copied header blob into a {name: value} dict.

    Input is one "Name: value" pair per line; blank lines are skipped.
    Splitting on the FIRST ': ' keeps values that themselves contain
    ': ' intact.

    BUG FIX: the original built the dict, printed it (Python 2 print),
    then returned the untouched input string; it now returns the parsed
    dict.  It also shadowed the builtin `str` and raised IndexError on a
    line without ': '.
    """
    headers = {}
    for line in header_str.split("\n"):
        if line:
            name, _, value = line.partition(': ')
            headers[name] = value
    return headers

format_header(aaaaa)

不同的内容 兼容性

@staticmethod
    def get_detail_info(response):
        """Try known page layouts in order and return (title, pubtime, content)
        from whichever layout matches the response.

        NOTE(review): the loop does not break on the first match, so when
        several layouts match, the LAST matching one wins — confirm this is
        intended.  pubtime parsing raises IndexError when the page's date is
        not exactly "YYYY年MM月DD日 HH:MM".  The decorator/def indentation is
        inconsistent in the original snippet.
        """
        pubtime = ''
        title = ''
        content = ''
        # Layout 1: title in a 'biaoti' div, date in its following sibling,
        # body in the last 'zhenwen' div.
        type1 = {
            'pubtime':response.xpath("string(//div[contains(@class,'biaoti')]//following-sibling::div[1])"),
            'title':response.xpath("//div[contains(@class,'biaoti')]"),
            'content':response.xpath("//div[contains(@class,'zhenwen')][last()]")
        }
        # Layout 2: 'left-time' date div with an h1 title inside 'content'.
        type2 = {
            'pubtime': response.xpath("string(//div[@class='left-time'])"),
            'title': response.xpath("//div[@class='content']/h1/text()"),
            'content': response.xpath("//div[@class='content']")
        }
        type_list = []
        type_list.append(type1)
        type_list.append(type2)
        for type in type_list:
            # A non-empty title selector means this layout exists on the page.
            if type['title']:
                pubtime = type['pubtime'].get().strip().replace('\r','').replace('\n','')
                # Normalize "YYYY年MM月DD日 HH:MM" to "YYYY-MM-DD HH:MM".
                pubtime = re.findall("\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}",str(pubtime))[0]
                pubtime = datetime.datetime.strptime(pubtime, "%Y年%m月%d日 %H:%M").strftime("%Y-%m-%d %H:%M")
                title = type['title'].get()
                content = type['content'].get()
        return title,pubtime,content

视频插入content

"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
<video width="600" height="450"  preload="auto" controls>
    <source src="{}" type="video/mp4">
</video>
</body>
</html>""".format(data['video_url'])

request访问时候使用代理的格式

requests.request("POST", url, data=post_data, headers=headers10, cookies=cookies,proxies={'http': proxy, 'https': proxy.replace("http","https")})
       
requests.request("POST", url, data=post_data, headers=headers10, cookies=cookies,proxies={'http': proxy, 'https': proxy})
       
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容