Python爬取糗事百科

以下是面向过程版的代码:

# Procedural version: download one page, pull the text snippets out of the
# HTML with a regular expression and print them one per line.
# Written for Python 2 (urllib2); the parenthesised single-argument print
# calls below behave exactly like the Python 2 print statement.
import urllib
import urllib2
import re

page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
#url = 'http://www.yllin.cn'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
# Send an explicit User-Agent header with the request.
headers = {'User-Agent': user_agent}

try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        content = response.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()
    # Debug aid: uncomment to dump the raw page.
    # print(content)

    # Capture the body of the first <span> that follows each
    # <div class="content" ...> opening tag; [\s\S] also matches newlines.
    pattern = re.compile(r'<div class="content"[\s\S]+?<span>([\s\S]+?)<\/span>')
    items = re.findall(pattern, content)
    for item in items:
        print(item)
except urllib2.URLError as e:
    # Not every URLError carries both attributes, so probe before printing.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)

面向对象版

import urllib
import urllib2
import re

class QSBK:
    """Download one page and extract the text snippets it contains.

    The page at ``url`` is fetched with the given ``headers``, decoded as
    UTF-8 and searched with a regular expression; each captured snippet can
    then be returned as a list or printed.
    """

    # Class-level fallbacks; __init__ overwrites both on every instance.
    url = ''
    headers = ''

    # Captures the body of the first <span> that follows each
    # <div class="content" ...> opening tag; [\s\S] also matches newlines.
    # Compiled once here instead of on every solve_data() call.
    _CONTENT_PATTERN = re.compile(
        r'<div class="content"[\s\S]+?<span>([\s\S]+?)<\/span>')

    def __init__(self, url, headers):
        """Remember the page address and the HTTP headers to send.

        Args:
            url: Address of the page to download.
            headers: Mapping of HTTP header names to values.
        """
        self.url = url
        self.headers = headers

    def request(self):
        """Open ``self.url`` and return the response object.

        Raises:
            urllib2.URLError: If the page cannot be opened.
        """
        # Read the address from the instance, not from a module-level name,
        # so the object fetches the URL it was constructed with.
        request = urllib2.Request(self.url, headers=self.headers)
        return urllib2.urlopen(request)

    def decode(self):
        """Return the page body decoded from UTF-8 as text."""
        response = self.request()
        try:
            return response.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()

    def solve_data(self):
        """Return the list of snippets captured from the downloaded page."""
        return self._CONTENT_PATTERN.findall(self.decode())

    def print_data(self):
        """Print every extracted snippet on its own line."""
        for item in self.solve_data():
            # Parenthesised single-argument form: identical to the Python 2
            # print statement.
            print(item)
        

# Demo driver for the class above: build the request settings, then fetch
# the page and print what was extracted.
page = 1

# Headers are independent of the URL, so they are set up first.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Page address = fixed prefix + page number.
url = 'http://www.qiushibaike.com/hot/page/' + str(page)

test = QSBK(url, headers)
test.print_data()

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容