Python 爬虫基础｜Python网络数据采集笔记

安装BeautifulSoup

*Linux*

# Debian / Ubuntu: install pip for Python 3 (the examples below use Python 3)
$ sudo apt-get install python3-pip
# Red Hat / CentOS: the package is named python3-pip; there is no package called "pip"
$ sudo yum install python3-pip
# Install BeautifulSoup 4 with the pip that belongs to python3
$ python3 -m pip install beautifulsoup4

Windows

安装Windows版本的pip 
>pip install beautifulsoup4

运行BeautifulSoup

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the page and parse it. The parser is named explicitly so the result
# does not depend on which optional parsers happen to be installed.
html = urlopen("http://www.pythonscraping.com/pages/page1.html")
bs0bj = BeautifulSoup(html.read(), "html.parser")
# Print the first <h1> tag found in the document.
print(bs0bj.h1)

可靠的网络连接

from urllib.error import HTTPError
from urllib.request import urlopen

try:
    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
except HTTPError as e:
    print(e)
    # Return a null value, stop the program, or fall back to another plan here.
else:
    # The program continues here. Note: if the except branch above already
    # returned or broke out, this else clause is not needed and this code
    # will not run either.
    pass

示例代码

from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup


def getTitle(url):
    """Return the first <h1> tag inside <body> of the page at url, or None.

    None is returned when the request fails with an HTTPError, when the
    parsed document has no <body> (the attribute lookup then raises
    AttributeError), or when <body> contains no <h1>.
    """
    try:
        html = urlopen(url)
    except HTTPError:
        return None
    try:
        bs0bj = BeautifulSoup(html.read(), "html.parser")
        title = bs0bj.body.h1
    except AttributeError:
        # bs0bj.body was None, so ".h1" could not be looked up on it.
        return None
    return title


title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)

BeautifulSoup的find()和findAll()

BeautifulSoup的find()和findAll() 可能是你最常用的两个函数，借助它们，你可以通过标签的不同属性过滤HTML页面，查找需要的标签组或者单个标签。
BeautifulSoup文档里两者的定义就是这样:

findAll(tag, attributes, recursive, text, limit, keywords)
find(tag, attributes, recursive,text,keywords)
# 95%的时间 只需使用前两个参数: tag和attribute
# findAll 默认是支持递归查找的（recursive 默认值是True）
#文本参数text有点不同，它是用标签的文本内容去匹配，而不是用标签的属性。

处理子标签和其他后代标签

在BeautifulSoup库里，孩子 child和后代 descendant有显著的不同：和人类的家谱一样，子标签就是一个父标签的下一级，而后代标签是指一个父标签下面所有级别的标签。

处理兄弟标签

BeautifulSoup的处理兄弟标签的函数可以让收集表格数据成为简单的事情，尤其是处理带标题行的表格
next_siblings 属性
next_sibling 属性
previous_sibling 属性
previous_siblings 属性
它们之间的区别只是返回单个标签和返回一组标签的区别。

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj = BeautifulSoup(html, "html.parser")

# Print every node that follows the first <tr> of the table with id "giftList".
for sibling in bs0bj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)

# From the image, go up to its parent, step to that parent's previous
# sibling, and print the sibling's text.
print(bs0bj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

解析JSON数据

import json
from urllib.request import urlopen
def getCountry(ipAddress):
    """Return the 'country_code' field of the JSON served for ipAddress.

    The address is appended to the freegeoip.net JSON endpoint, the response
    body is decoded as UTF-8 and parsed as JSON. Returns None when the parsed
    object has no 'country_code' key.
    """
    response = urlopen("http://freegeoip.net/json/" + ipAddress).read().decode('utf-8')
    responseJson = json.loads(response)
    return responseJson.get('country_code')


print(getCountry("50.58.253.58"))

import json
# A single-quoted literal cannot span lines; adjacent literals inside the
# parentheses are joined into one string instead.
jsonString = ('{"arrayOfNums":[{"number":0},{"number":1},{"number":2}],'
              '"arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}')
json0bj = json.loads(jsonString)

# The whole list stored under "arrayOfNums"
print(json0bj.get("arrayOfNums"))
# The second element of that list (a dict)
print(json0bj.get("arrayOfNums")[1])
# Sum of the "number" values of the second and third elements
print(json0bj.get("arrayOfNums")[1].get("number") + json0bj.get("arrayOfNums")[2].get("number"))
# The "fruit" value of the third element of "arrayOfFruits"
print(json0bj.get("arrayOfFruits")[2].get("fruit"))

存储数据

下载小文件

# Method 1: use the urllib library
# -*- coding:utf-8 -*-
import time
import urllib.request

url = 'http://mvideo.spriteapp.cn/video/2017/0414' \
      '/697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
print("downloading with urllib")
start = time.time()
# In Python 3, urlretrieve lives in urllib.request (Python 2: urllib.urlretrieve).
urllib.request.urlretrieve(url, "video.mp4")
end = time.time()
print('Finish in ：', end - start)

# Method 2: use the urllib2 library
# (Python 2's urllib2 was merged into urllib.request in Python 3)
# -*- coding:utf-8 -*-
import time
import urllib.request

url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
      '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'

print("downloading with urllib2")
start = time.time()
# Read the whole body into memory; the with-block closes the connection.
with urllib.request.urlopen(url) as response:
    data = response.read()
with open('video.mp4', 'wb') as video:
    video.write(data)
end = time.time()
print('Finish in ：', end - start)

# Method 3: use the requests library
# -*- coding:utf-8 -*-
import requests
import time

url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
      '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'

print("downloading with requests")
start = time.time()
# Without stream=True the whole body is downloaded before get() returns.
r = requests.get(url)
with open('video.mp4', 'wb') as video:
    video.write(r.content)
end = time.time()
print('Finish in ：', end - start)

下载大文件

# Method 1: use the urllib2 library
# (Python 2's urllib2 was merged into urllib.request in Python 3)
# -*- coding:utf-8 -*-
import time
import urllib.request

url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
      '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'

r = urllib.request.Request(url)
u = urllib.request.urlopen(r)
start = time.time()
# The payload is binary, so the file must be opened in 'wb' mode.
# Both the response and the file are closed when the with-block exits.
with u, open('video.mp4', 'wb') as f:
    while True:
        # Copy 1024 bytes at a time so the whole file is never held in memory.
        tmp = u.read(1024)
        if not tmp:
            break
        f.write(tmp)
end = time.time()
print('Finish in ：', end - start)

# Method 2: use the requests library
# -*- coding:utf-8 -*-
import requests
import time
url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
      '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
# With stream=False, get() starts downloading immediately and keeps the file
# in memory; a very large file may exhaust memory.

# With stream=True, get() does not download the body right away; the download
# starts when the content is iterated with iter_content / iter_lines or when
# the content attribute is accessed.
r = requests.get(url, stream=True)
# The with-block closes both the streamed response and the output file,
# even if the download fails part-way.
with r, open("file_path", "wb") as f:
    start = time.time()
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f.write(chunk)
# iter_content: iterate over the content to download chunk by chunk
# iter_lines: iterate over the content to download line by line
# Both avoid using too much memory on large files, because only a small
# part of the data is downloaded at a time.
end = time.time()
print('Finish in ：', end - start)

把数据存储到CSV

import csv
# newline='' lets the csv module control line endings itself; without it,
# extra blank lines can appear between rows on Windows.
with open("../files/test.csv", 'w+', newline='') as csvFile:
    writer = csv.writer(csvFile)
    # Header row, followed by ten data rows.
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i+2, i*2))

import csv 
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
bs0bj = BeautifulSoup(html, "html.parser")
# Take the first table on the page that has class "wikitable".
tables = bs0bj.findAll("table", {"class": "wikitable"})[0]
rows = tables.findAll("tr")

with open("../files/editors.csv", 'wt', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # Collect the text of every data/header cell, then write the row once.
        csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(csvRow)

读取CSV文件

手动把csv文件下载到本地，然后用python定位文件位置
写python程序下载文件，读取之后再把源文件删除
从网上直接把文件读成一个字符串，然后转换成一个StringIO对象，使它具有文件的属性

from urllib.request import urlopen
from io import StringIO
import csv

# Download the CSV as text (non-ASCII bytes are dropped) and wrap it in a
# StringIO so csv.reader can treat it like an open file.
data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore')
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)

# Each row is a list of the column values.
for row in csvReader:
    print(row)

from urllib.request import urlopen
from io import StringIO
import csv

# Download the CSV as text (non-ASCII bytes are dropped) and wrap it in a
# StringIO so csv.DictReader can treat it like an open file.
data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore')
dataFile = StringIO(data)
dictReader = csv.DictReader(dataFile)

# Print the header row (the column names)
print(dictReader.fieldnames)

# Each row is a mapping from column name to value.
for row in dictReader:
    print(row)

最后编辑于：2017.12.07 06:47:02

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 216,287评论 6赞 498
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 92,346评论 3赞 392
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 162,277评论 0赞 353
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 58,132评论 1赞 292
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 67,147评论 6赞 388
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 51,106评论 1赞 295
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 40,019评论 3赞 417
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 38,862评论 0赞 274
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 45,301评论 1赞 310
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 37,521评论 2赞 332
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 39,682评论 1赞 348
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 35,405评论 5赞 343
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 40,996评论 3赞 325
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 31,651评论 0赞 22
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 32,803评论 1赞 268
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 47,674评论 2赞 368
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 44,563评论 2赞 352