1.正则表达式
学习了简单的正则表达式使用。通过这本书发现,实际爬虫过程中,用正则表达式去匹配内容,并不是非常常用。
xpath更好用一点。
2.简单网页爬虫开发
2.1 用python获取网页源码
requests库
2.2 多线程编码的例子
import re
import requests
import os
from multiprocessing.dummy import Pool
def get_toc(html, base_url=None):
    """Extract the list of chapter URLs from a table-of-contents page.

    Args:
        html: HTML source of the TOC page.
        base_url: Prefix joined onto each relative href. Defaults to the
            module-level ``start_url`` for backward compatibility with the
            original hard-coded behavior.

    Returns:
        A list of absolute chapter URLs.

    Raises:
        IndexError: If the page contains no '正文 ... </tbody>' section.
    """
    if base_url is None:
        base_url = start_url  # original behavior: read the module global
    # re.S makes '.' also match newlines so the TOC block can span lines
    toc_block = re.findall('正文(.*?)</tbody>', html, re.S)[0]
    toc_url = re.findall('href="(.*?)"', toc_block, re.S)
    return [base_url + url for url in toc_url]
def get_article(html):
    """Parse one chapter page into its title and body text.

    The title is the text captured inside the first font tag with
    size="4"; the body is the first <p> ... </p> block, with <br />
    tags flattened into single spaces.

    Returns:
        A ``(chapter_name, text_block)`` tuple of strings.
    """
    # group(1) returns what the first pair of parentheses captured
    name_match = re.search('size="4">(.*?)<', html, re.S)
    body_match = re.search('<p>(.*?)</p>', html, re.S)
    body_text = body_match.group(1).replace('<br />', ' ')
    return name_match.group(1), body_text
def save_file(chapter, article, folder='动物庄园'):
    """Write one chapter to ``<folder>/<chapter>.txt`` encoded as UTF-8.

    Args:
        chapter: Chapter name; used as the file name stem.
        article: Chapter text written as the file's entire content.
        folder: Output directory, created if it does not exist. Defaults
            to the original hard-coded '动物庄园' so existing callers are
            unaffected.
    """
    os.makedirs(folder, exist_ok=True)  # idempotent: no error if present
    path = os.path.join(folder, chapter + '.txt')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(article)
def chapter_get_save(chapter_url):
    """Download one chapter page, parse it, and save it to disk.

    Worker function mapped over the chapter URL list by the thread pool
    in the main script; prints simple progress markers around the save.
    """
    # The site serves GB2312-encoded pages, so decode the raw bytes explicitly
    raw = requests.get(chapter_url).content
    chapter, article = get_article(raw.decode('GB2312'))
    print("get" + chapter + "ok")
    save_file(chapter, article)
    print("save" + chapter + "ok")
# ---- main program ----
# start_url stays at module level because get_toc() reads it as a global.
start_url = 'http://www.kanunu8.com/book3/6879/'

if __name__ == '__main__':
    # Guard the crawl so importing this module does not fire network requests.
    print("begin:")
    html_str = requests.get(start_url).content.decode('GB2312')
    print("get start html ok")
    chapter_url_list = get_toc(html_str)
    print("get chapter url ok")
    # multiprocessing.dummy.Pool is thread-based: suitable for I/O-bound work
    pool = Pool(5)
    pool.map(chapter_get_save, chapter_url_list)
    print("end!")
3.高性能内容解析
xpath和beautifulsoup4
from bs4 import BeautifulSoup
import requests

# Demo: parse a practice page with BeautifulSoup instead of regex.
target_url = 'http://exercise.kingname.info/exercise_bs_1.html'
print('begin get html')
html_str = requests.get(target_url).content.decode('utf-8')
print('end get html')

soup = BeautifulSoup(html_str, 'html.parser')

# The single element carrying class="test"
info = soup.find(class_='test')
print(info.string)

# Every <li> nested under the class="useful" container
for item in soup.find(class_='useful').find_all('li'):
    print(item.string)
4.数据库
mongoDB
redis