# 前一篇文章记录的获取网页内的古诗网址、作者、古诗题目,接下来就是下载古诗了
# (The previous post covered scraping each poem's URL, author and title
#  from the index page; this script downloads the poem texts themselves.)
# coding:utf-8
import urllib2,re
from bs4 import BeautifulSoup
import bs4
def retrive_tangshi_300():
    """Scrape the Tang-poem index page and return a list of poem records.

    Fetches http://www.gushiwen.org/gushi/tangshi.aspx, walks every
    ``<div class="guwencont2">``, and parses each anchor whose text has
    the form ``title(author)``.

    Returns:
        list of dict: each dict has keys 'url' (the anchor's href,
        relative to the site root), 'title' and 'author'.
    """
    url = 'http://www.gushiwen.org/gushi/tangshi.aspx'
    r = urllib2.urlopen(url)
    soup = BeautifulSoup(r.read(), 'html.parser', from_encoding='utf-8')
    # Compile the pattern once, outside the loop, instead of per-anchor.
    pattern = re.compile(r'(.*)\((.*)\)')
    shige_list = []
    for tag in soup.find_all('div', class_='guwencont2'):
        for t in tag.children:
            # Skip NavigableString whitespace nodes between the anchors;
            # isinstance is the correct type check, not type() ==.
            if not isinstance(t, bs4.element.Tag):
                continue
            # t.string is None when a tag has multiple children; guard
            # before matching so pattern.match() doesn't get None and
            # raise TypeError.
            if t.string is None:
                continue
            m = pattern.match(t.string)
            if m:
                shige_list.append({
                    'url': t['href'],
                    'title': m.group(1),
                    'author': m.group(2),
                })
    return shige_list
def load_poem(poems):
    """Fetch one poem's page and attach its body text under 'content'.

    Args:
        poems: dict with at least a 'url' key (path relative to
            http://www.gushiwen.org). Mutated in place.

    Returns:
        The same dict, with 'content' set to the poem body taken from
        the page's ``<p align="center">`` elements.
    """
    u = 'http://www.gushiwen.org' + poems['url']
    r = urllib2.urlopen(u)
    soup = BeautifulSoup(r.read(), 'html.parser', from_encoding='utf-8')
    # find_all() already returns only Tag objects, so the old
    # type(tag) == bs4.element.Tag check was redundant.
    # Bug fix: the original overwrote poems['content'] on every
    # iteration, keeping only the LAST matching <p>; join them all.
    parts = [tag.get_text() for tag in soup.find_all('p', align='center')]
    # Always set 'content' (empty string when nothing matched) so
    # callers don't hit a KeyError when the page layout differs.
    poems['content'] = '\n'.join(parts)
    return poems
if __name__ == '__main__':
r = retrive_tangshi_300()
for i in range(3):
#print r[i]
sg = load_poem(r[i])
print sg['title'],sg['author'],sg['content']
#print r[0]