# Scrape detailed explanations, Kangxi dictionary, Shuowen Jiezi, and glyph-evolution info
#---------------------------------------
# 抓取其余字典信息,保存
#---------------------------------------
import urllib.request
import re
from bs4 import BeautifulSoup
import codecs
# Fetch a single character page and save its dictionary sections
def scrapt(zurl):
    """Fetch one zdic.net character page and append its dictionary
    sections to the global output file ``hdfile``.

    zurl: page URL whose second-to-last path/extension component is the
          hexadecimal Unicode code point of the character
          (e.g. ``.../4e2d.htm``).
    """
    print(zurl)
    req = urllib.request.Request(zurl)
    # zdic rejects requests without a Referer / with the default urllib UA.
    req.add_header('Referer', 'http://www.zdic.net/z/jbs/')
    req.add_header('User-Agent', 'None')
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as responseb:
        index_z = responseb.read().decode('utf8')
    # Parse the page; each dictionary section (detailed explanation,
    # Kangxi, Shuowen, ...) lives in an element with class "tab-page".
    soup = BeautifulSoup(index_z, "html.parser")
    tab_page = soup.find_all(attrs={'class': 'tab-page'})
    # Extract the hex code point from the URL (".../<hex>.htm").
    keyq = re.split(r'[/.]', zurl)[-2]
    print(keyq)
    if len(keyq) > 4:
        # Presumably drops a leading marker digit for long codes so the
        # remainder is a 4-digit hex value — TODO confirm URL scheme.
        keyq = keyq[1:]
        print(keyq)
    # Convert the hex code point to the character itself.  chr(int(..., 16))
    # replaces the original b'\u' + unicode-escape trick, which depended on
    # an invalid escape sequence in a bytes literal.
    key = chr(int(keyq, 16))
    print(key)
    # Tag every section with the character so the output is searchable,
    # then append its serialized HTML to the output file.
    for tab_page_item in tab_page:
        tab_page_item['key'] = key
        hdfile.write(str(tab_page_item) + '\n')
# ---------------------------------------
# Driver: open the output file, scrape every URL listed in the address
# file, and wrap everything in a single <xml> element.
# ---------------------------------------
hdfile = codecs.open("hdkangxizidian", "w", 'utf-8')
hdfile.write("<xml name='汉典康熙字典'>")
try:
    # One URL per line in the address file; skip blank lines.
    with open('zurlkangxi', 'r') as infile:
        for zurl in infile.read().split('\n'):
            if len(zurl) != 0:
                scrapt(zurl)
finally:
    # Close the XML wrapper and the output file even if a fetch fails,
    # so a partial run still produces well-formed output.
    hdfile.write("</xml>")
    hdfile.close()