# 简化不必要的细节,现实版的实践?自觉的优化代码o(▽)o
# (Original author's note, commented out — as bare prose it was a Python syntax error.)
# -*- coding: utf-8 -*-
#导入re模块
import re
import urllib.request
from bs4 import BeautifulSoup
# Radical-list pattern: three consecutive %XX percent-escapes, i.e. one
# percent-encoded UTF-8 CJK character as it appears inside an href attribute.
bspattern = re.compile(r"(?:%[^%']{2}){3}")
# Pattern for word-entry page paths like /c/....htm.
# NOTE(review): cipattern is never used below — the innermost loop matches
# an inline /z/ pattern instead; confirm which path prefix was intended.
cipattern = re.compile(r"/c/[^']*?htm")
# --- Crawl zdic.net: radical index -> per-radical character lists ->
# --- per-character word-entry URLs, written one per line to a file. ---

def _fetch(url, referer=None):
    """Fetch *url* and return the response body decoded as UTF-8.

    referer: optional Referer header value (zdic sub-pages expect one).
    The response is closed via the context manager (the original leaked it).
    """
    req = urllib.request.Request(url)
    if referer is not None:
        req.add_header('Referer', referer)
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf8')


INDEX_URL = 'http://www.zdic.net/c/cibs/'

# 1. Fetch the radical (bushou) index page and extract the percent-encoded
#    radical tokens with the module-level regex.
index_page1 = _fetch(INDEX_URL)
bslist = re.findall(bspattern, index_page1)

# Word-entry link pattern, compiled once here instead of inside the
# innermost loop as the original did.
_entry_pattern = re.compile(r"/z/[^']*?\.htm")

# BUG FIX: the original wrote to `outfile` without ever opening it
# (NameError at runtime). Opening a concrete output file here —
# TODO confirm the intended path/filename.
with open('cilist.txt', 'w', encoding='utf8') as outfile:
    # 2. For each radical, fetch its character-list page.
    for bu in bslist:
        print(bu)
        # BUG FIX: the original referenced the undefined name `b` here and
        # also clobbered the loop variable `bu` with the URL string.
        bu_url = "http://www.zdic.net/c/cibs/bs/?bs=" + bu
        index_z = _fetch(bu_url, referer=INDEX_URL)
        zlist = re.findall(bspattern, index_z)

        # 3. For each character under the radical, fetch its word list.
        for z in zlist:
            # findall with this pattern cannot yield an empty match, but
            # the original guarded on it; kept as a harmless safeguard.
            if not z:
                continue
            z_url = "http://www.zdic.net/c/cibs/ci/?z=" + z
            print(z_url)
            index_c = _fetch(z_url, referer=INDEX_URL)
            clist = re.findall(_entry_pattern, index_c)

            # 4. Convert each relative entry path to an absolute URL and
            #    write it out, one per line.
            for uc in clist:
                outfile.write("http://www.zdic.net/" + uc + '\n')