本脚本的目的是把图片中那个菜单的所有文字内容(包括二级菜单等)全部提取出来,同时提取其中的所有超链接,最后整合成一个 CSV 文件。
这里用到了 Python 的字符串分割与拼接:爬取到的部分字符串中会夹带 \n 或大量空格,所以需要先对字符串进行分割提取,再重新整合拼接。
import os
from lxml import etree
import pandas as pd
import csv
import codecs
# Scrape the accordion menu of every saved HTML page under ./htmldata/ and
# write one CSV row per menu link: (page id, button name, link text, link URL).

# Directory holding the saved HTML pages.
HTML_DIR = './htmldata/'
# Absolute XPath of the text node that is used as the page id.
ID_XPATH = ('/html/body/div[2]/div[3]/div[1]/div[2]/div[3]/div[1]/div[1]'
            '/div[2]/span/span/span[2]/text()')
# One element per accordion button (panel).
BUTTONS_XPATH = '//*[@id="accordion"]/div[@class="panel panel-default"]'
# Relative to a button: the <a> in its heading, and the body holding its links.
HEADING_ANCHOR_XPATH = './div[@class="panel-heading mim-panel-heading"]/span/span/a'
BODY_XPATH = './div[2]/div[@class="panel-body small mim-panel-body"]'


def _button_name(button):
    """Return the display name of one accordion button.

    The name is taken from the second text node of the heading anchor with
    newlines and spaces removed. When that is empty (or the anchor has fewer
    than two text nodes) the first text node of the nested ``div/div[2]`` is
    returned unmodified instead. Returns ``''`` when neither location yields
    any text, rather than raising ``IndexError``.
    """
    texts = button.xpath(HEADING_ANCHOR_XPATH + '/text()')
    name = texts[1] if len(texts) > 1 else ''
    name = name.replace('\n', '').replace(' ', '')
    if name:
        return name
    nested = button.xpath(HEADING_ANCHOR_XPATH + '/div/div[2]/text()')
    return nested[0] if nested else ''


def _first_link(info):
    """Return ``(text, href)`` of the first child ``<a>`` of *info*.

    Returns ``None`` when there is no ``<a href=...>`` child at all; the text
    is ``''`` when the anchor has no direct text node.
    """
    hrefs = info.xpath('./a/@href')
    if not hrefs:
        return None
    names = info.xpath('./a/text()')
    return (names[0] if names else ''), hrefs[0]


def _link_rows(button):
    """Yield ``(info_name, info_url)`` for every link listed under *button*.

    Links sit either in ``<div>`` or in ``<dd>`` children of the panel body,
    so both are walked: all ``<div>`` entries first, then all ``<dd>`` entries.
    """
    for info in button.xpath(BODY_XPATH + '/div'):
        # <div> entries that carry an id attribute are skipped.
        if info.xpath('./@id'):
            continue
        link = _first_link(info)
        # startswith() also copes with an empty href, where indexing [0]
        # would raise IndexError; '#...' links are in-page anchors and skipped.
        if link is None or link[1].startswith('#'):
            continue
        yield link
    for info in button.xpath(BODY_XPATH + '/dd'):
        link = _first_link(info)
        if link is not None:
            yield link


# newline='' is what the csv module documents for output files: the writer
# emits its own line terminators and no newline translation must be applied.
# The with-blocks guarantee both the CSV and every HTML file get closed, even
# when parsing a page raises.
with open('csvData.csv', 'w', encoding='utf-8', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(["id", "button_name", "info_name", "info_url"])  # header row
    for filename in os.listdir(HTML_DIR):
        with open(os.path.join(HTML_DIR, filename), 'r', encoding='utf-8') as f:
            selector = etree.HTML(f.read())
        # Every row needs the page id, so a page without it still fails fast
        # with IndexError instead of producing rows that cannot be attributed.
        page_id = selector.xpath(ID_XPATH)[0]
        for button in selector.xpath(BUTTONS_XPATH):
            button_name = _button_name(button)
            for info_name, info_url in _link_rows(button):
                writer.writerow([page_id, button_name, info_name, info_url])