# 首页菜单栏解析 (home page menu-bar parsing)
# -*- coding: utf-8 -*-
#爬虫阳光电影网
from urllib.parse import urljoin

from lxml import etree
import requests
# Base URL of the site; hrefs scraped from the menu are resolved against it.
url = 'http://www.ygdy8.com'

# Seconds to wait on each HTTP request. requests applies no timeout by
# default, so without this a stalled connection would block the script forever.
REQUEST_TIMEOUT = 10

# Fetch the home page.
req = requests.get(url, timeout=REQUEST_TIMEOUT)
status_code = req.status_code

# Set the decoding used for the page before reading .text.
req.encoding = 'gb2312'
# Decoded page source (.text is the decoded string, .content the raw bytes).
html = req.text
selector = etree.HTML(html)

# Menu-bar links: the <a> children of the first nine <li> items of the menu
# list (position()<10); later items are intentionally left out by the query.
infros = selector.xpath(
    '//div[@id="menu"]/div[@class="contain"]/ul/li[position()<10]/a'
)

for info in infros:
    menu_url_1 = info.xpath('@href')
    menu_name_1 = info.xpath('text()')

    # Skip anchors that have no text, and anchors without an href (indexing
    # an empty href list would otherwise raise IndexError).
    if not menu_name_1 or not menu_url_1:
        continue
    # This menu entry is deliberately skipped by the script.
    # NOTE(review): presumably it is an aggregate page that duplicates the
    # other categories - confirm.
    if menu_url_1[0] == '/html/gndy/index.html':
        continue

    # Resolve the href against the base URL, e.g.
    # '/html/gndy/dyzz/index.html' -> 'http://www.ygdy8.com/html/gndy/dyzz/index.html'.
    # urljoin also copes with absolute hrefs and hrefs without a leading '/',
    # which plain string concatenation would turn into invalid URLs.
    menu_url = urljoin(url, menu_url_1[0])
    menu_name = menu_name_1[0]

    # Fetch the category page behind this menu entry and decode it the same
    # way as the home page.
    req2 = requests.get(menu_url, timeout=REQUEST_TIMEOUT)
    req2.encoding = 'gb2312'
    html2 = req2.text