import requests
def sinaHtml():
    """Fetch the Sina home page and print its HTML as readable text.

    The response encoding is set to UTF-8 before ``response.text`` is read;
    without that, the Chinese text in the page comes out garbled.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    url = 'https://www.sina.com.cn/'
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=10)
    # Must be set before accessing .text, which decodes using this value.
    response.encoding = 'utf-8'
    print(response.text)


sinaHtml()
打印出来的 response.text 里,中文部分是乱码,例如:
<meta name="keywords" content="�浪,�浪�,SINA,sina,sina.com.cn,�浪�页,��,�讯" />
<meta name="description" content="æ�°æµªç½�为å
¨ç��ç�¨æ�·24å°�æ�¶æ��ä¾�å
¨é�¢å��æ�¶ç��ä¸æ��èµ�讯ï¼�å�
容è¦�ç��å�½å�
å¤�çª�å��æ�°é�»äº�件ã��ä½�å��èµ�äº�ã��娱ä¹�æ�¶å°�ã��产ä¸�èµ�讯ã��å®�ç
解决方案:在 requests.get() 之后、读取 response.text 之前,加上下面这一行即可(显式指定用 UTF-8 解码,而不是让 requests 自己猜测编码)
response.encoding = 'utf8'
另外,整体左移快捷方式为
shift + tab
需求:取出url、title
一开始是把链接和标题分别取出、存到两个数组里,然后再一一对应地显示;但结果两者并不对应——有些链接没有标题,也就是说这些链接在页面上永远找不到。
def parse_html(self, selector):
    """Return the text nodes of every ``<a>`` directly inside an ``<li>``.

    Args:
        selector: Object exposing ``xpath(expr)`` (presumably a parsed
            lxml tree -- confirm against the caller).

    Returns:
        Whatever the XPath query yields, or an empty list if the query
        raises, so callers can always iterate the result.

    Note:
        ``text()`` only matches text nodes that are direct children of the
        anchor. Anchors without one contribute nothing, so the result is
        not guaranteed to line up index-by-index with ``extract_url``.
    """
    import logging

    try:
        return selector.xpath('//li/a/text()')
    except Exception:
        # Best effort: record the failure instead of hiding it, and fall
        # back to an empty list rather than a placeholder string.
        logging.getLogger(__name__).exception("XPath title query failed")
        return []
def extract_url(self, select):
    """Return the ``href`` attribute values of ``<a>`` tags inside ``<li>``.

    Args:
        select: Object exposing ``xpath(expr)``.

    Returns:
        The result of querying ``//li/a/@href`` on ``select``.
    """
    return select.xpath("//li/a/@href")
def get_item(self, select):
    """Return the ``<a>`` nodes found directly inside ``<li>`` elements.

    The matched nodes are also printed, as a debugging aid.

    Args:
        select: Object exposing ``xpath(expr)``.

    Returns:
        The result of querying ``//li/a`` on ``select``.

    NOTE(review): if these are lxml elements (the printed repr suggests
    so), each link's target should be readable with ``node.get('href')``
    and its direct text with ``node.text`` -- confirm against the parser
    actually in use.
    """
    anchors = select.xpath("//li/a")
    print(anchors)
    return anchors
[<Element a at 0x1051703c8>, <Element a at 0x1058cd088>, <Element a at 0x1058e5348>]
现在问题是如何从Element取出里面的元素及属性?
解决方案
待续
BeautifulSoup 感觉更方便些,如下:
from bs4 import BeautifulSoup
import requests
# Fetch the Sina home page and print "<title> <url>" for the links found in
# each ``.SC_Order_Fix`` section.
url_sina = 'https://www.sina.com.cn/'
# A timeout keeps the script from hanging forever on a stalled server.
res = requests.get(url_sina, timeout=10)
# Set the encoding before reading .text so the body is decoded as UTF-8.
res.encoding = 'utf-8'
html = res.text
soup = BeautifulSoup(html, 'html.parser')

for news in soup.select('.SC_Order_Fix'):
    # Only sections that contain list links are of interest.
    if not news.select('ul li a'):
        continue
    for link in news.select('a'):
        # .get() returns None for anchors without an href attribute,
        # where indexing with ['href'] would raise KeyError.
        href = link.get('href')
        if href:
            print(link.text + ' ' + href)
        else:
            print(link.text)