### 展示代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import time
# NOTE(review): `info` is never appended to or read anywhere below — appears unused.
info = []
# Request headers copied from a logged-in browser session so knewone.com
# serves the scraper as if it were a normal browser: session cookies,
# referer, user-agent and the CSRF token. These values are session-specific
# and will expire; refresh them from the browser's dev tools when requests
# start failing.
headers = {
'cookie': '_gat=1; gr_user_id=7e4ae08a-42a2-4645-bcdc-c5121ebf4c28; _knewone_v2_session=bUswaVJNb2poakxGZVUwRmQ0V0dqWm9sOFpQVDBjMWF1b0grazh0Q1l6dGpub3BHK1JSanV2OGc0ZmhDZitBMkNCZFY5T2JZQ1ZxN0ZpM0dONExsN3AwREtLcmw3dC9Ub1hKbnk5N3NabG1oTGdHU01EM0lHZU5nQWRGRGdaZUhhV1crYTY4Zlk4NGdUSjhmak1jSFVNSSt1YzNEOVY5bS9zVC8rbkJHT282M0dheXpwT0FNWDFoV1lZQ0hNcnp5LS1qbEM3TStTRDNVOFFLWkF3UEh6QzNBPT0%3D--62e609c4894070dfadfe1ddfd816eae455c0803e; _ga=GA1.2.1202148329.1464331142; Hm_lvt_b44696b80ba45a90a23982e53f8347d0=1464331143; Hm_lpvt_b44696b80ba45a90a23982e53f8347d0=1464331216; gr_session_id_e7b7e334c98d4530928513e7439f9ed2=65a4dbb6-0a13-4293-8745-06637ceba521',
'referer': 'https://knewone.com/things',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36',
'x-csrf-token': 'W+zjfj7CxkXZvvltEUXvvLWVrEMMdQZFgV7wgdQx5On71c+Iqo9YcmlK1u+Cdz7V92NR59vOyBQaOfzbMcKCeA==',
}
def getText(item):
    """Return the tag's visible text with newlines removed.

    Returns '' when the tag is missing. The original guard only tested
    ``item == 0``, so a ``None`` tag (what ``select_one`` actually returns
    for a miss) crashed with AttributeError; ``None`` is now handled too.
    """
    if item is None or item == 0:
        return ''
    # get_text() already returns a str, so the original ''.join(...) was a no-op.
    return item.get_text().replace('\n', '')
def getPic(item):
    """Return the tag's 'src' attribute, or '' when the tag or attribute is missing.

    The original crashed on a ``None`` tag (guard only checked ``== 0``) and
    on a tag without a ``src`` attribute (``''.join(None)`` -> TypeError).
    """
    if item is None or item == 0:
        return ''
    # Tag.get() returns None for a missing attribute; normalise to ''.
    return item.get('src') or ''
def getUrl(item):
    """Return the absolute product URL built from the tag's relative 'href'.

    Returns '' for a missing tag (``None`` or the legacy 0 sentinel) or a
    missing ``href`` attribute, instead of raising like the original did.
    """
    if item is None or item == 0:
        return ''
    href = item.get('href')
    if not href:
        return ''
    return 'https://knewone.com' + href
def getInfo(url, data=None):
    """Fetch one knewone listing page and print one dict per product.

    Each dict carries the product title, image URL, favourite count and
    absolute product link. Nothing is printed when the request does not
    return HTTP 200 or when ``data`` is already supplied by the caller.

    Sleeps 2 seconds per item to throttle the scrape.
    """
    web_data = requests.get(url, headers=headers)
    # Force UTF-8 so pages with special characters (e.g. Böhm, °) decode cleanly.
    web_data.encoding = "utf-8"
    if web_data.status_code == 200:
        soup = BeautifulSoup(web_data.text, 'html.parser')
        titles = soup.select('#things_list > article > section > h4 > a')
        pics = soup.select('#things_list > article > header > a > img')
        favos = soup.select('span.fanciers_count')
        links = soup.select('#things_list > article > header > a')
        # PEP 8: compare against None with `is`, not `==`.
        if data is None:
            for title, pic, favo, link in zip(titles, pics, favos, links):
                time.sleep(2)  # be polite to the server
                data = {
                    'title': getText(title),
                    'pic': getPic(pic),
                    'favo': getText(favo),
                    'link': getUrl(link)
                }
                print(data)
def get_more_page(start, end):
    """Scrape the listing pages numbered ``start`` through ``end - 1``.

    ``range`` is end-exclusive, so ``get_more_page(1, 50)`` fetches
    pages 1..49.
    """
    for page_no in range(start, end):
        page_url = 'https://knewone.com/things?page=' + str(page_no)
        getInfo(page_url)
if __name__ == '__main__':
    # Guard the entry point so importing this module does not immediately
    # launch a 49-page scrape (range end is exclusive: pages 1..49).
    get_more_page(1, 50)
### 出现问题
这是一个老生常谈的问题,最主要是因为pycharm的编码有问题,只需要将IDE和Project 的编码问题都改成UTF-8即可。
这个问题是个大坑(找了好久我才搞定的/(ㄒoㄒ)/~~)。只要网页中有特殊字符,如 ° ,[Böhm]等,抓取的时候就都会报错。
另外的附加解决方案是
a. 加入 shebang,它也只能解决 IDE 当中的编码问题
#!/usr/bin/env python
# -*- coding: utf-8 -*-
b.给网页进行编码转换,适用于输出时有乱码。尤其需要注意网页的编码形式,尽管大多数为UTF-8
web_data = requests.get(url, headers=headers)
web_data.encoding = "utf-8"
2. 还有一个小坑:对 tag 使用 .get() 或 .get_text() 时,有些返回结果需要加 [0] 取第一个元素,但有些不需要。