为何要建IP池
由于各个网站反爬的措施不断更新,部分网站采用一段时间内同一个IP高频率访问就封禁IP的方式,所以需要建立一个可供爬虫时切换使用的IP池
如何建一个IP池
1.寻找免费ip代理网站,目前我选取的是西刺,如图:
2.爬取ip并保存到mysql数据库
(另:由于ip存在时效性,故只需要在爬取时判断可用性)
代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pymysql
import random
class xici(object):
    """Crawl free proxies from one listing page of xicidaili.com (西刺).

    Scraped entries accumulate in ``self.ip_list`` as dicts with the keys
    ``ip``, ``port`` and ``type``.
    """

    def __init__(self, page):
        # Listing page number to crawl (1-based on the site).
        self.page = page
        # Pool of desktop User-Agent strings; one is picked at random per
        # request so successive requests look less uniform to the server.
        self.user_agent = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
        # Accumulates every proxy dict scraped by get_ip(); repeated calls
        # keep appending rather than replacing.
        self.ip_list = []

    def get_ip(self):
        """Fetch the configured listing page and append its proxies.

        Returns:
            The cumulative ``self.ip_list`` of ``{'ip','port','type'}`` dicts.
        """
        url = 'https://www.xicidaili.com/nt/{0}'.format(self.page)
        headers = {'User-Agent': random.choice(self.user_agent)}
        # A timeout keeps the crawler from hanging forever on a dead host.
        ip_data = requests.get(url, headers=headers, timeout=10)
        # The page is HTML, so use the HTML parser; the original passed
        # 'xml', which applies an XML parser to non-well-formed HTML and
        # can silently drop or mangle the table markup.
        soup = BeautifulSoup(ip_data.content, 'html.parser')
        # Each data row of the proxy table is a <tr>.  Read the <td> cells
        # directly instead of splitting the row text on '\n' at hard-coded
        # indices, which breaks whenever the parser's whitespace differs.
        for row in soup.find_all('tr'):
            cells = [td.get_text(strip=True) for td in row.find_all('td')]
            if len(cells) < 6:
                continue  # header row (<th> only) or malformed row
            # Column layout on xicidaili's /nt listing: [flag, ip, port,
            # location, anonymity, type, ...] -- TODO confirm against a
            # live page; the site's markup may have changed.
            self.ip_list.append({
                'ip': cells[1],
                'port': cells[2],
                'type': cells[5],
            })
        return self.ip_list
class with_mysql(object):
    """Thin helper for persisting dicts as rows in a local MySQL database."""

    def __init__(self, db):
        # Connection settings for the local MySQL server.
        # NOTE(review): credentials are hard-coded in source; move them to
        # environment variables or a config file before publishing.
        self.config = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': 'HzH951126',
            'db': db,
            'charset': 'utf8mb4',
        }

    def save_dict(self, table, dic):
        """Upsert one row into ``table`` via ``REPLACE INTO``.

        The dict's keys become column names and its values the row values.
        NOTE: ``table`` and the keys are interpolated into the SQL text, so
        they must come from trusted code, never from user input; the values
        themselves are passed as parameters and are escaped by the driver.
        """
        db = pymysql.connect(**self.config)
        try:
            cursor = db.cursor()
            columns = ', '.join(dic.keys())
            placeholders = ', '.join(['%s'] * len(dic))
            sql = 'REPLACE INTO {table}({keys}) VALUES ({values})'.format(
                table=table, keys=columns, values=placeholders)
            try:
                # execute() returns the affected-row count: REPLACE reports
                # 1 (plain insert) or 2 (delete + insert), both truthy.
                if cursor.execute(sql, tuple(dic.values())):
                    print('Successful')
                db.commit()
            except Exception as e:
                print(e)
                db.rollback()
        finally:
            # Always release the connection, even if commit/rollback itself
            # raises -- the original closed it outside any finally block.
            db.close()
if __name__ == '__main__':
    xc = xici(1)
    mysql = with_mysql('spiders')
    # Crawl once and reuse the result: the original called get_ip() twice,
    # which issued a second HTTP request and -- because get_ip() appends to
    # the persistent self.ip_list -- saved every proxy twice.
    proxies = xc.get_ip()
    print(proxies)
    for proxy in proxies:
        mysql.save_dict('ip', proxy)
运行结果:
mysql数据库:
结语
IP 池的建立, 对于之后的数据爬取与分析会提供必要的帮助,接下来开始正常的数据爬取与分析