搞爬虫第一个遇到的反爬问题就是被封IP,想要继续愉快地玩耍当然是要使用代理IP了。百度“代理ip”发现都是收费的网站,不过它们会开放一小部分免费的代理IP。想想这些网站的IP是怎么来的,立马想到爬虫呗:一个网站爬一部分,多找几个网站,爬下来的代理IP数量就很可观了。下面以西刺代理网站为例,实现这一功能。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/12/7 23:14
# @Author : python_spider
# @Site :
# @File : 01get_ip.py
# @Software: PyCharm
import urllib2
from bs4 import BeautifulSoup
import csv
import random
# Pool of desktop-browser User-Agent strings; IPspider() picks one at random
# per run so requests look like they come from an ordinary browser rather
# than a script (basic anti-anti-crawler measure).
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
def IPspider(numpage):
    """Crawl the first *numpage* listing pages of xicidaili.com and write
    every proxy's IP and port to ips.csv (one row per proxy).

    :param numpage: number of listing pages to fetch; each page holds
                    about 100 proxies.
    """
    url = 'http://www.xicidaili.com/nn/'
    # One random User-Agent for the whole run, as in the original design.
    user_agent = random.choice(user_agent_list)
    headers = {'User-agent': user_agent}
    # `with open(...)` replaces the deprecated file() builtin and guarantees
    # the CSV is closed and flushed even if a request raises mid-crawl
    # (the original handle was never closed).
    with open('ips.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile)
        for num in xrange(1, numpage + 1):
            ip_url = url + str(num)
            print('Now downloading the ' + str(num * 100) + ' ips')
            request = urllib2.Request(ip_url, headers=headers)
            # A timeout keeps a stalled connection from hanging the crawl forever.
            content = urllib2.urlopen(request, timeout=10).read()
            bs = BeautifulSoup(content, 'html.parser')
            for item in bs.find_all('tr'):
                tds = item.find_all('td')
                # Header rows have no <td>; on data rows column 1 is the IP
                # and column 2 the port (matches the original indexing).
                if len(tds) > 2:
                    writer.writerow([tds[1].text.encode('utf-8'),
                                     tds[2].text.encode('utf-8')])
# Crawl all IPs and ports from the first ten listing pages.
# Guarded so importing this module does not trigger a network crawl.
if __name__ == '__main__':
    IPspider(10)