本次目标是爬取客路网(Klook)商品套餐的日程表,并将其存储至 MongoDB 中。
源代码
import requests
import json
from datetime import datetime
import pymongo
import re
import pandas as pd
import time
import random
# Pool of browser User-Agent strings that getheaders() draws from.
# Built once at import time instead of on every call.
_USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    # The original literal ended with a stray space; a header value should not
    # carry trailing whitespace, so it is dropped here.
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0",
)


def getheaders():
    """Return one User-Agent string picked at random from the built-in pool.

    Despite the name, the return value is a single string (the User-Agent
    value), not a headers dict.
    """
    return random.choice(_USER_AGENTS)
def get_proxy():
    """Ask the local proxy-pool service for a proxy.

    Returns:
        The decoded JSON body of ``GET http://127.0.0.1:5055/get/``.
        The caller in this file reads its ``"proxy"`` key.

    Raises:
        requests.RequestException: the pool service is unreachable or did
            not answer within the timeout.
        ValueError: the response body is not valid JSON.
    """
    # Without a timeout a stalled pool service would block the crawler forever.
    return requests.get("http://127.0.0.1:5055/get/", timeout=10).json()
def delete_proxy(proxy):
    """Tell the local proxy-pool service to drop *proxy*.

    Args:
        proxy: the proxy identifier as handed out by the pool; it is
            interpolated verbatim into the ``proxy`` query parameter.

    Raises:
        requests.RequestException: the pool service is unreachable or did
            not answer within the timeout.
    """
    # Without a timeout a stalled pool service would block the crawler forever.
    requests.get("http://127.0.0.1:5055/delete/?proxy={}".format(proxy), timeout=10)
def getHtml(url, retry_count=5, timeout=10):
    """Download *url* through a proxy from the pool and return the body text.

    One proxy is taken from the pool and the request is attempted up to
    ``retry_count`` times with it. If every attempt fails, the proxy is
    removed from the pool and ``None`` is returned.

    Args:
        url: address to fetch.
        retry_count: maximum number of attempts (default 5, as before).
        timeout: per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The response text, or ``None`` when all attempts failed.
    """
    proxy = get_proxy().get("proxy")
    print(proxy)
    if proxy:
        proxy_url = "http://{}".format(proxy)
        # requests selects the proxy by the scheme of the target URL. The
        # original mapped only "http", so https:// targets skipped the proxy.
        proxies = {"http": proxy_url, "https": proxy_url}
    else:
        # The pool returned no usable proxy: fall back to a direct request
        # rather than sending traffic to "http://None".
        proxies = None

    while retry_count > 0:
        try:
            html = requests.get(url, proxies=proxies, headers=headers, timeout=timeout)
        except requests.RequestException:
            retry_count -= 1
        else:
            print(html)
            return html.text

    # All attempts failed: remove the proxy from the pool.
    if proxy:
        delete_proxy(proxy)
    return None
def str2utc(local_str):
    """Convert a local-time string to a naive ``datetime`` expressed in UTC.

    The string is read as wall-clock time in the machine's local time zone
    (for example, the result is 8 hours earlier on a UTC+8 machine).

    Args:
        local_str: timestamp formatted as ``"%Y-%m-%d %H:%M:%S"``.

    Returns:
        A ``datetime`` without tzinfo holding the equivalent UTC time.

    Raises:
        ValueError: *local_str* does not match the expected format.
    """
    # Local import: the file header only imports ``datetime`` itself.
    from datetime import timezone

    local_time = datetime.strptime(local_str, "%Y-%m-%d %H:%M:%S")
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; this is the
    # documented replacement. tzinfo is stripped so the result stays naive.
    aware_utc = datetime.fromtimestamp(local_time.timestamp(), tz=timezone.utc)
    return aware_utc.replace(tzinfo=None)
def save_to_Mongo(result):
    """Insert *result* into the ``MONGO_TABLE`` collection and print the outcome.

    Storing is best-effort: a failure is reported on stdout, together with
    the exception, and is not raised to the caller.

    Args:
        result: the document (dict) to store.
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the single-document replacement.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Kept broad on purpose so one bad document does not stop the crawl,
        # but the cause is now shown.
        print('存储到MongoDb失败', result, exc)
    else:
        print('存储到MongoDB成功', result)
# Matches a ``package_id`` key followed by its numeric value, in either Python
# repr form ('package_id': 123) or JSON form ("package_id":123).
_PACKAGE_ID_RE = re.compile(r"""package_id['"]?\s*:\s*['"]?(\d+)""")


def get_package_id_list(data_infos):
    """Extract every numeric ``package_id`` value from a text dump.

    The original sliced each match with a fixed offset (``[13:-1]``) and
    required a trailing comma. That returned wrong text when ``package_id``
    was the last key of a dict or when the text was JSON-formatted. The
    value is now taken with a capture group.

    Args:
        data_infos: string form of the product's package list.

    Returns:
        The ids as a list of digit strings, in order of appearance; an
        empty list when none are present.
    """
    return _PACKAGE_ID_RE.findall(data_infos)
def get_product_id():
    """Return the ``_id`` of every document in ``klook.products``.

    Returns:
        A pandas Series named ``_id`` with a default 0..n-1 index, so
        callers can take ``len()`` of it and index it by position. The
        Series is empty when the collection is empty (the original raised
        ``KeyError`` in that case).
    """
    # The context manager closes the client; the original opened a new
    # connection on every call and never released it.
    with pymongo.MongoClient('localhost', 27017) as client:
        # Projection: only _id is needed, so do not transfer whole documents.
        docs = list(client['klook']['products'].find({}, {'_id': 1}))
    return pd.DataFrame(docs, columns=['_id'])['_id']
def get_packge_id(id_num):
    """Return the package ids of the product at position *id_num*.

    The position refers to the order in which ``klook.products`` is
    returned by an unsorted ``find()``, the same order get_product_id()
    uses. The misspelled function name is kept because callers use it.

    Args:
        id_num: zero-based position of the product document.

    Returns:
        List of package-id strings extracted from the document's
        ``packages`` field (empty when the field is absent).

    Raises:
        KeyError: no document exists at that position.
    """
    # The context manager closes the client; the original leaked one per call.
    with pymongo.MongoClient('localhost', 27017) as client:
        # Fetch only the one document needed instead of loading the whole
        # collection into a DataFrame.
        cursor = (
            client['klook']['products']
            .find({}, {'packages': 1})
            .skip(id_num)
            .limit(1)
        )
        docs = list(cursor)
    if not docs:
        raise KeyError(id_num)
    return get_package_id_list(str(docs[0].get('packages')))
def _fetch_json(url):
    """Download *url* via getHtml and decode it as JSON; None on any failure."""
    body = getHtml(url)
    if body is None:
        return None
    try:
        return json.loads(body)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return None


def _result_field(payload, field):
    """Return ``payload['result'][field]``, or [] when the shape is unexpected."""
    result = payload.get('result') if isinstance(payload, dict) else None
    value = result.get(field) if isinstance(result, dict) else None
    return value or []


def get_package_data(count_number):
    """Crawl and store the schedules of every package of one product.

    For each package of the product at position *count_number*, the
    schedule list is downloaded. Each schedule is stored under its
    ``arrangement_id`` (renamed to ``_id``) and then enriched with the first
    price entry, the product id, and datetime versions of ``date`` and
    ``block_out_time``.

    Changes from the original:
      * product id and package ids are looked up once, not on every loop
        iteration (each lookup is a database query);
      * a failed download or an empty price list skips that item instead of
        crashing on ``json.loads(None)`` / ``prices[0]``;
      * the price request goes through getHtml like the schedule request;
      * the four separate ``update_one`` calls are merged into one ``$set``;
      * the update filter uses the stored ``_id`` value itself, so it matches
        the inserted document whatever type ``arrangement_id`` has.

    Args:
        count_number: zero-based position of the product.

    Raises:
        ValueError: a schedule's ``date`` / ``block_out_time`` is not in the
            format expected by str2utc.
        KeyError: a schedule lacks ``arrangement_id``, ``date`` or
            ``block_out_time``.
    """
    product_id = get_product_id()[count_number]
    print("商品:", product_id, "开始获取~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    package_ids = get_packge_id(count_number)

    for package_id in package_ids:
        print("商品:", product_id, "套餐:", package_id)
        url = ('https://www.klook.com/xos_api/v1/usrcsrv/packages/'
               + str(package_id)
               + '/schedules_and_units?_=1565855432617')
        payload = _fetch_json(url)
        if payload is None:
            print("商品:", product_id, "套餐:", package_id, "日程获取失败, 已跳过")
            continue

        for schedule in _result_field(payload, 'schedules'):
            # The arrangement id becomes the MongoDB primary key.
            schedule['_id'] = schedule.pop("arrangement_id")
            save_to_Mongo(schedule)

            url_price = ('https://www.klook.com/xos_api/v1/usrcsrv/arrangements/'
                         + str(schedule['_id'])
                         + '/units?_=1565762299561')
            prices = _result_field(_fetch_json(url_price), 'prices')

            extra_fields = {
                'product_id': str(product_id),
                'date_obj': str2utc(schedule['date']),
                'block_out_time_obj': str2utc(schedule['block_out_time']),
            }
            if prices:
                extra_fields['prices'] = prices[0]

            db[MONGO_TABLE].update_one(
                {'_id': schedule['_id']},
                {'$set': extra_fields})

    print("商品:", product_id, "成功获取~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
# MongoDB connection shared by save_to_Mongo() and get_package_data().
# The name MONGO_URl (lower-case "l") is kept as-is in case other code refers to it.
MONGO_URl = 'localhost:27017'
MONGO_DB = 'klook'
client = pymongo.MongoClient(MONGO_URl)
db = client[MONGO_DB]
MONGO_TABLE = 'schedules'

# Request headers sent by getHtml().
# NOTE(review): the cookie embeds a captured browser session (JSESSIONID,
# CSRF-Token, tracking ids). It looks stale and should not live in source
# control; consider loading it from configuration.
# The cookie literal was broken across two physical lines in the original
# (a syntax error). It is written as two adjacent literals split at the same
# point, so the resulting value is unchanged.
headers = {
    "cookie": (
        "abtest_revamp=1563875646033; device_id_new=ryEGX8eZpJ00300000000000005B8Gc9qXKS00314982965WpYWiKzBGKGAWkn1DGD5S16Goh5Mk004Kht7irbDUr00000YVxEr0000041IK5K68xk78dzoTmemq:40::f9905f43e6590003; _gcl_au=1.1.31383824.1563875650; tag_fok=1563875648000; _ga=GA1.2.1742545335.1563875651; _pxvid=e628e4c7-ad2f-11e9-bcd0-0242ac12000e; klk_lang=zh-CN; __stripe_mid=beb6214f-fedc-4ec7-bd1e-8a336755b064; _gcl_aw=GCL.1563934680.EAIaIQobChMIoc6V0L7M4wIVCa6WCh2V5QA6EAAYASAAEgJJSPD_BwE; _gac_UA-54803406-1=1.1563934680.EAIaIQobChMIoc6V0L7M4wIVCa6WCh2V5QA6EAAYASAAEgJJSPD_BwE; _gac_UA-86696233-1=1.1563934685.EAIaIQobChMIoc6V0L7M4wIVCa6WCh2V5QA6EAAYASAAEgJJSPD_BwE; klk_currency=CNY; _gid=GA1.2.389333073.1565751780; px-abgroup=A; px-abper=100; webp_support=1; retina_support=0; JSESSIONID=280B2DDE101FA68E5B8D0A2BA2695BC0; CSRF-Token=MTU2NTkzNTA5NXxOREFWd2tQeldtTERCWXZaTW9ucjdJTXJXR05Xc1drQ3w47gIhUzoBKj9lGBxwUkIl6sSj0_z_cw8tPAMy6kA9bw==; CSRF-Token-Valid=valid; mp_c2ca8b423fd75a10792debf44cd6b51a_mixpanel=%7B%22distinct_id%22%3A%20%2216c1e409f9f2cc-08e5cb5145a7c4-37607c04-13c680-16c1e409fa031c%22%2C%22%24device_id%22%3A%20%2216c1e409f9f2cc-08e5cb5145a7c4-37607c04-13c680-16c1e409fa031c%22%2C%22%24search_engine%22%3A%20%22google%22%2C%22%24initial_referrer%22%3A%20%22https%3A%2F%2Fwww.google.com%2F%22%2C%22%24initial_referring_domain%22%3A%20%22www.google.com%22%2C%22Language%22%3A%20%22zh-CN%22%2C%22Platform%22%3A%20%22Web%22%2C%22Backend%20User%20Country%22%3A%20%22CN%22%2C%22Test-WS2199%22%3A%20%22variant%22%2C%22Page%20Type%22%3A%20%22Destination%20Page%22%2C%22__timers%22%3A%20%7B%7D%2C%22Login%20Status%22%3A%20false%2C%22Test-3%22%3A%20%22variant-10%22%2C%22Test-14%22%3A%20%22variant-55%22%2C%22Test-WS2196%22%3A%20%22control%22%2C%22Test-WS2350%22%3A%20%22variant%22%2C%22'Test-BB1%22%3A%20%22control%22%2C%22Test-12%22%3A%20%22control%22%2C%22Test-23%22%3A%20%22variant-70%22%2C%22Test-AAAAA%22%3A%20%22variant%22%2C%22Test-24%22%3A%20%22control%22%2C%22Test-25%22%3A%20%22control%2"
        "2%2C%22Test-26%22%3A%20%22variant-75%22%2C%22WS-2515%22%3A%20%22WS-2515-variant1%22%2C%22WS-2351%22%3A%20%22WS-2351-variant1%22%7D; wcs_bt=s_2cb388a4aa34:1565935116; _px3=51c25ddccc41460714b0c77f9086094ebca4547fe6aff217bada6cd0a71b9cda:1K7R1eyox+K6FywON0Wjpr/BvHj0YRXaQx9pH45gDDO4QEcYa7eI+hSsgvjvtAdRFfNFo/12w1i3MBbgQsHVhA==:1000:4duS7MSxB3gm7SQSqY7aj6Hnnyzqw2hPcZl8z6X6Ee56B7pT4yuuroAOE6n43zXK+D22dsZWIFh4kp3252pn2sm9khCmkHbsNckMqPyDeKKSVWjo/8QOfv+t2pDd0D6nVliwyxyI5OVY9hhoBdkkKJS41SwORVvfALvpEDnEnBg="
    ),
    # Picked once at start-up, so the whole run uses the same User-Agent.
    "user-agent": getheaders(),
    "Sec-Fetch-Mode": "cors",
}

# Driver: crawl every product in turn, pausing 5 seconds between products.
for i in range(0, len(get_product_id())):
    time.sleep(5)
    get_package_data(i)