Please credit the source when reposting.
The aiohttp module
import asyncio
import aiohttp

async def fetch(session, url):
    # The plain style works too:
    # response = await session.get(url)
    # html = await response.text()
    # print(response.status)
    # return html
    # Either "async with" (below) or the plain style (above) can be used here.
    async with session.get(url) as response:
        html = await response.text()
        # This value is returned to the awaiting caller in main().
        return html

async def main():
    async with aiohttp.ClientSession() as session:
        data = await fetch(session, 'http://www.baidu.com')
        print(data)

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
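Incidentally, on Python 3.7 and later the two event-loop lines can be replaced by a single asyncio.run call, which creates, runs, and closes the loop itself. A minimal equivalent sketch:

import asyncio
import aiohttp

async def main():
    async with aiohttp.ClientSession() as session:
        async with session.get('http://www.baidu.com') as response:
            print(await response.text())

# asyncio.run creates a new event loop, runs main() to completion, then closes the loop.
asyncio.run(main())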
Crawling sspai (少数派)
If the crawl opens too many file descriptors at once and fails with "too many open files", raise the per-process limit on Linux with: ulimit -n 100000
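ulimit raises the file-descriptor ceiling at the OS level. An alternative is to keep the crawler from ever hitting the ceiling by capping concurrent connections with aiohttp's TCPConnector. A minimal sketch (the limit of 50 is an arbitrary example, not a value from this post):

import asyncio
import aiohttp

async def crawl():
    # TCPConnector(limit=50) allows at most 50 simultaneous connections,
    # so the process never holds more than roughly 50 sockets open at a time.
    connector = aiohttp.TCPConnector(limit=50)
    async with aiohttp.ClientSession(connector=connector) as session:
        async def fetch(url):
            async with session.get(url) as response:
                return await response.text()
        urls = ['http://sspai.com/api/v1/articles?offset={}&limit=1'.format(i) for i in range(100)]
        return await asyncio.gather(*(fetch(u) for u in urls))

asyncio.run(crawl())

The full crawler from the post follows.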
import asyncio
import aiohttp
import time
import re

start = time.perf_counter()  # time.clock() was removed in Python 3.8

async def fetch(session, url):
    headers = {'content-type': 'application/json'}
    async with session.get(url, headers=headers) as response:
        html = await response.text()
        return await parse(html)

async def main(url):
    cookies = {'cookies_are': 'working'}
    async with aiohttp.ClientSession(cookies=cookies) as session:
        return await fetch(session, url)

async def parse(data):
    # Grab the article title between "title":" and ","released_at"
    # (non-greedy, so it stops at the first occurrence).
    title = re.search('(?<="title":").*?(?=","released_at")', data).group()
    print(title)
    return title

url_list = ("http://sspai.com/api/v1/articles?offset={}&limit=1".format(offset) for offset in range(100))
tasks = (asyncio.ensure_future(main(url)) for url in url_list)
loop = asyncio.get_event_loop()
result = loop.run_until_complete(asyncio.gather(*tasks))

end = time.perf_counter()
print(end - start)
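One caveat on parse(): the lookaround regex breaks as soon as the API reorders or renames fields. Since the endpoint returns JSON, parsing it with the json module is more robust. A sketch of that idea; extract_titles is a hypothetical helper, not part of the original post, and it walks the parsed structure so nothing needs to be assumed about the exact response envelope:

import json

def extract_titles(obj, found=None):
    # Collect every string stored under a "title" key, at any nesting depth.
    if found is None:
        found = []
    if isinstance(obj, dict):
        for key, value in obj.items():
            if key == 'title' and isinstance(value, str):
                found.append(value)
            else:
                extract_titles(value, found)
    elif isinstance(obj, list):
        for item in obj:
            extract_titles(item, found)
    return found

# Usage inside parse(): titles = extract_titles(json.loads(data))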