# NOTE: asyncio.as_completed requires `await`, so it must be driven from
# inside an async function (here: main), not from synchronous code.
import requests
from bs4 import BeautifulSoup as bs
import time
import asyncio
import aiohttp
async def get_html(url, headers):
    """Fetch one listing page and scrape title/model/area link texts.

    Args:
        url: Page URL to fetch.
        headers: Dict of HTTP headers (e.g. User-Agent) sent with the request.

    Returns:
        A (titles, models, areas) tuple of lists of stripped strings, or
        (None, None, None) if the request failed.
    """
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url=url, headers=headers) as res:
                res.raise_for_status()  # raise on non-2xx status codes
                html = await res.text()
        soup = bs(html, 'html.parser')
        # NOTE(review): the selectors assume the classes sit directly on the
        # <a> tags — adjust to the page's actual HTML structure if needed.
        titles = [a.text.strip() for a in soup.select('a.title')]
        models = [a.text.strip() for a in soup.select('a.model')]
        areas = [a.text.strip() for a in soup.select('a.area')]
        return titles, models, areas
    except aiohttp.ClientError as e:
        print(f"Error fetching the URL: {e}")
        return None, None, None  # three Nones to mirror the success tuple
async def main():
    """Fetch all 75 listing pages concurrently and collect scraped rows.

    Returns:
        List of (title, model, area) tuples gathered from every page that
        was fetched successfully, in completion order.
    """
    urls = [f'https://xsool.com/plugin.php?id=tom_tcpc&site=1&mod=list&page={num}' for num in range(1, 76)]
    headers = {'User-Agent': 'Mozilla/5.0'}
    datas = []
    tasks = [get_html(url, headers) for url in urls]
    for task in asyncio.as_completed(tasks):
        titles, models, areas = await task
        # BUG FIX: get_html returns (None, None, None) on failure — never a
        # bare None — so the old `result is not None` check let errors
        # through and zip(None, ...) raised TypeError. Test the unpacked
        # value instead.
        if titles is not None:
            # zip truncates to the shortest list if a page yields
            # mismatched counts of titles/models/areas.
            datas.extend(zip(titles, models, areas))
    print(f'Collected {len(datas)} items of data.')
    return datas
if __name__ == '__main__':
    # perf_counter is a monotonic clock — correct for measuring elapsed
    # time, unlike time.time(), which can jump with system clock changes.
    start = time.perf_counter()
    asyncio.run(main())  # drive the async entry point from sync code
    end = time.perf_counter()
    print(f'All Completed in {end - start} seconds.')