之前做爬虫，爬取贴吧松爱协会的内容是存在 txt 文件里的，这样并不好，所以这一次存到 MongoDB 里。
这次是在 Windows 上安装 MongoDB。
官网下载
https://www.mongodb.com/download-center?jmp=nav#community
启动:
mongod.exe --logpath "c:\data\log\mongodb.log" --logappend --dbpath "c:\data\db" --serviceName "MongoDB" --install
net start MongoDB
存:
#coding=utf-8
import requests
import datetime
from bs4 import BeautifulSoup
from pymongo import MongoClient
import sys
import time
# Python 2 workaround: reload(sys) brings back sys.setdefaultencoding (which
# is normally removed from the module at interpreter startup) so that the
# default string encoding can be switched to UTF-8 for the text handled below.
reload(sys)
sys.setdefaultencoding('utf-8')
# Connect to the MongoDB server on localhost:27017 and select the collection
# that the scraped posts are written to.
client = MongoClient('localhost', 27017)
db = client.zhengdai_database
collection = db.zhengai

# Download the thread page, sending a desktop-browser User-Agent header.
link = "https://tieba.baidu.com/p/4877675324"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
r = requests.get(link, headers=headers)
soup = BeautifulSoup(r.text, "lxml")

# Collect the post-body <div> elements. The class string (including its
# trailing space) is passed to find_all exactly as written.
content_list = soup.find_all("div", class_="d_post_content j_d_post_content ")

# Print every post and store it as one document per post.
for i, node in enumerate(content_list):
    content = node.text.strip()
    print ("诗集"+str(i+1)+":")
    print (content)
    post = {
        "id": i,  # zero-based position of the post on the page
        "content": content,
        "date": datetime.datetime.utcnow(),  # current UTC time
    }
    collection.insert_one(post)