The goal is to scrape the articles under the Weight Loss section of Women's Health. (Written for my own report, so treat it as reference rather than teaching material.)
For each article we grab: title, subtitle, author, author avatar, date, lead image, and body text.
Without further ado, straight to the code; any Scrapy project follows pretty much this same shape.
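One note on layout before the code: the imports below assume a standard Scrapy project named demo (that's what from demo.items implies), so the files sit roughly like this:

demo/
    scrapy.cfg
    run.py
    demo/
        items.py
        pipelines.py
        settings.py
        spiders/
            mySpider.py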
mySpider.py
import scrapy
from demo.items import WeightItem
from bs4 import UnicodeDammit


class MySpider(scrapy.Spider):
    name = "mySpider"
    source_url = 'https://www.womenshealthmag.com/weight-loss/'

    def start_requests(self):
        url = MySpider.source_url
        yield scrapy.Request(url=url, callback=self.parse)
        print("request sent")

    def parse(self, response):
        try:
            # Let UnicodeDammit guess the encoding before building a Selector
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            print("got the page source")
            # Article cards on the section page
            links = selector.xpath(
                "//div[position()>2][starts-with(@class,'simple-item grid-simple-item ')]"
                "/a[@class='simple-item-image item-image']")
            print("section-page xpath matched")
            for link in links:
                newslink = link.xpath("./@href").extract_first()
                # If hrefs turn out to be site-absolute, response.urljoin(newslink) is safer
                yield scrapy.Request(url=MySpider.source_url + newslink,
                                     callback=self.parse1)
        except Exception as err:
            print(err)

    def parse1(self, response):
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        selector = scrapy.Selector(text=data)
        # Body paragraphs, joined into a single string
        text = selector.xpath("//p[@class='body-text']/text()").extract()
        text = "\n".join(text)
        # Lead image is lazily loaded, hence @data-src rather than @src
        pic = selector.xpath(
            "/html/body/div[2]/div[4]/div[1]/div[1]/div/img/@data-src").extract_first()
        header = selector.xpath(
            "//header[@class='content-header standard-header']"
            "/div[@class='content-header-inner']")
        title = header.xpath(".//h1/text()").extract_first()
        subtitle = header.xpath(".//p/text()").extract_first()
        profilephoto = header.xpath(".//img/@data-src").extract_first()
        author = header.xpath(".//span[@class='byline-name']/text()").extract_first()
        date = header.xpath(".//time[@class='content-info-date']/text()").extract_first()
        item = WeightItem()
        # Guard against missing nodes: fall back to an empty string
        item["title"] = title.strip() if title else ""
        item["subtitle"] = subtitle.strip() if subtitle else ""
        item["author"] = author.strip() if author else ""
        item["date"] = date.strip() if date else ""
        item["profilephoto"] = profilephoto.strip() if profilephoto else ""
        item["text"] = text.strip() if text else ""
        item["pic"] = pic.strip() if pic else ""
        yield item
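The class names in those XPath expressions are tied to the site's current markup, so when they stop matching it helps to test them outside the crawl. A minimal standalone sketch (the site may reject plain urllib requests, hence the browser-like User-Agent):

# Standalone XPath smoke test for the section page, no crawl needed.
from urllib.request import Request, urlopen
import scrapy

req = Request("https://www.womenshealthmag.com/weight-loss/",
              headers={"User-Agent": "Mozilla/5.0"})  # avoid a plain-urllib block
html = urlopen(req).read().decode("utf-8", errors="replace")
selector = scrapy.Selector(text=html)
hrefs = selector.xpath(
    "//div[starts-with(@class,'simple-item grid-simple-item ')]"
    "/a[@class='simple-item-image item-image']/@href").extract()
print(len(hrefs), "article links found")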
items.py (the name must match the from demo.items import above)
import scrapy


class WeightItem(scrapy.Item):
    # One field per piece of data scraped from an article page
    title = scrapy.Field()
    subtitle = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    profilephoto = scrapy.Field()
    text = scrapy.Field()
    pic = scrapy.Field()
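Just to illustrate how these items behave (scrapy.Item acts like a dict, which is why the spider assigns and the pipeline reads fields by key):

# Illustration only: WeightItem supports dict-style access.
from demo.items import WeightItem

item = WeightItem(title="demo title")
print(item["title"])              # demo title
print(sorted(WeightItem.fields))  # all declared field names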
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class WeightPipeline(object):
    def open_spider(self, spider):
        print("starting")
        try:
            self.con = pymysql.connect(host="127.0.0.1",
                                       port=3306,
                                       user="root",
                                       passwd="password",  # your MySQL password
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            print("connected to mysql")
            try:
                self.cursor.execute("create database mydb")
                print("created new database")
            except Exception:
                pass  # database already exists
            self.con.select_db("mydb")
            try:
                # Must match the table name created below
                self.cursor.execute("drop table WomansHealth")
                print("dropped the old table")
            except Exception:
                pass  # table did not exist yet
            try:
                sql = """
                create table WomansHealth(
                    Id varchar(8) primary key,
                    Title varchar(512),
                    Subtitle varchar(256),
                    Profilephoto varchar(256),
                    Author varchar(64),
                    Date varchar(16),
                    Text text,
                    Pic varchar(256))
                """
                self.cursor.execute(sql)
                print("created new table")
            except Exception:
                self.cursor.execute("delete from WomansHealth")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
            print("closed")
            print(self.count)  # this never showed up for me

    def process_item(self, item, spider):
        try:
            print("----------------------")
            print("title: " + item["title"])
            print("subtitle: " + item["subtitle"])
            print("author: " + item["author"])
            print("date: " + item["date"])
            print("avatar link: " + item["profilephoto"])
            print("body: " + item["text"])
            print("image link: " + item["pic"])
            print("---------------------")
            if self.opened:
                self.count += 1
                print(self.count)
                ID = str(self.count).zfill(8)  # zero-pad the id to 8 characters
                self.cursor.execute(
                    "insert into WomansHealth(Id,Title,Subtitle,Profilephoto,Author,Date,Text,Pic) "
                    "values (%s,%s,%s,%s,%s,%s,%s,%s)",
                    (ID, item["title"], item["subtitle"], item["profilephoto"],
                     item["author"], item["date"], item["text"], item["pic"]))
        except Exception as err:
            print(err)
        return item
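One design note: the pipeline only commits in close_spider, so the rows become visible in MySQL after the whole crawl finishes; if you would rather see each article appear immediately, calling self.con.commit() at the end of process_item works too (my tweak, not part of the original code).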
run.py
from scrapy import cmdline

cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())
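An equivalent way to run it, if you'd rather stay in-process than shell out through cmdline (a sketch, assuming the spider lives at demo/spiders/mySpider.py):

# Alternative run script using Scrapy's CrawlerProcess; still reads settings.py.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from demo.spiders.mySpider import MySpider  # assumed module path

process = CrawlerProcess(get_project_settings())
process.crawl(MySpider)
process.start()  # blocks until the crawl finishes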
settings.py needs one extra entry:
ITEM_PIPELINES = {
    'demo.pipelines.WeightPipeline': 300,
}
With that in place, everything gets stored in the database.
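To double-check that the rows actually landed, a quick standalone query works (a sketch; the credentials are assumed to match the pipeline above):

# Standalone sanity check for the stored rows.
import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="password",  # same placeholder as the pipeline
                      db="mydb", charset="utf8")
with con.cursor() as cursor:
    cursor.execute("select count(*) from WomansHealth")
    print(cursor.fetchone()[0], "articles stored")
con.close()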