Preface
- Do not open the output spreadsheet while the scraper is running, or writing the data will fail.
- Without multithreading or the like, it runs fairly slowly. Total data: (2000 - 3000) entries.
- The scraper holds its data mainly in a pandas DataFrame before writing it out.
- The site itself is not wholesome; users, exercise your own discretion.
- So far, the only change observed has been the domain of its image CDN.
- Search for the details yourself...
MM131 Image Scraping
- Scrape the site's data and build a local Excel data table (the full flow is sketched just below)
- Enter a search term (only titles are searched)
- Confirm the results and start the download (there is no image preview here; optimize that yourself if you need one)
- A look at the finished download folders (for other reasons, only part is shown here)
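The two scripts below chain together: one crawl pass builds the spreadsheets, then the downloader searches them and fetches the images. A minimal end-to-end sketch, assuming mm131.py and item.py sit in the working directory and the downloader code is saved as downloader.py (that module name, the paths, and the search term are placeholders, not fixed by the project):
# end_to_end.py - hypothetical glue script tying the two steps together
from mm131 import MM131
from downloader import MM131ImageDownloader  # assumed module name for the downloader listing below

# Step 1: crawl the navigation and list pages into ./nav_items.xlsx and ./img_data.xlsx
mm = MM131("./")
mm.start()

# Step 2: search the img_data spreadsheet by title and download the matching sets
dl = MM131ImageDownloader("./", "img_data", "D:/")
dl.download_match("沐")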
Scraping the site's page data (mm131.py)
from requests import get as http_get  # renamed so it doesn't clash with MM131.get below
from lxml import etree
from item import Item
from urllib import parse
from os.path import join
import re
import time
class MM131(object):
    url = "https://m.mm131.net"

    def __init__(self, path, nav_items_name="nav_items.xlsx", data_name="img_data.xlsx"):
        self.nav_items = Item(join(path, nav_items_name))
        self.img_items = Item(join(path, data_name))
        self.nav_items.get_Frame(["Type", "URL", "Pages", "ID"])
        self.img_items.get_Frame(["Title", "URL", "ID", "Pages", "Type"])

    @staticmethod
    def get(url, decode="gbk"):
        # the site serves GBK-encoded pages
        cont = http_get(url=url)
        return cont.content.decode(decode)

    @staticmethod
    def get_list_url(url, type_num, page):
        return parse.urljoin(url, "list_{}_{}.html".format(type_num, str(page)))
    def parse_nav(self):
        items = []
        parser = etree.HTML(self.get(self.url))
        nav_items = parser.xpath("//nav[@class='slide-menu']/ul/li[@class='dropdown']"
                                 "/ul//li[contains(@class,'cat-item')]")
        for nav in nav_items:
            try:
                # the category URL doubles as the key of this entry
                type_url = nav.xpath("a/@href")[0]
                if type_url == "https://m.mm131.net/app":
                    continue
                type_name = nav.xpath("a/text()")[0]
                items.append([type_name, type_url])
            except IndexError:
                continue
        return items
    def parse_list_info(self, url, item):
        parser = etree.HTML(self.get(url))
        content = parser.xpath("//content[@id='content']")
        if len(content) != 0:
            # total number of list pages for this category
            pages = content[0].xpath("nav//span[@id='spn']/text()")
            if len(pages) != 0:
                page_res = re.findall(r"\d+", pages[0])
                total_page = page_res[1] if len(page_res) > 1 else 0
                item.append(str(total_page))
            # numeric id of this category's list
            type_num = content[0].xpath("nav//a[@id='xbtn']/@href")
            if len(type_num) != 0:
                num_res = re.findall(r"\d+", type_num[0])
                type_num = num_res[0] if len(num_res) != 0 else 0
                item.append(str(type_num))
        return item
    def parse_nav_info(self):
        items = self.parse_nav()
        for inx, nav in enumerate(items):
            # the first entry is the front page, which gets dummy page/id values
            self.nav_items.append((nav + ['0', '0']) if inx == 0 else self.parse_list_info(nav[1], nav))
        self.nav_items.save()
        return self.nav_items.get_data()
    def parse_list_detail(self, url, type_name):
        parser = etree.HTML(self.get(url))
        content = parser.xpath("//content[@id='content']")
        if len(content) != 0:
            articles = content[0].xpath("article[@class='post']")
            for article in articles:
                try:
                    title = article.xpath("div[@class='post-header']/h2/a/text()")[0]
                    p_url = article.xpath("div[@class='post-header']/h2/a/@href")[0]
                    # the numeric id embedded in the article URL
                    av = p_url[p_url.rfind('/') + 1: p_url.rfind('.')]
                    # column 2 holds the id, so duplicate entries are skipped
                    self.img_items.append([title, p_url, str(av),
                                           str(self.parse_detail_page(p_url)),
                                           type_name], str(av), 2)
                except IndexError:
                    continue
            self.img_items.show()
            self.img_items.save()
    def parse_detail_page(self, url):
        parser = etree.HTML(self.get(url))
        articles = parser.xpath("//article")
        if len(articles) > 1:
            pages = articles[1].xpath("div[@class='paging']/span/text()")
            pages = pages[0] if len(pages) != 0 else ''
            page_res = re.findall(r"\d+", pages)
            return page_res[1] if len(page_res) > 1 else 0
        return 0  # avoid returning None, which would break int() on the Pages column later
    def parse_type_detail_info(self, pos):
        ni = self.nav_items
        name = ni.get_data()[pos, 0]
        for i in range(1, int(ni.get_data()[pos, 2]) + 1):
            print("Fetching info from page {}".format(i))
            t_url = self.get_list_url(ni.get_data()[pos, 1], ni.get_data()[pos, 3], i)
            self.parse_list_detail(t_url, name)
            time.sleep(2)

    def start(self):
        self.parse_nav_info()
        for i in range(0, len(self.nav_items.get_data())):
            print("Type {}:".format(i))
            self.parse_type_detail_info(i)
            time.sleep(5)
if __name__ == '__main__':
    mm = MM131("./")
    mm.start()
Storage class for the scraped page data (item.py)
from pandas import DataFrame
from pandas import read_excel
from os.path import exists
import numpy as np
def trans_arr(obj):
return list(obj.values())
class Item(object):
    df: DataFrame

    def __init__(self, path="./Default.xlsx"):
        self.path = path
        # instance attributes (previously mutable class attributes shared across instances)
        self.df_title = []
        self.save_counter = 0

    def get_Frame(self, title_arr):
        self.df_title = title_arr
        if not self.load():
            self.df = DataFrame(columns=title_arr)
    def get_col_name(self, col):
        # accepts either a column index or a column name
        col_name = ''
        if isinstance(col, int):
            col_name = self.df_title[col]
        elif isinstance(col, str):
            col_name = col  # fixed: previously assigned the type object instead of the name
        return col_name
    # Axis :: [Row: 0 && Column: 1]
    def get_len(self, axis):
        return self.df.shape[axis]

    def get_df(self):
        return self.df

    def get_data(self):
        return np.array(self.df)

    def set_index(self, col_name):
        self.df = self.df.set_index(col_name, drop=False)

    def set_path(self, path):
        self.path = path
    def pop(self, col, items):
        col_name = self.get_col_name(col)
        self.set_index(col_name)
        self.df = self.df.drop(items)

    def data_only(self, data, col_inx):
        # True if `data` already exists in column `col_inx`
        return data in self.df.iloc[:, col_inx].values

    def append(self, data_arr, check=None, col_inx=None):
        # optional dedup: skip the row when `check` is already present in column `col_inx`
        if check is not None and col_inx is not None:
            if self.data_only(check, col_inx):
                return False
        self.df.loc[self.df.shape[0]] = data_arr
        return True
    def load(self):
        res = exists(self.path)
        if res:
            data = read_excel(self.path)
            self.df = DataFrame(data)
        return res

    def save(self):
        self.save_counter += 1
        self.df.to_excel(self.path, index=False)
        print('\n', 'Save count:', self.save_counter, '\n')

    def show(self):
        print(self.df.iloc[:, :])
The scraped data ends up stored as .xlsx spreadsheets.
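As a quick sanity check, the Item class also works on its own, independent of the scraper. A minimal sketch (demo.xlsx and the column names are placeholders):
# item_demo.py - exercising Item by itself
from item import Item

it = Item("./demo.xlsx")
it.get_Frame(["Title", "ID"])       # builds an empty frame, or loads demo.xlsx if it already exists
it.append(["first", "1"])
ok = it.append(["second", "1"], "1", 1)  # False: id "1" already exists in column 1
print(ok)
it.save()                           # writes ./demo.xlsx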
Downloading images from the scraped data
- When fetching the images themselves, the site by default serves a substitute (its second image) whenever the request does not come from the page that image belongs to.
- Requesting from within the web page and inspecting the traffic showed the request header 'Referer': "https://m.mm131.net/". Attaching this header makes the server return the original image for download.
- The sleep calls are there to keep the image scraping from going too fast. While making use of and adapting someone's web service, also take care to respect and protect the operator's infrastructure.
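The hotlink protection is easy to observe with two bare requests, one with and one without the Referer header. A minimal sketch, assuming the server still behaves as described (the id and page in the URL are placeholders following the get_img_url pattern below):
# referer_demo.py - comparing responses with and without the Referer header
import requests

url = "https://img1.nthjjz.com/pic/0000/1.jpg"  # placeholder id/page

plain = requests.get(url)  # no Referer: the server substitutes a filler image
hotlink = requests.get(url, headers={"Referer": "https://m.mm131.net/"})  # original image
print(len(plain.content), len(hotlink.content))  # the sizes should differ
The full downloader follows.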
# coding: utf8
import requests
from os.path import join
from openpyxl.reader.excel import load_workbook
import re
import os
import time
class MM131ItemLoader(object):
    max_row = 0
    max_col = 0

    def __init__(self, path, name):
        self.path = path
        self.name = name
        self.data = []  # moved into __init__ so instances don't share one list
        self.book = self.load_book(self.path, self.name)
        self.sheet = self.book.active
        if self.sheet:
            self.max_row = self.sheet.max_row
            self.max_col = self.sheet.max_column

    def load_book(self, path, name):
        self.book = load_workbook('{}.xlsx'.format(join(path, name)))
        return self.book
    def match(self, p_col, val, num=None):
        if len(val) == 0:
            return
        for row in range(2, num if num and 0 < num < self.max_row else self.max_row + 1):
            s_val = self.sheet.cell(row, p_col).value
            if s_val is None:
                continue
            # literal substring match first (val is escaped so regex metacharacters stay literal)
            res = re.findall(r".*{}.*".format(re.escape(val)), s_val)
            if len(res) == 0:
                # fuzzy fallback: the characters of val in order, with anything in between
                mr = ""
                for s in val:
                    mr += "{}.*".format(re.escape(s))
                mc = re.compile(mr)
                res = re.findall(mc, s_val)
            if len(res) != 0:
                temp = []
                for col in range(1, self.max_col + 1):
                    temp.append(self.sheet.cell(row, col).value)
                self.data.append(temp)

    def get_items(self):
        return self.data
class MM131ImageDownloader(MM131ItemLoader):
    def __init__(self, path, name, save_path):
        super().__init__(path, name)
        self.path = path
        self.save_path = join(save_path, "mm131")
        self.origin_path = os.getcwd()
        if not os.path.exists(self.save_path):
            os.mkdir(self.save_path)
        os.chdir(self.save_path)

    def set_excel_path(self, path):
        self.path = path

    def set_save_path(self, save_path):
        self.save_path = save_path
    @staticmethod
    def download_img(img_url, path, name):
        print("Image URL:", img_url)
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; '
                                 'Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0',
                   'Referer': "https://m.mm131.net/"}
        r = requests.get(img_url, headers=headers, stream=True)
        if r.status_code == 200:
            # write the response body out as the image file
            with open(join(path, name), 'wb') as f:
                f.write(r.content)
            print("Request succeeded")
        else:
            print("Request failed")
    @staticmethod
    def get_img_url(av, page):
        """
        2020: img1.mmmw.net
        2021: img1.nthjjz.com
        """
        return "https://img1.nthjjz.com/pic/{}/{}.jpg".format(str(av), str(page))

    def download(self, av, page, name, path):
        url = self.get_img_url(av, page)
        path = join(self.save_path, path) if path else self.save_path
        self.download_img(url, path, name)
    def download_match(self, val, num=None):
        self.match(1, val, num)
        print("Keyword search: {}".format(val))
        items = self.get_items()
        if len(items) == 0:
            print("No matching records")
            return
        print("Found {} entries, ready to download...".format(len(items)))
        ir = input("Download now? (Y/N) ")
        if ir.upper() == "Y":
            for (inx, item) in enumerate(items):
                title = item[0]
                img_type = item[4]
                i_path = join(self.save_path, img_type, title)
                if os.path.exists(i_path):
                    continue  # this set has already been downloaded
                os.makedirs(i_path)
                av = item[2]
                pages = int(item[3])
                print("{}. {}\t[downloading]".format(str(inx + 1), title))
                for page in range(1, pages + 1):
                    self.download(av, page, "{}.jpg".format(page), i_path)
                    time.sleep(1)  # throttle the image requests, as noted above
        else:
            im = input("Meow, keep searching? (Y/N) ")
            if im.upper() == "Y":
                self.data = []
                val = input("Enter a search term: ")
                self.download_match(val, num)
            else:
                print("Meow meow!!!")
if __name__ == '__main__':
    # MM131ImageDownloader(xlsx directory, xlsx file name, image save path)
    mm = MM131ImageDownloader("./", "img_data", "D:/")
    mm.download_match("沐")