BeautifulSoup4 Web Scraping Practice

Scraping Douban Books. I'm not sure whether it was an anti-scraping measure, but I could only get about 50 pages before my IP was banned outright.
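
If that happens again, a simple guard (just a sketch; the safe_get helper, retry count and wait time are my own assumptions and not part of the scripts below) is to check each response's status code and back off instead of hammering the site:

import time
import requests

def safe_get(url, headers, max_retries=3, wait=60):
    # hypothetical helper: back off when Douban starts rejecting requests
    for attempt in range(max_retries):
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            return resp
        # a 403 or a redirect to a login/captcha page usually means the IP has been flagged
        print("got HTTP %d, waiting %d seconds before retrying" % (resp.status_code, wait))
        time.sleep(wait)
    return None

The first version of the script: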

import pymysql
import requests
from bs4 import BeautifulSoup
import time


# %d is the numeric placeholder for the paging offset
baseUrl = "https://book.douban.com/tag/日本文学?start=%d&type=T"   
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}

def get_books(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url,headers=headers)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ul", "subject-list").find_all("li")  # get all book entries on the page
    for i in items:
        books = {}  # temporary dict for one book's data
        books["name"] = i.find("div", "info").a['title']  # book title
        books["pub"] = i.find("div", "info").find("div", "pub").text  # publication info
        books["score"] = i.find("span", "rating_nums").text if(i.find("span", "rating_nums")) else ""  # rating
        books["comment_num"] = i.find("span", "pl").text  # number of ratings
        books["detail"] = i.find("p", "").text if(i.find("p", "")) else ""  # book description
        books["link"] = i.find("div","pic").find("a").get("href")  # link to the book's detail page
        books["poster"] = i.find("div","pic").find("a").find('img').get("src")  # cover image URL
        lists.append(books)  # append to the result list
    return lists

if __name__ == "__main__":
    # connect to the database; charset must be specified or inserts may fail
    db = pymysql.connect(host="localhost",user="root",password="root",db="new_schema",charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS douban_books")  # drop the table if it already exists
    # SQL statement to create the table
    createTab = """CREATE TABLE douban_books(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        pub VARCHAR(100) NOT NULL,
        score VARCHAR(100) NOT NULL,
        comment_num VARCHAR(100) NOT NULL,
        detail VARCHAR(300) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(0,1000,20):
        lists = get_books(start)  # fetch and parse one page of data
        for i in lists:
            # SQL insert statement; %s is the placeholder for string values
            sql = "INSERT INTO `douban_books`(`name`,`pub`,`score`,`comment_num`,`detail`,`link`,`poster`) VALUES(%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["pub"], i["score"], i["comment_num"], i["detail"], i["link"], i["poster"]))
                db.commit()
                print(i["name"] + " inserted successfully")
            except:
                db.rollback()
        time.sleep(0.5)
    db.close()
The results are imported into MySQL.
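
As a quick sanity check after a run (a minimal sketch reusing the connection settings and the douban_books table from the script above), the imported rows can be counted:

import pymysql

db = pymysql.connect(host="localhost", user="root", password="root",
                     db="new_schema", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM douban_books")
print(cursor.fetchone()[0], "rows imported")
db.close()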

Now for the key part:
An improved approach that safely scraped about 45,000 Douban Books records.

import pymysql
import requests
from bs4 import BeautifulSoup
import time


headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}

def get_books(start):
    # %d is the numeric placeholder for the paging offset
    # baseUrl = "https://book.douban.com/tag/旅行?start=%d&type=T"
    url_list = ["https://book.douban.com/tag/绘本?start=%d&type=T","https://book.douban.com/tag/推理?start=%d&type=T","https://book.douban.com/tag/青春?start=%d&type=T","https://book.douban.com/tag/言情?start=%d&type=T","https://book.douban.com/tag/科幻?start=%d&type=T","https://book.douban.com/tag/武侠?start=%d&type=T","https://book.douban.com/tag/奇幻?start=%d&type=T","https://book.douban.com/tag/随笔?start=%d&type=T","https://book.douban.com/tag/散文?start=%d&type=T","https://book.douban.com/tag/诗歌?start=%d&type=T","https://book.douban.com/tag/童话?start=%d&type=T",
    "https://book.douban.com/tag/名著?start=%d&type=T","https://book.douban.com/tag/小说?start=%d&type=T","https://book.douban.com/tag/港台?start=%d&type=T","https://book.douban.com/tag/漫画?start=%d&type=T","https://book.douban.com/tag/历史?start=%d&type=T","https://book.douban.com/tag/哲学?start=%d&type=T","https://book.douban.com/tag/传记?start=%d&type=T","https://book.douban.com/tag/设计?start=%d&type=T","https://book.douban.com/tag/建筑?start=%d&type=T","https://book.douban.com/tag/电影?start=%d&type=T","https://book.douban.com/tag/回忆录?start=%d&type=T","https://book.douban.com/tag/音乐?start=%d&type=T","https://book.douban.com/tag/日本文学?start=%d&type=T",
    "https://book.douban.com/tag/旅行?start=%d&type=T","https://book.douban.com/tag/励志?start=%d&type=T","https://book.douban.com/tag/职场?start=%d&type=T","https://book.douban.com/tag/美食?start=%d&type=T","https://book.douban.com/tag/教育?start=%d&type=T","https://book.douban.com/tag/灵修?start=%d&type=T","https://book.douban.com/tag/健康?start=%d&type=T","https://book.douban.com/tag/家居?start=%d&type=T","https://book.douban.com/tag/经济学?start=%d&type=T","https://book.douban.com/tag/管理?start=%d&type=T","https://book.douban.com/tag/商业?start=%d&type=T","https://book.douban.com/tag/金融?start=%d&type=T","https://book.douban.com/tag/营销?start=%d&type=T",
    "https://book.douban.com/tag/理财?start=%d&type=T","https://book.douban.com/tag/股票?start=%d&type=T","https://book.douban.com/tag/企业史?start=%d&type=T","https://book.douban.com/tag/科普?start=%d&type=T","https://book.douban.com/tag/互联网?start=%d&type=T","https://book.douban.com/tag/编程?start=%d&type=T","https://book.douban.com/tag/交互设计?start=%d&type=T","https://book.douban.com/tag/算法?start=%d&type=T","https://book.douban.com/tag/通信?start=%d&type=T","https://book.douban.com/tag/神经网络?start=%d&type=T"] 
    lists = []
    for baseUrl in url_list:
        url = baseUrl % start
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
        items = soup.find("ul", "subject-list").find_all("li")  # get all book entries on the page
        for i in items:
            books = {}  # temporary dict for one book's data
            books["name"] = i.find("div", "info").a['title']  # book title
            books["pub"] = i.find("div", "info").find("div", "pub").text  # publication info
            books["score"] = i.find("span", "rating_nums").text if(i.find("span", "rating_nums")) else ""  # rating
            books["comment_num"] = i.find("span", "pl").text  # number of ratings
            books["detail"] = i.find("p", "").text if(i.find("p", "")) else ""  # book description
            books["link"] = i.find("div","pic").find("a").get("href")  # link to the book's detail page
            books["poster"] = i.find("div","pic").find("a").find('img').get("src")  # cover image URL
            lists.append(books)  # append to the result list
        time.sleep(3.5)
    return lists

if __name__ == "__main__":
    # connect to the database; charset must be specified or inserts may fail
    db = pymysql.connect(host="localhost",user="root",password="root",db="new_schema",charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS douban_books")  # drop the table if it already exists
    # SQL statement to create the table
    createTab = """CREATE TABLE douban_books(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        pub VARCHAR(100) NOT NULL,
        score VARCHAR(100) NOT NULL,
        comment_num VARCHAR(100) NOT NULL,
        detail VARCHAR(300) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    a = 0
    for start in range(0,1000,20):
        lists = get_books(start)  # fetch and parse one page of data for every tag
        print(start)
        for i in lists:
            # SQL insert statement; %s is the placeholder for string values
            sql = "INSERT INTO `douban_books`(`name`,`pub`,`score`,`comment_num`,`detail`,`link`,`poster`) VALUES(%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["pub"], i["score"], i["comment_num"], i["detail"], i["link"], i["poster"]))
                db.commit()
                print(i["name"] + " inserted successfully")
                a+=1  # running count of inserted rows
                print(a)
            except:
                db.rollback()
        time.sleep(3.5)
    db.close()

Improving on the above: scrape the books under every one of Douban Books' popular tags, 120 tags in total.

import pymysql
import requests
from bs4 import BeautifulSoup
import time


tagUrl = "https://book.douban.com/tag/?view=cloud" 
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}


def link_title():
    url = tagUrl
    lists = []
    html = requests.get(url,headers=headers)
    soup = BeautifulSoup(html.content, "html.parser")
    items = soup.find("table", "tagCol").find_all("tr")
    for i in items:
        lines = i.find_all("td")
        for j in lines:
            books = {}
            books["title"] = j.a.string  # tag name
            books["num"] = j.b.string    # number of books under the tag
            lists.append(books["title"])  # build the tag list, 120 popular tags in total
    return lists


def get_books(start):
    # %d is the numeric placeholder for the paging offset
    url = "https://book.douban.com/tag/待选?start=%d&type=T"  # 待选 is a placeholder replaced by each tag below
    url_list = link_title()
    lists = []
    for tag in url_list:
        baseUrl = url.replace('待选', tag)  # substitute the tag into the URL template
        full_url = baseUrl % start
        html = requests.get(full_url, headers=headers)
        soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
        items = soup.find("ul", "subject-list").find_all("li")  # get all book entries on the page
        for i in items:
            books = {}  # temporary dict for one book's data
            books["tag"] = tag  # tag the book was found under
            books["name"] = i.find("div", "info").a['title']  # book title
            books["pub"] = i.find("div", "info").find("div", "pub").text  # publication info
            books["score"] = i.find("span", "rating_nums").text if(i.find("span", "rating_nums")) else ""  # rating
            books["comment_num"] = i.find("span", "pl").text  # number of ratings
            books["detail"] = i.find("p", "").text if(i.find("p", "")) else ""  # book description
            books["link"] = i.find("div","pic").find("a").get("href")  # link to the book's detail page
            books["poster"] = i.find("div","pic").find("a").find('img').get("src")  # cover image URL
            lists.append(books)  # append to the result list
        time.sleep(3.5)
    return lists

if __name__ == "__main__":
    # connect to the database; charset must be specified or inserts may fail
    db = pymysql.connect(host="localhost",user="root",password="root",db="new_schema",charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS douban_books_alltags")  # drop the table if it already exists
    # SQL statement to create the table
    createTab = """CREATE TABLE douban_books_alltags(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        tag VARCHAR(100) NOT NULL,
        name VARCHAR(100) NOT NULL,
        pub VARCHAR(100) NOT NULL,
        score VARCHAR(100) NOT NULL,
        comment_num VARCHAR(100) NOT NULL,
        detail VARCHAR(300) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    a = 0
    for start in range(0,1000,20):
        lists = get_books(start)  # fetch and parse one page of data for every tag
        print(start)
        for i in lists:
            # SQL insert statement; %s is the placeholder for string values
            sql = "INSERT INTO `douban_books_alltags`(`tag`,`name`,`pub`,`score`,`comment_num`,`detail`,`link`,`poster`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["tag"], i["name"], i["pub"], i["score"], i["comment_num"], i["detail"], i["link"], i["poster"]))
                db.commit()
                print(i["name"] + " inserted successfully")
                a+=1  # running count of inserted rows
                print(a)
            except:
                db.rollback()
        time.sleep(3.5)
    db.close()

Scraping the Douban Movie Top 250, for reference.

import pymysql
import requests
from bs4 import BeautifulSoup


# %d is the numeric placeholder for the paging offset
baseUrl = "https://movie.douban.com/top250?start=%d&filter="   
def get_movies(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ol", "grid_view").find_all("li")  # get all movie entries on the page
    for i in items:
        movie = {}  # temporary dict for one movie's data
        movie["rank"] = i.find("em").text  # ranking position
        movie["link"] = i.find("div","pic").find("a").get("href")  # link to the movie's detail page
        movie["poster"] = i.find("div","pic").find("a").find('img').get("src")  # poster image URL
        movie["name"] = i.find("span", "title").text  # movie title
        movie["score"] = i.find("span", "rating_num").text  # rating
        movie["other"] = i.find("span", "other").text.replace('/','').replace('    ','/')  # alternative titles
        movie["quote"] = i.find("span", "inq").text if(i.find("span", "inq")) else ""  # some movies have no one-line quote, so fall back to an empty string
        movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
        movie["detail"] = i.find("div", "bd").find("p", "").text  # detail text (director, cast, year, country, genre)
        lists.append(movie)  # append to the result list
    return lists

if __name__ == "__main__":
    # connect to the database; charset must be specified or inserts may fail
    db = pymysql.connect(host="localhost",user="root",password="root",db="new_schema",charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS movies")  # drop the table if it already exists
    # SQL statement to create the table
    createTab = """CREATE TABLE movies(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(20) NOT NULL,
        `rank` VARCHAR(4) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL,
        score VARCHAR(4) NOT NULL,
        other VARCHAR(100) NOT NULL,
        quote VARCHAR(50),
        detail VARCHAR(300) NOT NULL,
        comment_num VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(0,250,25):
        lists = get_movies(start)  # fetch and parse one page of data
        for i in lists:
            # SQL insert statement; %s is the placeholder for string values
            sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["comment_num"]))
                db.commit()
                print(i["name"] + " inserted successfully")
            except:
                db.rollback()
    db.close()

Split the scraped Douban movie details into year, country/region, genre, etc., and write them into the MySQL database.

import pymysql
import requests
from bs4 import BeautifulSoup
import re

# %d is the numeric placeholder for the paging offset
baseUrl = "https://movie.douban.com/top250?start=%d&filter="   
def get_movies(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ol", "grid_view").find_all("li")  # get all movie entries on the page
    for i in items:
        movie = {}  # temporary dict for one movie's data
        movie["rank"] = i.find("em").text  # ranking position
        movie["link"] = i.find("div","pic").find("a").get("href")  # link to the movie's detail page
        movie["poster"] = i.find("div","pic").find("a").find('img').get("src")  # poster image URL
        movie["name"] = i.find("span", "title").text  # movie title
        movie["score"] = i.find("span", "rating_num").text  # rating
        movie["other"] = i.find("span", "other").text.replace('/','').replace('    ','/')  # alternative titles
        movie["quote"] = i.find("span", "inq").text if(i.find("span", "inq")) else ""  # some movies have no one-line quote, so fall back to an empty string
        movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
        movie["detail"] = i.find("div", "bd").find("p", "").text  # detail text (director, cast, year, country, genre)
        lists.append(movie)  # append to the result list
    return lists


if __name__ == "__main__":
    # connect to the database; charset must be specified or inserts may fail
    db = pymysql.connect(host="localhost",user="root",password="root",db="new_schema",charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS movies")  # drop the table if it already exists
    # SQL statement to create the table
    createTab = """CREATE TABLE movies(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(20) NOT NULL,
        `rank` VARCHAR(4) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL,
        score VARCHAR(4) NOT NULL,
        other VARCHAR(100) NOT NULL,
        quote VARCHAR(50),
        detail VARCHAR(300) NOT NULL,
        time VARCHAR(300) NOT NULL,
        country VARCHAR(300) NOT NULL,
        type VARCHAR(300) NOT NULL,
        drictor_artist VARCHAR(300) NOT NULL,
        comment_num VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(0,250,25):
        lists = get_movies(start)  # fetch and parse one page of data
        for i in lists:
            data = []  # reset per movie so data[0] is this movie's director/cast line
            action = i["detail"]
            remove = re.compile(r'                            |\n|</br>|\.*')  # strip long runs of indentation spaces, newlines, stray tags and dots
            bd = re.sub(remove, "", action)
            bd = re.sub('<br>', "   ", bd)  # replace <br> with spaces
            bd = re.sub('/', "   ", bd)     # replace / with spaces
            words = bd.split("   ")
            for s in words:
                if len(s) != 0 and s != ' ':  # drop empty fragments
                    data.append(s)
            i["time"] = data[-3][-5:]      # release year
            i["country"] = data[-2]        # country or region
            i["type"] = data[-1]           # genre
            i["drictor_artist"] = data[0]  # director and cast
            # SQL insert statement; %s is the placeholder for string values
            sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`time`,`country`,`type`,`drictor_artist`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["time"], i["country"], i["type"], i["drictor_artist"], i["comment_num"]))
                db.commit()
                print(i["name"] + " inserted successfully")
            except:
                db.rollback()
    db.close()

The release years of the Top 250 movies can then be plotted.

Douban Movie Top 250: distribution of release years
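
A minimal sketch of that plot (the decade bucketing and chart styling are my own choices, not from the original post; it reads the time column back out of the movies table created above):

import pymysql
import pandas as pd
import matplotlib.pyplot as plt

db = pymysql.connect(host="localhost", user="root", password="root",
                     db="new_schema", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("SELECT `time` FROM movies")
years = pd.Series([row[0] for row in cursor.fetchall()])
db.close()

# keep only values containing a four-digit year, then count movies per decade
years = pd.to_numeric(years.str.extract(r"(\d{4})")[0], errors="coerce").dropna().astype(int)
decades = (years // 10 * 10).value_counts().sort_index()
decades.plot(kind="bar")
plt.xlabel("decade")
plt.ylabel("number of movies")
plt.title("Douban Top 250 movies by decade")
plt.show()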

Scraping Lianjia second-hand housing listings and writing the data into MySQL.

import pymysql
import requests
from bs4 import BeautifulSoup
import time


# %d is the numeric placeholder for the page number
baseUrl = "http://gz.lianjia.com/ershoufang/pg%d/"   
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}

def get_books(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url,headers=headers)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ul", "sellListContent").find_all("li")  # get all listing entries on the page
    for i in items:
        books = {}  # temporary dict for one listing's data
        books["title"] = i.find("div","title").a.string  # listing title, e.g. 雅逸庭3房出售 单边位 带主套 免交个税
        books["link"] = i.find("div","title").a.get('href')  # e.g. http://gz.lianjia.com/ershoufang/GZ0002556701.html
        books["address_xiaoqu"] = i.find("div","houseInfo").a.string  # estate name, e.g. 广州雅居乐花园雅逸庭
        books["address_info"] = i.find("div","houseInfo").a.next_sibling  # e.g. | 3室2厅 | 109.5平米 | 东北 | 精装 | 无电梯
        books["flood"] = i.find("div","flood").find("div","positionInfo").span.next_sibling  # floor/year, e.g. 中楼层(共6层)2010年建塔楼
        books["area"] = i.find("div","flood").find("div","positionInfo").a.string  # district, e.g. 华南
        books["total_price"] = i.find("div","totalPrice").find("span").text  # total price
        books["mean_price"] = i.find("div", "unitPrice").find("span").text  # price per square meter
        books["followInfo"] = i.find("div","followInfo").span.next_sibling  # e.g. 103人关注 / 共42次带看 / 29天以前发布
        lists.append(books)  # append to the result list
    return lists

if __name__ == "__main__":
    # connect to the database; charset must be specified or inserts may fail
    db = pymysql.connect(host="localhost",user="root",password="root",db="new_schema",charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS douban_books")  # drop the table if it already exists (the table name is reused here for the Lianjia data)
    # SQL statement to create the table
    createTab = """CREATE TABLE douban_books(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(100) NOT NULL,
        link VARCHAR(100) NOT NULL,
        address_xiaoqu VARCHAR(100) NOT NULL,
        address_info VARCHAR(100) NOT NULL,
        flood VARCHAR(100) NOT NULL,
        area VARCHAR(300) NOT NULL,
        total_price VARCHAR(50) NOT NULL,
        mean_price VARCHAR(100) NOT NULL,
        followInfo VARCHAR(200) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(1,100,1):
        lists = get_books(start)  # fetch and parse one page of data
        for i in lists:
            # SQL insert statement; %s is the placeholder for string values
            sql = "INSERT INTO `douban_books`(`title`,`link`,`address_xiaoqu`,`address_info`,`flood`,`area`,`total_price`,`mean_price`,`followInfo`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["title"], i["link"], i["address_xiaoqu"], i["address_info"], i["flood"], i["area"], i["total_price"], i["mean_price"], i["followInfo"]))
                db.commit()
                print(i["title"] + " inserted successfully")
            except:
                db.rollback()
        time.sleep(3.5)
    db.close()
Lianjia second-hand housing data

Processing the Lianjia data
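
The processing script below reads a CSV file named b.csv with GBK encoding, so the MySQL table has to be exported first. A minimal sketch of that export (assuming the douban_books table filled by the Lianjia script above, and that the text is GBK-encodable):

import csv
import pymysql

# assumption: dump the Lianjia rows stored above into the b.csv file the next script reads
db = pymysql.connect(host="localhost", user="root", password="root",
                     db="new_schema", charset="utf8mb4")
cursor = db.cursor()
columns = ["title","link","address_xiaoqu","address_info","flood","area","total_price","mean_price","followInfo"]
cursor.execute("SELECT " + ",".join(columns) + " FROM douban_books")
with open("b.csv", "w", newline="", encoding="gbk") as f:
    writer = csv.writer(f)
    writer.writerow(columns)          # header row so pandas picks up column names
    writer.writerows(cursor.fetchall())
db.close()

With b.csv in place, the processing script is: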

# -*- coding: utf-8 -*-
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt

  
# read the exported CSV file
df = pd.read_csv('b.csv', encoding='gbk') 
house = pd.DataFrame(df)  


# split the house-info string into separate columns
houseinfo_split = pd.DataFrame((x.split('|') for x in house.address_info),index=house.index,columns=['xiaoqu','huxing','mianji','chaoxiang','zhuangxiu','dianti'])  
print(houseinfo_split.head())  


# merge the split columns back into the original table
house = pd.merge(house, houseinfo_split, right_index=True, left_index=True)
# split the follow-info string (followers / viewings / days since listing)
followinfo_split = pd.DataFrame((x.split('/') for x in house.followInfo), index=house.index, columns=['guanzhu','daikan','fabu'])
# merge the split follow-info back into the original table
house = pd.merge(house, followinfo_split, right_index=True, left_index=True)
# count listings by house type (huxing)
huxing = house.groupby('huxing')['huxing'].agg(len)
# inspect the house-type counts
print(huxing)
  

# the rest of the script draws several plots; when running, comment out the other plotting blocks so they do not interfere with each other
# bar chart of the house-type distribution
plt.rc('font', family='STXihei', size=11)  
a=np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])  
plt.barh([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],huxing,color='#052B6C',alpha=0.8,align='center',edgecolor='white')  
plt.ylabel('house type')  
plt.xlabel('number')  
plt.xlim(0,400)  
plt.ylim(0,20)  
plt.title('Housing family distribution')  
plt.legend(['Number'], loc='upper right')  
plt.grid(color='#95a5a6',linestyle='--', linewidth=2,axis='y',alpha=0.4)  
plt.yticks(a,('13室0厅','15室6厅','1室0厅','1室1厅','1室2厅','2室1厅','2室2厅','3室1厅','3室2厅','4室1厅','4室2厅','4室3厅','5室0厅','5室1厅','5室2厅','5室3厅','6室2厅','6室3厅','7室2厅','9室9厅'))  
plt.show()  
# note: the length of a = np.array([...]) must match the number of categories in the huxing output (20 here, hence 1-20), and plt.yticks must list the same number of labels
# 1室0厅 means 1 bedroom and 0 living rooms, 1室1厅 means 1 bedroom and 1 living room, and so on


# split the floor-area string again on the character 平 (e.g. "109.5平米" -> "109.5", "米")
mianji_num_split = pd.DataFrame((x.split('平') for x in house.mianji), index=house.index, columns=['mianji_num','mi'])
# merge the split area columns back into the original table
house = pd.merge(house, mianji_num_split, right_index=True, left_index=True)
# strip whitespace around the mianji_num field
house['mianji_num'] = house['mianji_num'].map(str.strip)


# convert mianji_num to float
house['mianji_num'] = house['mianji_num'].astype(float)
# check the range of floor areas
print(house['mianji_num'].min(), house['mianji_num'].max())
  

# bucket the floor areas
bins = [0, 50, 100, 150, 200, 250, 300, 350]
group_mianji = ['less than 50', '50-100', '100-150', '150-200','200-250','250-300','300-350']
house['group_mianji'] = pd.cut(house['mianji_num'], bins, labels=group_mianji)
# count listings per area bucket
group_mianji = house.groupby('group_mianji')['group_mianji'].agg(len)
  

# bar chart of the floor-area distribution (run this block on its own, see the note above)
plt.rc('font', family='STXihei', size=15)  
a=np.array([1,2,3,4,5,6,7])  
plt.barh([1,2,3,4,5,6,7],group_mianji,color='#052B6C',alpha=0.8,align='center',edgecolor='white')  
plt.ylabel('mianji group')  
plt.xlabel('number')  
plt.title('Housing area of distribution')  
plt.legend(['number'], loc='upper right')  
plt.grid(color='#95a5a6',linestyle='--', linewidth=1,axis='y',alpha=0.4)  
plt.yticks(a,('less 50', '50-100', '100-150', '150-200','200-250','250-300','300-350'))  
plt.show()  
  

# split the follower count again on the character 人 (e.g. "103人关注" -> "103", "关注")
guanzhu_num_split = pd.DataFrame((x.split('人') for x in house.guanzhu), index=house.index, columns=['guanzhu_num','ren'])
# merge the split follower data back into the original table
house = pd.merge(house, guanzhu_num_split, right_index=True, left_index=True)
# strip whitespace around the follower-count field
house['guanzhu_num'] = house['guanzhu_num'].map(str.strip)
# convert the follower count and total price to float
house[['guanzhu_num','total_price']] = house[['guanzhu_num','total_price']].astype(float)
# check the range of follower counts
print(house['guanzhu_num'].min(), house['guanzhu_num'].max())
  

# bucket the follower counts; the bins here should also be chosen according to the min() and max() printed above
bins = [0, 20, 50, 80, 200, 500]  
group_guanzhu = ['小于20', '20-50', '50-80', '80-200','200-500']  
house['group_guanzhu'] = pd.cut(house['guanzhu_num'], bins, labels=group_guanzhu)  
group_guanzhu=house.groupby('group_guanzhu')['group_guanzhu'].agg(len)  
  

# bar chart of the follower-count distribution (run this block on its own, see the note above)
plt.rc('font', family='STXihei', size=15)  
a=np.array([1,2,3,4,5])  
plt.barh([1,2,3,4,5],group_guanzhu,color='#052B6C',alpha=0.8,align='center',edgecolor='white')  
plt.ylabel('Interest groups')  
plt.xlabel('Number')  
plt.xlim(0,1000)  
plt.title('Housing attention distribution')  
plt.legend(['Number'], loc='upper right')  
plt.grid(color='#95a5a6',linestyle='--', linewidth=1,axis='y',alpha=0.4)  
plt.yticks(a,('less 20', '20-50', '50-80', '80-200', '200-500'))  
plt.show()


# split the unit price (e.g. "单价30000元/平米") on 元 and then on 价 to isolate the number
house = pd.DataFrame((x.split('元') for x in house.mean_price), index=df.index, columns=['mean_price','pingmi'])
house = pd.DataFrame((x.split('价') for x in house.mean_price), index=df.index, columns=['danjia','mean_price'])
house[['mean_price']] = house[['mean_price']].astype(int)
print(house['mean_price'].min(), house['mean_price'].max())
plt.plot(house['mean_price'])
plt.show()
Distribution of house types

Distribution of floor areas

Distribution of viewings

Distribution of unit prices

Scraping 100 pages of Lianjia second-hand listings, 2,970 records in total:

import pymysql
import requests
from bs4 import BeautifulSoup
import time


# %d is the numeric placeholder for the page number
baseUrl = "http://gz.lianjia.com/ershoufang/pg%d/"   
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}

def get_books(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url,headers=headers)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ul", "sellListContent").find_all("li")  # get all listing entries on the page
    for i in items:
        books = {}  # temporary dict for one listing's data
        books["title"] = i.find("div","title").a.string  # listing title, e.g. 雅逸庭3房出售 单边位 带主套 免交个税
        books["link"] = i.find("div","title").a.get('href')  # e.g. http://gz.lianjia.com/ershoufang/GZ0002556701.html
        books["address_xiaoqu"] = i.find("div","houseInfo").a.string  # estate name, e.g. 广州雅居乐花园雅逸庭
        books["address_info"] = i.find("div","houseInfo").a.next_sibling  # e.g. | 3室2厅 | 109.5平米 | 东北 | 精装 | 无电梯
        books["flood"] = i.find("div","flood").find("div","positionInfo").span.next_sibling  # floor/year, e.g. 中楼层(共6层)2010年建塔楼
        books["area"] = i.find("div","flood").find("div","positionInfo").a.string  # district, e.g. 华南
        books["total_price"] = i.find("div","totalPrice").find("span").text  # total price
        books["mean_price"] = i.find("div", "unitPrice").find("span").text  # price per square meter
        books["followInfo"] = i.find("div","followInfo").span.next_sibling  # e.g. 103人关注 / 共42次带看 / 29天以前发布
        lists.append(books)  # append to the result list
        time.sleep(3.5)
    return lists

if __name__ == "__main__":
    # connect to the database; charset must be specified or inserts may fail
    db = pymysql.connect(host="localhost",user="root",password="root",db="new_schema",charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS douban_books")  # drop the table if it already exists (the table name is reused here for the Lianjia data)
    # SQL statement to create the table
    createTab = """CREATE TABLE douban_books(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(100) NOT NULL,
        link VARCHAR(100) NOT NULL,
        address_xiaoqu VARCHAR(100) NOT NULL,
        address_info VARCHAR(100) NOT NULL,
        flood VARCHAR(100) NOT NULL,
        area VARCHAR(300) NOT NULL,
        total_price VARCHAR(50) NOT NULL,
        mean_price VARCHAR(100) NOT NULL,
        followInfo VARCHAR(200) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(1,100,1):
        lists = get_books(start)  # fetch and parse one page of data
        for i in lists:
            # SQL insert statement; %s is the placeholder for string values
            sql = "INSERT INTO `douban_books`(`title`,`link`,`address_xiaoqu`,`address_info`,`flood`,`area`,`total_price`,`mean_price`,`followInfo`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["title"], i["link"], i["address_xiaoqu"], i["address_info"], i["flood"], i["area"], i["total_price"], i["mean_price"], i["followInfo"]))
                db.commit()
                print(i["title"] + " inserted successfully")
            except:
                db.rollback()
        time.sleep(3.5)
    db.close()