目标:
爬取 http://youjia.chemcp.com/ 国内油价数据,并存入数据库,存入数据库的表名以 oil+当天日期命名。
分析过程
1.chrome浏览器输入网址,查看网页的源代码
2.观察所需爬取的数据在源代码中的位置,发现所需数据被包围在table内
以<table width="100%" border="0" cellpadding="4" cellspacing="1" bgcolor="#B6CCE4">开头
以</table>结尾
3.在源代码中搜索<table width="100%" border="0" cellpadding="4" cellspacing="1" bgcolor="#B6CCE4">,发现只有一处匹配,符合爬取条件
4.获取到table内的数据后,再获取table内每个<tr></tr>内的数据
5.通过循环获取<tr></tr>内每个<td></td>内的数据,保存为一个列表
6.最后存入数据库即可
方法说明:
common.mysql_common.py:使用Python操作MySQL数据库的方法封装;
common.table_name.py:命名数据库表名;
push_data_to_mysql.py:获取网站数据,存入数据库
全部代码
common.mysql_common.py
# -*- coding: utf-8 -*-
from pymysql import *
import pymysql
class Mysql:
    """Small convenience wrapper around a PyMySQL connection.

    Every call to :meth:`execute` opens a new connection, runs one SQL
    statement, commits, and closes the connection again.
    """

    def __init__(self, host, port, user, password, db, charset='utf8'):
        """Remember the connection settings; no connection is opened here.

        Args:
            host: MySQL server host name.
            port: MySQL server port.
            user: Login user name.
            password: Login password.
            db: Name of the database to select.
            charset: Connection character set (defaults to ``'utf8'``).
        """
        self.host = host
        self.port = port
        self.db = db
        self.user = user
        self.password = password
        self.charset = charset
        # None while disconnected, so closesql() is always safe to call.
        self.conn = None
        self.cursor = None

    def connectsql(self):
        """Open a connection and create a ``DictCursor`` on it."""
        # `password` / `database` are the current keyword names; the older
        # `passwd` / `db` spellings are deprecated aliases in PyMySQL.
        self.conn = pymysql.connect(host=self.host,
                                    port=self.port,
                                    user=self.user,
                                    password=self.password,
                                    database=self.db,
                                    charset=self.charset)
        self.cursor = self.conn.cursor(pymysql.cursors.DictCursor)

    def closesql(self):
        """Close the cursor and the connection, if any are open.

        Calling this when nothing is open is a no-op.
        """
        cursor, self.cursor = self.cursor, None
        conn, self.conn = self.conn, None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            # Close the connection even if closing the cursor failed.
            if conn is not None:
                conn.close()

    def execute(self, sql):
        """Run one SQL statement, print the fetched rows, and commit.

        Errors are printed rather than raised (best-effort behaviour), but
        the connection is always released, including when the statement
        fails.

        Args:
            sql: The complete SQL statement to execute.
        """
        try:
            try:
                self.connectsql()
                self.cursor.execute(sql)
                data = self.cursor.fetchall()
                print(data)
                self.conn.commit()
            finally:
                # Runs on success and on failure so no connection leaks.
                self.closesql()
        except Exception as e:
            print(e)
common.table_name.py
# -*- coding: utf-8 -*-
import time
import re
# 表名称
def table_name():
    """Return today's table name: ``"oil"`` followed by the local date as YYYYMMDD."""
    today = time.strftime("%Y-%m-%d", time.localtime())
    year, month, day = today.split("-")
    return "oil" + year + month + day
push_data_to_mysql.py
get_html_text:获取网站的文本数据
parse_data:对html数据进行解析,获取到所需数据存入一个列表并返回
create:mysql创建一个表格,用来存入数据(表格名为 oil+当天年月日)
insert:将数据插入创建的表格
# -*- coding: utf-8 -*-
import ast
import re

import requests

from common.mysql_common import Mysql
from common.table_name import table_name
class PushOilDataToMysql:
    """Scrape the fuel-price table from youjia.chemcp.com and store it in MySQL.

    Constructing an instance downloads and parses the page immediately;
    :meth:`create` and :meth:`insert` then write the parsed rows to a table
    named by ``table_name()``.
    """

    def __init__(self):
        """Download the page, parse it, and prepare the database helper."""
        self.html = self.get_html_text()
        self.oil_data = self.parse_data()
        self.name = table_name()
        # NOTE(review): credentials are hard-coded; consider loading them
        # from configuration or the environment.
        self.do_mysql = Mysql('localhost', 3306, 'root', '123456', 'myoildata')

    @staticmethod
    def get_html_text():
        """Return the page HTML, or an empty string if the request fails."""
        url = 'http://youjia.chemcp.com/'
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException as exc:
            # Best-effort: report the failure and let parsing see no data.
            print(exc)
            return ""

    @staticmethod
    def _parse_price(text):
        """Convert one price cell to an int or float without executing it.

        Raises:
            ValueError: If the cell is not a plain numeric literal.
            SyntaxError: If the cell is not a valid Python literal at all.
        """
        # The text comes from a remote page, so it must never be passed to
        # eval(); literal_eval only accepts literals and cannot run code.
        value = ast.literal_eval(text.strip())
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise ValueError("not a numeric price: %r" % (text,))
        return value

    @staticmethod
    def _parse_row(row_html):
        """Turn the inner HTML of one ``<tr>`` into a 7-field data tuple.

        Raises:
            IndexError: If the row has fewer cells than expected.
            ValueError, SyntaxError: If a price cell is not numeric.
        """
        rule_city_detail = '<td bgcolor="#FFFFFF">(.*?)</td>'
        cells = re.findall(rule_city_detail, row_html, re.S)
        # The area cell wraps its text in one tag; keep the text between the
        # first '>' and the following '<'.
        area = cells[0].split('>')[1].split("<")[0]
        update_time = cells[6]
        oil89, oil92, oil95, oil98, oil0 = (
            PushOilDataToMysql._parse_price(cell) for cell in cells[1:6]
        )
        return (area, oil89, oil92, oil95, oil98, oil0, update_time)

    def parse_data(self):
        """Extract one tuple per region from ``self.html``.

        Returns:
            A list of ``(area, oil89, oil92, oil95, oil98, oil0, update_time)``
            tuples. The list is empty when the price table is not found. A
            row that cannot be parsed is reported and skipped, so it does not
            discard the rows that follow it.
        """
        # re.S is required because "." does not match line breaks by default
        # and the table spans many lines.
        rule_table = '<table width="100%" border="0" cellpadding="4" cellspacing="1" bgcolor="#B6CCE4">\r\n(.*?)</table>'
        tables = re.findall(rule_table, self.html, re.S)
        if not tables:
            print("获取油价数据失败")
            return []

        rule_tr = '<tr>\r\n(.*?)</tr>'
        rows = re.findall(rule_tr, tables[0], re.S)

        oil_data = []
        # The first matched row is skipped, as in the original loop that
        # started at index 1 (presumably the header row).
        for row_html in rows[1:]:
            try:
                oil_data.append(self._parse_row(row_html))
            except (IndexError, ValueError, SyntaxError):
                print("获取油价数据失败")
        return oil_data

    def create(self):
        """Create the table that will hold the parsed price rows."""
        sql_create_table = 'create table `%s`(id int auto_increment,`area` varchar(20),`oil89` varchar(20),`oil92` varchar(20),`oil95` varchar(20),`oil98` varchar(20), `oil0` varchar(20), `update_time` varchar(255), primary key(id))' % self.name
        self.do_mysql.execute(sql_create_table)

    def insert(self):
        """Insert every parsed row into the table, one statement per row."""
        # NOTE(review): the statement is assembled from scraped text via the
        # tuple's repr. Parameterized queries would be safer but need the
        # database helper's execute() to accept parameters.
        for row in self.oil_data:
            sql_insert = 'insert into %s(area, oil89, oil92, oil95, oil98, oil0, update_time) value%s' % (self.name, row)
            self.do_mysql.execute(sql_insert)
if __name__ == '__main__':
    # Script entry point: scrape the page, create today's table, store the rows.
    pusher = PushOilDataToMysql()
    pusher.create()
    pusher.insert()