Python【记第一次爬虫】

1.首先导入一些相关模块

import requests

from lxml import etree

from xlrd import open_workbook

from xlutils.copy import copy

import pymysql.cursors

2.获取页面text内容

# Download the page. Without an explicit timeout, requests waits
# indefinitely on an unresponsive server.
page = requests.get(url, timeout=10)

# Stop here on 4xx/5xx responses instead of parsing an error page.
page.raise_for_status()

# Parse the HTML into an element tree that supports .xpath().
# The file imports `etree` from lxml (not `html`), so use etree.HTML;
# the previous `html.fromstring` call raised NameError.
result = etree.HTML(page.text)

3.使用xpath获取标签以及标签内容

# Text content of every <a> element on the page.

tmps = result.xpath("//a/text()")

# href value of every <a> element that has a title attribute.
# NOTE: this rebinds `tmps`, discarding the previous query's result.

tmps = result.xpath("//a[@title]/@href")

# href value of every <a> element whose title attribute contains '末页'
# ("last page"). contains() is a substring match, not an equality test.

pages = result.xpath("//a[contains(@title,'末页')]/@href")

4.读写Excel文件

# Open the existing workbook with xlrd and immediately turn it into a
# writable copy via xlutils.copy.copy.
excel = copy(open_workbook(file_path))

# Take the first sheet of the writable copy and write `content` at the
# given row / column position.
table = excel.get_sheet(0)
table.write(row, cell, content)

# Save the modified copy back to the same path.
excel.save(file_path)

5.录入数据库操作

# Connect to the MySQL database.
# NOTE(review): the credentials are hard-coded here for demonstration;
# load them from configuration or the environment in real code.
connection = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='198876', db='guest',charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)

try:
    # Use the cursor as a context manager so it is closed even if the
    # statement fails.
    with connection.cursor() as cursor:
        # Build and execute the SQL statement.
        # NOTE(review): for values that come from variables, pass them as
        # the second argument of execute() with %s placeholders instead of
        # formatting them into the string.
        sql = "INSERT INTO `users` (`email`, `password`) VALUES ('huzhiheng@itest.info', '123456')"
        cursor.execute(sql)

    # Commit explicitly; the connection is not opened with autocommit.
    connection.commit()
finally:
    # Always release the connection, including when execute/commit raises.
    connection.close()

Python【记第一次爬虫】

推荐阅读更多精彩内容