爬虫:2. 元素定位

元素定位

requests返回的response内容是HTML文本,我们需要从中把所需的数据提取出来,这就需要元素定位。常用的元素定位方式有XPath和CSS选择器;如果你熟悉jQuery(JavaScript),也可以使用pyquery。
相关的库有lxml,BeautifulSoup(官方已经将BeautifulSoup 4的包名改为bs4了)。相关的教程太多了,这里为了完整性,举一个xpath例子,做个小总结。

例子是抓取美容下所有分类和具体项目的相关信息。

# -*- coding:utf-8 -*-

"""
File Name : 'Spider_soyoung'.py
Description:
Author: 'chengwei'
Date: '2016/4/22' '9:43'
"""
import sys
import requests
import json
import random
import redis
import logging
import pymssql
import copy
import datetime
import time
import json
from lxml import etree
import re

# Python 2 only: re-import ``sys`` to get back ``setdefaultencoding`` (removed
# from the module at interpreter start-up) and make implicit str/unicode
# conversions use UTF-8.  ``reload`` is not a builtin on Python 3, where the
# default encoding is already UTF-8, so the hack is skipped there instead of
# failing with NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

class Spider_plastics(object):
    """Scraper for the project catalogue published at ``self.root_url``.

    ``get_detail_url`` collects one record per project link found on the
    landing page; ``get_detail_info`` visits every link and stores what it
    finds in the ``kanghua`` table through the connection opened in
    ``__init__``.
    """

    # (element id on the landing page, category label stored with every row)
    _CATEGORY_SECTIONS = (
        ('zxmr', u'整形美容'),
        ('pfmr', u'皮肤美容'),
        ('zsmr', u'注射美容'),
        ('jgmr', u'激光美容'),
        ('sssx', u'瘦身美容'),
        ('mfzz', u'毛发种植'),
        ('myjc', u'美牙健齿'),
        ('zymr', u'中医美容'),
        ('sbxf', u'失败修复'),
    )

    # Column order shared by the INSERT statement and the record dicts.
    _ROW_COLUMNS = ('categories', 'location', 'project_classification', 'feature',
                    'apply_to', 'price', 'refresh_cycle', 'attention')

    # Placeholders are bound by the DB driver, so scraped text containing
    # quotes can neither break the statement nor inject SQL.
    _INSERT_SQL = ("INSERT INTO kanghua (categories, location, project_classification, feature, "
                   "apply_to, price, refresh_cycle, attention) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")

    def __init__(self):
        """Set up the user-agent pool, file logging and the database connection."""
        self.user_agents = ['Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
                             'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
                             'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+  (KHTML, like Gecko) Element Browser 5.0',
                             'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
                             'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
                             'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
                             'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
                             'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
                             'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
                             'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36'
                            ]
        self.root_url = 'http://plastics.517mr.com/'
        # Log to "<ClassName>.log" in the working directory, appending.
        self.logfilename = self.__class__.__name__ + '.log'
        logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p',
                        filename=self.logfilename, filemode='a')
        # SQL Server connection used for every INSERT.
        # NOTE(review): credentials are hard-coded; consider loading them from
        # configuration or the environment instead of keeping them in source.
        self.conn = pymssql.connect(host='99.48.58.23', user='sa', password='123456', database='meirong', charset="utf8")
        self.cur = self.conn.cursor(as_dict=True)

    def get_detail_url(self):
        """Fetch the landing page and list every project link found on it.

        Returns:
            A list of dicts with the keys ``categories``, ``location``,
            ``project_classification`` and ``url``.
        """
        headers = {"User-Agent": random.choice(self.user_agents)}

        s = requests.Session()
        try:
            html = s.get(self.root_url, headers=headers)
        finally:
            # The body is already downloaded, so the session can go right away
            # (and is released even when the request raises).
            s.close()
        time.sleep(3)
        selector = etree.HTML(html.text)

        url_list = []
        for section_id, category in self._CATEGORY_SECTIONS:
            blocks = selector.xpath('//*[@id="%s"]//div[starts-with(@class,"xm_list")]' % section_id)
            for element in blocks:
                links = element.xpath('.//a/@href')
                names = element.xpath('string(.)').replace(' ', '').replace('\t', '').strip().split('\n')
                # The first link is skipped and project names are read from
                # text line 3 onwards, paired with the remaining links in order.
                # NOTE(review): presumably link 0 / lines 1-2 belong to the
                # block header -- confirm against the page markup.
                for offset, url in enumerate(links[1:]):
                    name_index = 3 + offset
                    if name_index >= len(names):
                        # Fewer text lines than links: log and move on instead
                        # of aborting the whole crawl with IndexError.
                        logging.warning("link/name mismatch in section %s: %d links, %d text lines",
                                        section_id, len(links), len(names))
                        break
                    url_list.append({
                        'categories': category,
                        'location': names[0],
                        'project_classification': names[name_index],
                        'url': url,
                    })
        return url_list

    def get_detail_info(self):
        """Visit every URL from ``get_detail_url`` and store the scraped rows.

        Pages answering with a non-200 status and pages matching neither known
        layout are logged and skipped; insert failures are logged and the
        crawl continues.
        """
        headers = {"User-Agent": random.choice(self.user_agents)}

        url_list = self.get_detail_url()
        s = requests.Session()
        try:
            for item in url_list:
                self._scrape_detail_page(s, headers, item)
        finally:
            s.close()

    def _scrape_detail_page(self, session, headers, item):
        """Download one detail page and insert whatever rows it contains."""
        res = session.get(item['url'], headers=headers)
        if res.status_code != 200:
            logging.error("%s:%d", item['url'], res.status_code)
            return

        selector = etree.HTML(res.text)

        # First layout: a table with one "diy_tr" row per offer.
        rows = selector.xpath('//*[@id="catelist"]//div[@class = "diy_tr"]')
        if rows:
            for element in rows:
                try:
                    record = self._parse_table_row(element, item)
                except IndexError:
                    # A row without one of the expected cells must not stop
                    # the crawl.
                    logging.error("malformed row skipped: %s", item['url'])
                    continue
                time.sleep(0.6)
                self._insert_row(record, " 第一种布局 INSERT ERROR")
            return

        # Second layout: only a price element is available.
        prices = selector.xpath('//div[@class = "price"]/em')
        if prices:
            logging.info("another css:%s %s %s", item['categories'], item['location'],
                         item['project_classification'])
            record = {
                'categories': item['categories'],
                'location': item['location'],
                'project_classification': item['project_classification'],
                'feature': '',
                'apply_to': '',
                'price': prices[0].xpath('string(.)'),
                'refresh_cycle': '',
                'attention': '',
            }
            time.sleep(0.6)
            self._insert_row(record, "第二种布局 INSERT ERROR")
        else:
            logging.error("error:%s %s %s", item['categories'], item['location'],
                          item['project_classification'])

    def _parse_table_row(self, element, item):
        """Build the record for one first-layout row.

        Raises:
            IndexError: if one of the expected ``span`` cells is missing.
        """
        def cell_text(css_class):
            return element.xpath('./span[@class = "%s"]' % css_class)[0].xpath('string(.)').strip()

        attention_cell = element.xpath('./span[@class = "w6 outer"]')[0]
        return {
            'categories': item['categories'],
            'location': item['location'],
            'project_classification': item['project_classification'],
            # Only the first text line of the feature cell is kept.
            'feature': cell_text('w1 outer').split('\n')[0],
            'apply_to': cell_text('w3 outer'),
            'price': cell_text('w4 outer'),
            'refresh_cycle': cell_text('w5 outer'),
            # Stored as the number of em.x elements inside the cell.
            'attention': len(attention_cell.xpath('.//div[@class = "c6"]/em[@class = "x"]')),
        }

    def _insert_row(self, record, error_message):
        """Insert one record into ``kanghua`` and commit.

        Args:
            record: dict holding a value for every name in ``_ROW_COLUMNS``.
            error_message: text logged (with traceback) when the insert fails.

        Returns:
            True when the row was committed, False when the insert failed.
        """
        params = tuple(record[column] for column in self._ROW_COLUMNS)
        try:
            self.cur.execute(self._INSERT_SQL, params)
            self.conn.commit()
        except Exception:
            # Best effort: keep crawling, but record what actually went wrong.
            logging.exception(error_message)
            return False
        return True



# Script entry point: constructing the spider configures file logging and
# opens the database connection (see __init__); get_detail_info() then runs
# the whole crawl.
# NOTE(review): these statements also run when the module is imported --
# consider an `if __name__ == '__main__':` guard if that is ever needed.
test = Spider_plastics()
test.get_detail_info()


xpath说明:

  1. 基本语法可参考W3CSchool

  2. 获取某个节点下的所有文本可以使用string(.)

element.xpath('string(.)')
  3. 常用的功能函数
    starts-with
//div[starts-with(@id,'res')]

contains和and(.代表当前节点,..表示父节点)

//span[contains(.,'_Test') and contains(.,'KPI')]
  4. Chrome插件XPather,测试xpath的好工具
  5. BeautifulSoup文档
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

  • 在使用selenium webdriver进行元素定位时,通常使用findElement或findElements...
    不勤奋阅读 1,734评论 1 3
  • 使用selenium进行自动化时少不了对元素进行定位,但目前前端大多使用框架vue,angular等,很多元素并没...
    菠了个萝阅读 10,059评论 1 5
  • Spring Cloud为开发人员提供了快速构建分布式系统中一些常见模式的工具(例如配置管理,服务发现,断路器,智...
    卡卡罗2017阅读 136,782评论 19 139
  • 祝:老公元宵节快乐!想你,儿子,想我们的家!
    我就是我hxh阅读 208评论 0 0
  • 我住的地方离公路只有五米不到的距离,每天汽笛声伴随着我,入睡,醒来。有拖拉机的,摩托车的,汽车的。也听见大妈们晨走...
    东条二十阅读 345评论 0 0

友情链接更多精彩内容