有这样一个 Excel 表格,里面列有若干个你想关注的小区,只要在每个小区下面填好类似 http://esf.fangdd.com/suzhou/xiaoqu_64364.html 的页面网址,脚本就会自动爬取当天的价格数据并填回到 Excel 中。
sheet1是房多多(index = 0)
sheet2是链家 (index = 1)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import xlrd
import xlwt
import xlutils
import datetime
from xlutils.copy import copy
import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter start-up, so the module is reloaded to get it back and the
# process-wide default codec for implicit str/unicode conversion is then
# switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
# Sample community pages on fangdd.com; not referenced elsewhere in the
# visible code.
arrs = [
    "http://esf.fangdd.com/suzhou/xiaoqu_64364.html",
    "http://esf.fangdd.com/suzhou/xiaoqu_65115.html",
    "http://esf.fangdd.com/suzhou/xiaoqu_65123.html",
]
# Row index that testXlwt writes to; the script entry point replaces it with
# the row count returned by testXlrd.
currentRow = 0
# Price strings appended by pachong for the sheet currently being processed.
arrr = []
def testXlrd(filename, index):
    """Read row 2 of one worksheet and report how many rows the sheet has.

    Args:
        filename: path of the .xls workbook to open with xlrd.
        index: zero-based index of the worksheet to read.

    Returns:
        A ``(row_values, row_count)`` tuple: the cell values of the row at
        index 2 (the third row) and the sheet's total number of rows.
    """
    sheet = xlrd.open_workbook(filename).sheet_by_index(index)
    third_row = sheet.row_values(2)
    # Echo the row so the values fed to the scraper are visible in the log.
    print(third_row)
    row_count = sheet.nrows
    return third_row, row_count
def testXlwt(filename, arrss, index):
    """Write today's date and the scraped values as one row of a worksheet.

    The target row is the module-level ``currentRow``. Column 0 receives
    today's date formatted as ``YYYYMMDD``; the items of ``arrss`` fill
    columns 1, 2, ... in order. The workbook is then saved back to
    ``filename``.

    Nothing is written when column 0 of the row just above ``currentRow``
    already equals today's date string.

    Args:
        filename: path of the .xls workbook; read with xlrd and overwritten
            on save.
        arrss: values to write, one per column starting at column 1.
        index: zero-based index of the worksheet to update.
    """
    book = xlrd.open_workbook(filename)
    sh = book.sheet_by_index(index)
    # Date stamp stored by the previous run, if any.
    lastdate = sh.cell(currentRow - 1, 0).value
    date = datetime.datetime.now().strftime('%Y%m%d')
    if date == lastdate:
        print("今天已经执行过啦")
        return

    # The xlrd workbook cannot be written to; xlutils.copy produces the
    # writable counterpart that is saved below.
    wsh = copy(book)
    wsh2 = wsh.get_sheet(index)
    wsh2.write(currentRow, 0, date)
    # Write from the arrss argument itself rather than reaching for the
    # global result list, so the function honours what the caller passed in.
    for column, value in enumerate(arrss, 1):
        wsh2.write(currentRow, column, value)
        # Same log format as before: row, the column number after the one
        # just written, and the value.
        print("%s %s %s" % (currentRow, column + 1, value))
    wsh.save(filename)
    print("写入表成功")
# HTML markers used to cut values out of a downloaded page, keyed by
# worksheet index. Each entry is
# ((name_head, name_tail), (price_head, price_tail)).
_PAGE_MARKERS = {
    0: (('<span class="cell__name">', '</span>'),
        ('<span class="average__price--orange">', '</span>')),
    1: (('<span class="title">', '</span>'),
        ('<span class="botline">', '</strong>')),
    2: (('<div class="fixnav_tit" id="xfdsxq_B02_01"><h2 class="tf">',
         '</a></h2><span class="fixnav_num ml15">'),
        ('<span class="prib cn_ff">', '</span>')),
    3: (('<h1 class="hs-name">', '</h1>'),
        ('<span><big>', '</big></span>')),
    4: (('<span class="nameShow">', '</span>'),
        ('<span class="num price">', '</span>')),
}


def _extract_between(cont, head, tail):
    """Return the text between the first ``head`` and the next ``tail``.

    Returns an empty string when either marker is missing, instead of a
    slice computed from ``find``'s -1 result.
    """
    start = cont.find(head)
    if start == -1:
        return ""
    start += len(head)
    end = cont.find(tail, start)
    if end == -1:
        return ""
    return cont[start:end]


def pachong(arrss, index):
    """Scrape the community name and price text from every URL in a row.

    For each non-empty entry of ``arrss`` the page is downloaded, the text
    found between the name markers is printed, and the text found between
    the price markers is printed and appended (decoded from UTF-8) to the
    module-level list ``arrr``. Empty entries are skipped and add nothing
    to ``arrr``. When a marker is not found on the page, an empty string is
    recorded for that URL.

    Args:
        arrss: cell values of the URL row; each item must support
            ``.encode("utf-8")``.
        index: worksheet index (0-4) selecting which marker set to use.

    Raises:
        ValueError: if ``index`` has no marker set defined.
    """
    try:
        (name_head, name_tail), (price_head, price_tail) = _PAGE_MARKERS[index]
    except KeyError:
        raise ValueError("no page markers defined for sheet index %r" % (index,))

    for arr in arrss:
        url = arr.encode("utf-8")
        if not url:
            continue

        up = urllib2.urlopen(url)
        try:
            cont = up.read()
        finally:
            # Release the connection even if reading the body fails.
            up.close()

        print(_extract_between(cont, name_head, name_tail))

        result = _extract_between(cont, price_head, price_tail).strip()
        if index == 1:
            # Drops the first 8 characters of the match; presumably the
            # leading "<strong>" tag, given the "</strong>" tail marker.
            # TODO confirm against a live page.
            result = result[8:]
        print(result + "元/平米")
        print("---------------分割线--------------------")
        arrr.append(result.decode("utf-8"))
if __name__ == '__main__':
    # Worksheet index -> source, as noted by the original author:
    #   0: Fangdd, second-hand housing communities
    #   1: Lianjia, second-hand housing communities
    #   2: Fang.com, new-home communities (currently disabled, so not listed)
    #   3: noted as Fang.com new-home communities as well, same as sheet 2
    #      -- TODO confirm which site this sheet really targets
    #   4: Lianjia, new-home communities
    for sheet_index in (0, 1, 3, 4):
        # Start every sheet with a clean row pointer and result list.
        currentRow = 0
        arrr = []
        rows, currentRow = testXlrd('hehehe.xls', sheet_index)
        pachong(rows, sheet_index)
        testXlwt('hehehe.xls', arrr, sheet_index)