getInfo.py文件
import requests
import re
import os,shutil
from xlwt import Workbook
def getHTMLText(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request is sent with a fixed cookie and user-agent header so that it
    looks like it comes from a logged-in browser session.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or an empty string when the request fails (connection
        problem, timeout, invalid URL or a non-2xx status code).
    """
    # Pretend to be a browser.
    # NOTE(review): this is a hard-coded account session cookie; presumably it
    # expires and should be loaded from configuration instead of source code.
    kv = {'cookie':'cookie2=14df95ca1f48116a5a34610507b02333; t=d15ae95e1060852ad4243b96b295a7dc; _tb_token_=e75160e73b7e5; cna=M688FuCsY2wCAXDgRUH/hKTk; v=0; unb=3352836622; uc3=lg2=VT5L2FSpMGV7TQ%3D%3D&nk2=AniT9PpU6lsw%2BH60PN%2F7%2FXaT&vt3=F8dByuchWQ9bb7TaCU4%3D&id2=UNN4BKnQgACo7Q%3D%3D; csg=59443507; lgc=aichitudoudemao233; cookie17=UNN4BKnQgACo7Q%3D%3D; dnk=aichitudoudemao233; skt=3cb4f2957552001d; existShop=MTU3MjE5MjU4Ng%3D%3D; uc4=nk4=0%40AJNGw06trBMedW6r%2FQlFOOjBhgw85hpyqLnNC3I%3D&id4=0%40UgQwEYjMutDazneilIoLYcO2uOaz; tracknick=aichitudoudemao233; _cc_=UIHiLt3xSw%3D%3D; tg=0; _l_g_=Ug%3D%3D; sg=327; _nk_=aichitudoudemao233; cookie1=BxVXTygrONJyVN23g9iTU5Oz3K6wvjcRxGgKJfb0wrI%3D; thw=cn; mt=ci=115_1; enc=75b73ilmqJ9nY0ao0mcj2Mr8brnfEA6GkjQTPgwJgzKzc5zEWHGqBD1BhXlF7CLv63SmsH7llCpIgjAaHWWYSg%3D%3D; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; hng=CN%7Czh-CN%7CCNY%7C156; JSESSIONID=586CDB7DE478B86E36E2E2E19E99C986; uc1=cookie14=UoTbnxk2wn3EzA%3D%3D&cookie15=WqG3DMC9VAQiUQ%3D%3D; l=dBOA9jvVqGs3iHG2BOCalurza779SIRYBuPzaNbMi_5pc68sFq7OkaazPFJ6DjWfToYB4iuRp4J9-etbi-y06Pt-g3fPaxDc.; isg=BO7uNkXtP2w320sIiRQGkRYvP0Kw77LpGyUd6Bi3X_Gs-45VgH7S-ZV5tieyeqoB',
          'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
    try:
        # Download the page source.
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Only request failures are treated as "no page"; anything else
        # (KeyboardInterrupt, programming errors) is allowed to surface.
        print("请求失败: %s" % exc)
        return ''
    r.encoding = 'utf-8'
    return r.text
# Value patterns used between the quotes of a `"key":"value"` pair.
_NUMBER_VALUE = r'[\d.]*'
# A quoted string body: any run of ordinary characters or backslash escapes,
# so an escaped quote (\") inside a title does not end the match early.
_STRING_VALUE = r'(?:[^"\\\n]|\\.)*'


def _field_values(html, key, value_pattern):
    """Return the decoded value of every `"key":"value"` pair found in html."""
    import json  # local import: json is not among this file's imports

    pattern = r'"%s":"(%s)"' % (re.escape(key), value_pattern)
    decoded = []
    for raw in re.findall(pattern, html):
        try:
            # The page embeds JSON, so decode the literal as JSON instead of
            # eval()-ing text that comes from the network.
            decoded.append(json.loads('"%s"' % raw))
        except ValueError:
            # Not a valid JSON string body: keep the raw text rather than
            # losing the row.
            decoded.append(raw)
    return decoded


def parsePage(n, html, worksheet):
    """Extract the product fields of one result page and write them as rows.

    Each product becomes one row at index ``n * 44 + i + 1`` (row 0 is left
    for the header) with the columns: sequence number, title, price, category
    code, shipping location, shop name, comment count, sales text.

    Values are taken from capture groups, so a colon or an escaped quote
    inside a value (for example in a title) no longer truncates it.  When the
    fields occur a different number of times, only the leading records for
    which every field is present are written.

    Args:
        n: Zero-based page index; determines the row offset.
        html: Page source to scan.  An empty value writes nothing.
        worksheet: Object providing ``write(row, col, value)``.

    Returns:
        None.  Failures are reported on stdout and never raised, so one bad
        page does not abort the caller's loop.
    """
    rows_per_page = 44
    try:
        if not html:
            return
        prices = _field_values(html, 'view_price', _NUMBER_VALUE)        # price
        titles = _field_values(html, 'raw_title', _STRING_VALUE)         # product name
        categories = _field_values(html, 'category', _NUMBER_VALUE)      # category code
        locations = _field_values(html, 'item_loc', _STRING_VALUE)       # shipping location
        comment_counts = _field_values(html, 'comment_count', _STRING_VALUE)
        nicks = _field_values(html, 'nick', _STRING_VALUE)               # shop name
        sales_texts = _field_values(html, 'view_sales', _STRING_VALUE)   # payment info

        fields = (titles, prices, categories, locations, nicks, comment_counts, sales_texts)
        if len({len(values) for values in fields}) > 1:
            print("页面 %d 各字段数量不一致, 仅写入完整的记录" % n)

        # zip() stops at the shortest list, i.e. at the last complete record.
        for i, (title, price, category, location, nick, comment_count, sales) in enumerate(zip(*fields)):
            row = n * rows_per_page + i + 1
            print(row, price, title, category, location, nick, comment_count, sales)
            record = (row, title, price, category, location, nick, comment_count, sales)
            for col, value in enumerate(record):
                worksheet.write(row, col, value)
    except Exception as exc:
        # Best effort per page: report the reason instead of printing a blank line.
        print("页面 %d 解析失败: %s" % (n, exc))
def _unused_xls_path(base):
    """Return base + '.xls', or the first free base + '(k).xls' for k = 1, 2, ..."""
    candidate = base + ".xls"
    k = 0
    while os.path.exists(candidate):
        k += 1
        candidate = base + "(" + str(k) + ").xls"
    return candidate


def getGoodsInfo(goods_name,str_pages,sort,folderpath):
    """Scrape search-result pages for a keyword and save them to an .xls file.

    Args:
        goods_name: Product name or keyword to search for; also used in the
            output file name and the sheet name.
        str_pages: Number of result pages to fetch, as text.
        sort: Value of the ``sort`` query parameter.
        folderpath: Directory in which the workbook is created.

    Returns:
        A message naming the file the results were stored in.

    Raises:
        ValueError: If ``str_pages`` cannot be converted to an integer.
    """
    from urllib.parse import quote  # local import: not among this file's imports

    pages = int(str_pages)
    page_size = 44  # the '&s=' offset advances by this much per page

    # Percent-encode the keyword so characters such as '&' or '#' cannot
    # break the query string.
    s_url = 'https://s.taobao.com/search?q=' + quote(goods_name, safe='') + '&sort=' + sort

    # Join with the platform's separator; a literal backslash only forms a
    # path on Windows.
    base = os.path.join(folderpath, goods_name + "信息爬取")
    print(base)
    # Never overwrite an earlier export: pick the first unused file name.
    filename = _unused_xls_path(base)

    workbook = Workbook(encoding='utf-8')
    # NOTE(review): xlwt presumably rejects sheet names that are too long or
    # contain special characters - confirm for long keywords.
    worksheet = workbook.add_sheet(goods_name+"商品信息表", cell_overwrite_ok=True)

    # Header row; the data rows written by parsePage start at row 1.
    headings = ("序号", "商品名称", "价格", "商品类别码",
                "商品发货地", "店铺名称", "评论数目", "付款人数")
    for col, heading in enumerate(headings):
        worksheet.write(0, col, heading)

    # Walk through the result pages.
    for page in range(pages):
        try:
            html = getHTMLText(s_url + '&s=' + str(page_size * page))
            print(page)
            parsePage(page, html, worksheet)
        except Exception as exc:
            # Skip a failing page but say why, then carry on with the next one.
            print("第 %d 页抓取失败: %s" % (page, exc))
            continue

    workbook.save(filename)
    print("搜索结果已经存入文件" + filename + "中")
    str_info = "搜索结果已经存入文件" + filename + "中\n"
    return str_info
show.py文件
from tkinter import *
import time
import getInfo
from tkinter import filedialog
def getFolder():
    """Ask the user for a download folder and remember it in ``str_folder``.

    When the dialog is dismissed without a choice the previously selected
    folder is kept instead of being replaced by an empty value.
    """
    folderpath = filedialog.askdirectory()  # folder picked in the dialog
    print(folderpath)
    # A cancelled dialog yields an empty (falsy) result - see the Tk
    # chooseDirectory documentation; ignore it so the old path survives.
    if folderpath:
        str_folder.set(folderpath)
def gettime():
    """Show the current time in ``lb_time`` and re-arm itself to run again."""
    # Format the current local time as HH:MM:SS and put it on the label.
    lb_time.configure(text=time.strftime("%H:%M:%S"))
    # Schedule the next refresh in 1000 ms.
    root.after(1000, gettime)
def clearInp():
    """Clear both input fields and the message box."""
    # Empty the keyword and page-count entries.
    for entry in (inp_keys, inp_pages):
        entry.delete(0, END)
    # Empty the text box.
    txt_info.delete('0.0', 'end')
def download():
    """Run a scrape with the values entered in the form and report the result.

    Reads the keyword, the page count, the selected sort order and the target
    folder, passes them to ``getInfo.getGoodsInfo`` and appends the returned
    message to the text box.  A page count that is not an integer is reported
    in the text box instead of raising inside the Tk callback.
    """
    keys = inp_keys.get()
    pages = inp_pages.get()
    try:
        int(pages)
    except ValueError:
        # getGoodsInfo converts the page count with int(); catch bad input
        # here so the user gets feedback in the window.
        txt_info.insert(END, "请输入有效的页数（整数）\n")
        return
    # Radio button value -> 'sort' query parameter.
    dic = {0: 'default', 1: 'sale-desc', 2: 'credit-desc'}
    sort_method = dic.get(var.get())
    folderpath = str_folder.get()
    # NOTE(review): this call runs in the Tk callback, so the window is
    # presumably unresponsive until the scrape finishes.
    str_info = getInfo.getGoodsInfo(keys, pages,sort_method,folderpath)
    txt_info.insert(END,str_info)
def _placed(widget, **geometry):
    """Position *widget* with place(**geometry) and return the widget."""
    widget.place(**geometry)
    return widget


# Main window.
root = Tk()
root.title('获得淘宝数据信息')
root.geometry('600x240')

# Greeting line.
lb_welcome = _placed(Label(root, text='欢迎使用!'),
                     relx=0.1, rely=0.1, relwidth=0.8, relheight=0.1)

# Sort order: the radio buttons store 0/1/2 in `var`, which download() maps
# to the sort key.
lb_keys = _placed(Label(root, text='请选择商品排序方式:'),
                  relx=0.1, rely=0.2, relwidth=0.4, relheight=0.1)
sort_method = 'default'
var = IntVar()
rd1 = _placed(Radiobutton(root, text="综合排序", variable=var, value=0),
              relx=0.4, rely=0.2, relwidth=0.2, relheight=0.1)
rd2 = _placed(Radiobutton(root, text="销量排序", variable=var, value=1),
              relx=0.6, rely=0.2, relwidth=0.2, relheight=0.1)
rd3 = _placed(Radiobutton(root, text="信用排序", variable=var, value=2),
              relx=0.8, rely=0.2, relwidth=0.2, relheight=0.1)

# Keyword input (the name lb_keys is rebound to this second label).
lb_keys = _placed(Label(root, text='请输入商品名称或关键词:'),
                  relx=0.1, rely=0.3, relwidth=0.6, relheight=0.1)
inp_keys = _placed(Entry(root),
                   relx=0.6, rely=0.3, relwidth=0.2, relheight=0.1)

# Page-count input.
lb_pages = _placed(Label(root, text='请输入获取页数:'),
                   relx=0.1, rely=0.4, relwidth=0.5, relheight=0.1)
inp_pages = _placed(Entry(root),
                    relx=0.6, rely=0.4, relwidth=0.2, relheight=0.1)

# Clock label: packed rather than placed; gettime() fills it and keeps
# rescheduling itself.
lb_time = Label(root, text='', fg='black', font=("黑体", 12))
lb_time.pack()
gettime()

# Download folder: a button opens the chooser, a label shows the current path.
lb_askfolder = _placed(Label(root, text='请选择下载路径:'),
                       relx=0.1, rely=0.5, relwidth=0.5, relheight=0.1)
btn_folder = _placed(Button(root, text='下载路径', command=getFolder),
                     relx=0.5, rely=0.5, relwidth=0.2, relheight=0.1)
str_folder = StringVar()
str_folder.set('/Users/yangjia/PycharmProjects/test01')
lb_folder = _placed(Label(root, textvariable=str_folder),
                    relx=0.5, rely=0.6, relheight=0.1)

# Action buttons.
btn1 = _placed(Button(root, text='清空', command=clearInp),
               relx=0.1, rely=0.7, relwidth=0.3, relheight=0.1)
btn2 = _placed(Button(root, text='下载', command=download),
               relx=0.6, rely=0.7, relwidth=0.3, relheight=0.1)

# Message area for the result of a download.
txt_info = _placed(Text(root), rely=0.8, relheight=0.2)

root.mainloop()