# Not as fast as requests, but easier to write.
# -*- coding: utf-8 -*-
#使用selenium的webdriver的方法
import csv
import os
import time
import re
import threading
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Thread lock (a threading.Lock, not a process lock) used by write_in to
# serialize access to the shared name/password stacks and the CSV writers.
lock=threading.Lock()
############获取csv文件中的行数(暂时不用管它....因为没上)############
def raw_num(filepath):
    """Return the number of rows in the CSV file at ``filepath``.

    Rows are counted as parsed by ``csv.reader``, so a quoted field that
    spans several physical lines still counts as a single row. Bytes that
    cannot be decoded are ignored instead of raising.

    Args:
        filepath: Path of the CSV file to count.

    Returns:
        The number of rows in the file; 0 for an empty file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Honor the argument (the path used to be hard-coded) and let ``with``
    # close the handle even when reading fails part-way through.
    # newline="" is the mode the csv module expects for its input files.
    with open(filepath, "r", newline="", errors="ignore") as cs:
        return sum(1 for _ in csv.reader(cs))
############模拟登陆############
def post_get(html,driver,name,password):
    """Submit the query form once and return the text of the second result cell.

    Args:
        html: URL of the query page to load.
        driver: Selenium WebDriver used to load the page and fill the form.
        name: Value typed into the form element whose id is "name".
        password: Value typed into the form element whose id is "id"
            (write_in passes the admission ticket number here).

    Returns:
        The ``text`` of the second element with class "liright" found after
        clicking the element with id "btn", or "" when fewer than two such
        elements are present.
    """
    driver.get(html)

    # find_element(By.ID, ...) replaces the find_element_by_id helpers, which
    # were removed in Selenium 4.3; this form also works on older releases.
    ticket_box = driver.find_element(By.ID, "id")
    ticket_box.clear()
    ticket_box.send_keys(password)

    name_box = driver.find_element(By.ID, "name")
    name_box.clear()
    name_box.send_keys(name)

    driver.find_element(By.ID, "btn").click()

    # The value of interest is the second "liright" element on the result page.
    cells = driver.find_elements(By.CLASS_NAME, "liright")
    return cells[1].text if len(cells) > 1 else ""
####################从csv文件读数据############################
def read_out():
    """Load the work items from "2017.csv" into the module-level stacks.

    For every usable row, column 6 is appended to the global ``name`` list
    and column 5 to the global ``password`` list, so the two lists stay
    index-aligned. Rows with fewer than seven columns (blank lines, for
    example) are skipped instead of aborting the whole load with IndexError.
    Bytes that cannot be decoded are ignored.

    Raises:
        OSError: If "2017.csv" cannot be opened.
    """
    # ``with`` guarantees the file is closed even if parsing raises.
    with open("2017.csv", "r", errors="ignore") as cs:
        for line in csv.reader(cs):
            if len(line) < 7:
                # Not enough columns to hold both fields; nothing to queue.
                continue
            name.append(line[6])
            password.append(line[5])
########往csv文件写数据(内嵌爬取网页操作)#################
def write_in(driver,i):
    """Worker loop: pop queued records, query each one, and write the result.

    Records are taken from the shared ``name``/``password`` stacks until they
    are empty. Each record is submitted through ``post_get``; on success a
    ``[name, ticket, grade]`` row is written with the global ``writer``.

    When ``post_get`` raises, the record is pushed back for another attempt.
    After six consecutive failures in this thread the current record is
    written to the global ``error`` writer instead of being re-queued, and
    the failure counter starts over. A successful query also resets it.

    Args:
        driver: Selenium WebDriver owned by this worker; it is always quit
            before the function returns, including when an exception escapes.
        i: Worker index, used only in the progress messages.
    """
    html="http://cet.99sushe.com"
    # Consecutive failures tolerated before a record is diverted to error.csv.
    max_failures=6
    failures=0
    try:
        while True:
            # Test for emptiness and pop under the same lock so another
            # worker cannot take the last record in between.
            with lock:
                if not password:
                    break
                n=name.pop()
                p=password.pop()
            try:
                grade=post_get(html,driver,n,p)
                print("第%d个线程:"%i)
                print("准考证号: "+p+" 姓名: "+n)
                print(grade)
            except Exception as e:
                print(e)
                failures+=1
                if failures<max_failures:
                    # Still within the retry budget: return the record to the stack.
                    with lock:
                        password.append(p)
                        name.append(n)
                else:
                    # Budget exhausted: log the record and do NOT re-queue it,
                    # otherwise a permanently failing record would loop forever.
                    failures=0
                    print("\n/******write in error.csv")
                    print("第%d个线程:"%i)
                    print("准考证号: "+p+" 姓名: "+n+"******/\n")
                    with lock:
                        error.writerow([n,p])
                continue
            failures=0
            try:
                # ``with`` releases the lock even if writerow raises, so the
                # handler below can take it again without deadlocking.
                with lock:
                    writer.writerow([n,p,grade])
            except Exception as e:
                print(">>>>>>>error")
                print(e)
                # The row was not stored; queue the record to be fetched again.
                with lock:
                    password.append(p)
                    name.append(n)
    finally:
        # Always close this worker's browser, even on an unexpected error.
        driver.quit()
###########主函数####################
##存储堆栈
# Start of the wall-clock measurement reported at the end of the run.
start_time=time.perf_counter()
# Worker threads plus the two shared stacks that read_out() fills and the
# workers drain.
threads=[]
password=[]
name=[]
# Load the input data into the stacks.
read_out()
grade=0
# Number of worker threads; each one gets its own Firefox instance.
thread_num=3
html="http://cet.99sushe.com"
# newline="" stops the csv module from emitting an extra blank line per row.
# ``with`` closes both output files even if a worker setup step raises.
with open("error.csv","w",newline="",errors="ignore") as error_file, \
        open("result.csv","w",newline="",errors="ignore") as csv_file:
    writer=csv.writer(csv_file)
    error=csv.writer(error_file)
    # NOTE(review): writerow() iterates its argument, so passing a str writes
    # one cell per character rather than a blank row. Kept unchanged so the
    # output files stay the same; confirm what the first row should be.
    writer.writerow("让我们先空一行!")
    error.writerow("让我们先空一行!")
    # Create one browser and one (not yet started) thread per worker.
    for i in range(thread_num):
        print("第%d个线程入栈:"%i)
        driver=webdriver.Firefox()
        driver.implicitly_wait(10)
        # args must be a tuple: (driver, i), not (driver).
        t=threading.Thread(target=write_in,args=(driver,i))
        threads.append(t)
    # Start all workers, then wait for every one of them to finish.
    for i in range(thread_num):
        print("start第%d个线程"%i)
        threads[i].start()
    for t in threads:
        t.join()
    print("一共花了:")
    # time.clock() was removed in Python 3.8; report elapsed wall time instead.
    print(time.perf_counter()-start_time)