使用 Python 的 selenium 库配合多线程抓取 CET4 成绩

没有requests快,但好写

# -*- coding: utf-8 -*-
#使用selenium的webdriver的方法
import csv
import os
import time
import re
import threading
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
## Thread lock (a threading.Lock, not a process lock): write_in acquires it
## around the shared name/password lists and around the CSV writers.
lock=threading.Lock()
############ Count the rows of a CSV file (not called anywhere else in this script) ############
def raw_num(filepath="2017.csv"):
    """Return the number of rows in the CSV file at ``filepath``.

    Args:
        filepath: Path of the CSV file to count. Defaults to ``"2017.csv"``,
            the file that was previously always read regardless of the
            argument passed in.

    Returns:
        The number of rows ``csv.reader`` yields for the file (0 if the file
        is empty).

    Raises:
        OSError: If the file cannot be opened.
    """
    # errors="ignore" silently drops bytes that cannot be decoded with the
    # default encoding, so a stray bad byte does not abort the count.
    # The with-block closes the file even if reading fails part-way.
    with open(filepath, "r", errors="ignore") as cs:
        return sum(1 for _ in csv.reader(cs))
############ Submit the query form and read the score back ############
def post_get(html,driver,name,password):
    """Look up one candidate through the browser and return the score text.

    Loads ``html`` in ``driver``, types ``password`` into the element with
    id ``"id"`` and ``name`` into the element with id ``"name"``, clicks the
    element with id ``"btn"``, then reads the elements whose class is
    ``"liright"``.

    Args:
        html: URL of the query page.
        driver: Browser driver object exposing ``get``,
            ``find_element_by_id`` and ``find_elements_by_class_name``.
        name: Candidate name typed into the ``name`` field.
        password: Value typed into the ``id`` field (callers print it as the
            admission ticket number).

    Returns:
        The ``text`` of the second ``liright`` element, or ``""`` when fewer
        than two such elements are found.
    """
    # NOTE(review): newer Selenium releases drop the find_element_by_*
    # helpers -- confirm the installed version still provides them.
    def fill(field_id, value):
        # The field is looked up once per action, i.e. twice per field.
        driver.find_element_by_id(field_id).clear()
        driver.find_element_by_id(field_id).send_keys(value)

    driver.get(html)
    fill("id", password)
    fill("name", name)
    driver.find_element_by_id("btn").click()

    # Only the element at index 1 is of interest; the rest are ignored.
    cells = driver.find_elements_by_class_name("liright")
    return cells[1].text if len(cells) > 1 else ""
#################### Load candidate data from the CSV file ############################
def read_out():
    """Append every candidate in ``2017.csv`` to the module-level work lists.

    For each row, column index 6 is appended to the module-level ``name``
    list and column index 5 to the module-level ``password`` list, so the
    two lists stay index-aligned.

    Rows with fewer than seven columns are skipped. ``csv.reader`` yields an
    empty list for a blank line, which previously raised ``IndexError`` and
    aborted the load with the file still open.

    Raises:
        OSError: If ``2017.csv`` cannot be opened.
    """
    # The lists are only mutated in place (never rebound), so no ``global``
    # declaration is needed. errors="ignore" drops undecodable bytes.
    with open("2017.csv", "r", errors="ignore") as cs:
        for line in csv.reader(cs):
            if len(line) < 7:
                continue
            name.append(line[6])
            password.append(line[5])
######## Worker: scrape each candidate's score and write it to the CSV files #################
def write_in(driver,i):
    """Worker loop run by each thread: pop candidates, scrape, record results.

    Repeatedly pops one entry from the module-level ``name`` and
    ``password`` lists, queries it with ``post_get`` and writes
    ``[name, password, grade]`` through the module-level ``writer``. A
    candidate whose query keeps raising is written as ``[name, password]``
    through the module-level ``error`` writer instead. All access to the
    shared lists and writers happens under the module-level ``lock``.

    Args:
        driver: Browser driver passed to ``post_get``; it is quit when the
            worker finishes, including when it exits through an exception.
        i: Thread index, used only in the printed progress messages.
    """
    # Attempts per candidate before giving up. The previous counter was
    # shared by every record a thread handled and gave up on the 6th failure.
    max_attempts = 6
    html = "http://cet.99sushe.com"
    try:
        while True:
            # Test for emptiness and pop under the same lock; otherwise
            # another thread can drain the list between the check and the pop.
            with lock:
                if not password:
                    break
                n = name.pop()
                p = password.pop()

            for _ in range(max_attempts):
                try:
                    grade = post_get(html, driver, n, p)
                    break
                except Exception as e:
                    # Worker boundary: any browser failure is logged and the
                    # same record is tried again.
                    print(e)
            else:
                # Every attempt failed: record the candidate once in
                # error.csv and move on, without writing a stale grade.
                print("\n/******write in error.csv")
                print("第%d个线程:"%i)
                print("准考证号: "+p+"  姓名: "+n+"******/\n")
                with lock:
                    error.writerow([n, p])
                continue

            print("第%d个线程:"%i)
            print("准考证号: "+p+"  姓名: "+n)
            print(grade)
            try:
                # ``with`` releases the lock even when writerow raises.
                with lock:
                    writer.writerow([n, p, grade])
            except (csv.Error, OSError, ValueError) as e:
                # Re-queuing here would retry forever on a persistent write
                # failure, so the record goes to error.csv instead.
                print(">>>>>>>error")
                print(e)
                with lock:
                    error.writerow([n, p])
    finally:
        driver.quit()
########### Main script ####################
# Wall-clock start time; time.clock() was removed in Python 3.8, so the
# elapsed time is measured with time.perf_counter() instead.
start_time=time.perf_counter()
## Shared work lists, used as stacks by the worker threads
threads=[]
password=[]
name=[]
## Load the candidates from 2017.csv into the work lists
read_out()
##############
grade=0
### Number of worker threads (each one gets its own Firefox driver)
thread_num=3
### CSV output files and writers ####
html="http://cet.99sushe.com"
# Opened with newline="" as the csv module requires; otherwise an extra blank
# line appears after every row.
# The with-block closes both files even if a driver fails to start.
with open("error.csv","w",newline="",errors="ignore") as error_file, \
        open("result.csv","w",newline="",errors="ignore") as csv_file:
    writer=csv.writer(csv_file)
    error=csv.writer(error_file)
    # writerow expects a sequence of fields; a bare string would be split
    # into one column per character, so the placeholder is wrapped in a list.
    writer.writerow(["让我们先空一行!"])
    error.writerow(["让我们先空一行!"])
    ########### Multithreading section ###############
    for i in range(thread_num):
        print("第%d个线程入栈:"%i)
        driver=webdriver.Firefox()
        driver.implicitly_wait(10)
        # args must be a tuple: (driver, i). A bare args=(driver) is not a
        # tuple and fails.
        t=threading.Thread(target=write_in,args=(driver,i))
        threads.append(t)
    # Start every worker, then wait for all of them to finish.
    for i,t in enumerate(threads):
        print("start第%d个线程"%i)
        t.start()
    for t in threads:
        t.join()
    #####################################
    print("一共花了:")
    print(time.perf_counter()-start_time)
           
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容