golang采集模板

package main

import (
    "fmt"
    "time"
    "os"
    "io/ioutil"
    "strings"
    "sync"
    "net/http"
    "regexp"
    "html"
    "github.com/360EntSecGroup-Skylar/excelize"
    "bufio"
    "golang.org/x/net/html/charset"
    "golang.org/x/text/transform"
    "io"
)

// Patterns shared by data_clean, compiled once instead of on every call.
var (
    // data_tag_re matches any HTML tag, including tags that span several lines.
    data_tag_re = regexp.MustCompile(`(?s)<(.+?)>`)
    // data_space_re matches a run of two or more whitespace characters.
    data_space_re = regexp.MustCompile(`\s\s\s*`)
)

// data_content_group names the capture group that holds the text between the
// two delimiters. Using a name keeps the lookup correct even when the
// delimiter patterns contain capture groups of their own.
const data_content_group = "data_content"

// data_compile builds the pattern "(?s)" + a + (content) + b and returns it
// together with the index of the content capture group.
// It panics (via regexp.MustCompile) when a or b is not a valid regular
// expression fragment.
func data_compile(a, b string) (*regexp.Regexp, int) {
    reg := regexp.MustCompile("(?s)" + a + "(?P<" + data_content_group + ">.+?)" + b)
    for i, name := range reg.SubexpNames() {
        if name == data_content_group {
            return reg, i
        }
    }
    return reg, 1 // not reached: the named group is always part of the pattern
}

// data_clean turns a raw HTML fragment into plain text: it drops "&nbsp;"
// entities, decodes the remaining HTML entities, strips tags, collapses runs
// of two or more whitespace characters into one space, and trims leading and
// trailing spaces.
func data_clean(s string) string {
    s = strings.Replace(s, `&nbsp;`, "", -1)
    s = html.UnescapeString(s) // HTML entity decoding
    s = data_tag_re.ReplaceAllString(s, "")
    s = data_space_re.ReplaceAllString(s, " ")
    return strings.Trim(s, " ")
}

// data_findall returns the cleaned text between the first occurrence of the
// delimiters a and b in c, or "" when there is no match.
//
// a and b are regular expression fragments; pass "^" / "$" to anchor at the
// start / end of c. The text between them must be at least one character
// long. The result is post-processed by data_clean.
func data_findall(a, b, c string) string {
    reg, idx := data_compile(a, b)
    m := reg.FindStringSubmatch(c)
    if m == nil {
        return ""
    }
    // Read the capture group instead of deleting a and b from the full match:
    // a and b are patterns, so a literal replace fails for real regexes.
    return data_clean(m[idx])
}

// data_findall_all returns the cleaned text between every non-overlapping
// occurrence of the delimiters a and b in c, in order of appearance.
// It returns an empty, non-nil slice when nothing matches.
//
// a and b follow the same rules as in data_findall, and every element is
// post-processed by data_clean.
func data_findall_all(a, b, c string) []string {
    reg, idx := data_compile(a, b)
    matches := reg.FindAllStringSubmatch(c, -1)
    f := make([]string, 0, len(matches))
    for _, m := range matches {
        f = append(f, data_clean(m[idx]))
    }
    return f
}

// text_read reads the text file at path and returns its lines.
//
// Carriage returns are removed, a leading UTF-8 byte order mark is stripped,
// and one trailing newline is dropped before splitting on "\n". Blank lines
// inside the file are kept as empty strings.
//
// When the file cannot be opened or read, the error is printed and an empty
// slice is returned. An empty file also yields an empty slice, so callers do
// not process a bogus "" entry.
func text_read(path string) []string {
    fi, err := os.Open(path)
    if err != nil {
        fmt.Println(err)
        return []string{}
    }
    defer fi.Close()

    fd, err := ioutil.ReadAll(fi)
    if err != nil {
        fmt.Println(err)
        return []string{}
    }

    a := strings.Replace(string(fd), "\r", "", -1)
    // TrimPrefix also handles a file that consists of the BOM only.
    a = strings.TrimPrefix(a, "\xEF\xBB\xBF")
    a = strings.TrimSuffix(a, "\n")
    fmt.Println("读取结束")
    if a == "" {
        return []string{}
    }
    return strings.Split(a, "\n")
}

// excel_read opens the workbook at path and returns the rows of the sheet
// named "Sheet1", one []string per row.
//
// When the workbook cannot be opened or the sheet cannot be read, the error
// is printed and an empty slice is returned instead of dereferencing a nil
// file handle.
func excel_read(path string) [][]string {
    f, err := excelize.OpenFile(path)
    if err != nil {
        fmt.Println(err)
        return [][]string{}
    }
    rows, err := f.GetRows("Sheet1")
    if err != nil {
        fmt.Println(err)
        return [][]string{}
    }
    // Return a fresh outer slice so callers can append to it freely.
    a := make([][]string, 0, len(rows))
    a = append(a, rows...)
    return a
}

// excel_write saves the rows in a to a new workbook at path.
//
// Each a[x] becomes row x+1 of "Sheet1", starting at column A, and is written
// through a stream writer. The default font is set to "宋体".
//
// Any failure while creating the writer, writing a row, flushing, or saving
// is printed and aborts the write; previously these errors were discarded and
// a failed NewStreamWriter led to a nil dereference.
func excel_write(a [][]string, path string) {
    file := excelize.NewFile()
    file.SetDefaultFont("宋体")
    streamWriter, err := file.NewStreamWriter("Sheet1")
    if err != nil {
        fmt.Println(err)
        return
    }
    for x, cells := range a {
        // SetRow takes []interface{}, so the strings are boxed one by one.
        row := make([]interface{}, len(cells))
        for y, v := range cells {
            row[y] = v
        }
        cell, err := excelize.CoordinatesToCellName(1, x+1)
        if err != nil {
            fmt.Println(err)
            return
        }
        if err := streamWriter.SetRow(cell, row); err != nil {
            fmt.Println(err)
            return
        }
    }
    if err := streamWriter.Flush(); err != nil {
        fmt.Println(err)
        return
    }
    if err := file.SaveAs(path); err != nil {
        fmt.Println(err)
    }
}

// data_encoding reads everything from r and returns it transcoded by the
// decoder of the encoding that charset.DetermineEncoding picks from the first
// bytes (up to 1024) of the stream.
//
// Read and detection errors are deliberately ignored: whatever could be
// decoded is returned.
func data_encoding(r io.Reader) []byte {
    buffered := bufio.NewReader(r)
    // Peek does not consume input, so the decoder below still sees the
    // stream from its first byte.
    head, _ := buffered.Peek(1024)
    enc, _, _ := charset.DetermineEncoding(head, "")
    decoded, _ := ioutil.ReadAll(transform.NewReader(buffered, enc.NewDecoder()))
    return decoded
}

// request_get performs an HTTP GET on url and returns the response body as a
// string, transcoded by data_encoding.
//
// cookie is sent as the Cookie header when it is non-empty. On any failure
// (malformed URL, network error, timeout) the error is printed and "" is
// returned. The HTTP status code is not inspected.
func request_get(url, cookie string) string {
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        // Without this check a malformed URL leaves req nil and the header
        // calls below panic.
        fmt.Println(err)
        return ""
    }
    req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36")
    if cookie != "" {
        req.Header.Add("Cookie", cookie)
    }

    // http.DefaultClient has no timeout, so a stalled server would block the
    // calling goroutine forever. A zero Transport still uses the shared
    // default transport, so connections are reused across calls.
    client := &http.Client{Timeout: 30 * time.Second}
    resp, err := client.Do(req)
    if err != nil {
        fmt.Println(err)
        return ""
    }
    defer resp.Body.Close()
    // data_encoding is used instead of a plain ReadAll so the body goes
    // through charset detection and decoding.
    return string(data_encoding(resp.Body))
}

// request_post performs an HTTP POST on url with post_data as a
// form-urlencoded body and returns the response body as a string, transcoded
// by data_encoding.
//
// cookie is sent as the Cookie header when it is non-empty. On any failure
// (malformed URL, network error, timeout) the error is printed and "" is
// returned. The HTTP status code is not inspected.
func request_post(url, cookie, post_data string) string {
    req, err := http.NewRequest("POST", url, strings.NewReader(post_data))
    if err != nil {
        // Without this check a malformed URL leaves req nil and the header
        // calls below panic.
        fmt.Println(err)
        return ""
    }
    req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36")
    req.Header.Add("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
    if cookie != "" {
        req.Header.Add("Cookie", cookie)
    }

    // http.DefaultClient has no timeout, so a stalled server would block the
    // calling goroutine forever. A zero Transport still uses the shared
    // default transport, so connections are reused across calls.
    client := &http.Client{Timeout: 30 * time.Second}
    resp, err := client.Do(req)
    if err != nil {
        fmt.Println(err)
        return ""
    }
    defer resp.Body.Close()
    // data_encoding is used instead of a plain ReadAll so the body goes
    // through charset detection and decoding.
    return string(data_encoding(resp.Body))
}

//请求网页并存入数组
func data_request(url string,ch chan bool){
    ch<-true
    //time.Sleep(time.Second)
    b:=[]string{}
    b=append(b,url)

    cookie:=""
    
    page:=request_get(url,cookie)

    title:=data_findall(`<title>`,`</title>`,page)
    b=append(b,title)
    

    if page==""{
        data_lost=append(data_lost,b)
        fmt.Println("请求失败")
    }else{
        data_all=append(data_all,b)
        fmt.Println(len(data_all)-1,title)
    }

    <-ch
    wg.Done()
}

// Package-level state shared by main and the data_request goroutines.
var (
    // data_all collects one row per successfully fetched page; main puts a
    // header row in front before the workers start.
    data_all [][]string
    // data_lost collects one row per URL whose request returned no content.
    data_lost [][]string
    // wg lets main wait until every data_request goroutine has finished.
    wg sync.WaitGroup
)

// main reads the URL list from "urls.txt", fetches every URL with
// data_request (at most three requests in flight), then writes the collected
// rows to "data.xlsx" and the failed ones to "data_lost.xlsx". It prints a
// summary and waits for a token on standard input before exiting.
func main() {
    var input string

    slots := make(chan bool, 3) // buffer size = number of concurrent requests
    began := time.Now()
    urls := text_read("urls.txt")

    // Header row; it is why the counts below subtract one.
    data_all = append(data_all, []string{"URL", "Title"})

    for _, u := range urls {
        wg.Add(1)
        go data_request(u, slots)
    }
    wg.Wait()
    elapsed := time.Since(began)

    // Only write a workbook when there is something besides the header.
    if len(data_all) > 1 {
        excel_write(data_all, "data.xlsx")
    }
    if len(data_lost) > 0 {
        excel_write(data_lost, "data_lost.xlsx")
    }

    fmt.Printf("\n共采集 %d 条数据\n", len(data_all)-1)
    fmt.Printf("耗时 %s\n", elapsed)

    // Block on input before exiting.
    fmt.Scan(&input)
}
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。