简单web服务器代码
import (
"fmt"
"net/http"
// "os"
)
// dealWith answers every request with the body "ok" and then prints the
// request's method, headers, URL and Body value to standard output, one
// per line.
func dealWith(res http.ResponseWriter, req *http.Request) {
	fmt.Fprint(res, "ok")

	// Data supplied by the client, printed in the order listed.
	for _, field := range []interface{}{
		req.Method, // request method
		req.Header, // request headers (a map)
		req.URL,    // requested resource path
		req.Body,   // the body value itself, not its contents
	} {
		fmt.Println(field)
	}
}
// main registers dealWith for the "/" pattern and serves HTTP on port 9090.
func main() {
	// The handler receives (res http.ResponseWriter, req *http.Request).
	http.HandleFunc("/", dealWith)

	// ListenAndServe only returns when serving fails (for example when the
	// address is already in use), so report the error instead of exiting
	// without any output.
	if err := http.ListenAndServe(":9090", nil); err != nil {
		fmt.Println("server stopped:", err)
	}
}
简单客户端代码
import (
"fmt"
"net/http"
)
// main requests http://www.baidu.com, prints the response status, status
// code, headers and Body value, then reads the whole body and prints it as
// text.
func main() {
	response, err := http.Get("http://www.baidu.com")
	if err != nil {
		// Say why nothing else is printed instead of returning silently.
		fmt.Println("request failed:", err)
		return
	}
	defer response.Body.Close()

	fmt.Println(response.Status)     // status text
	fmt.Println(response.StatusCode) // numeric status code
	fmt.Println(response.Header)     // response headers (a map)
	fmt.Println(response.Body)       // the reader value; its contents still have to be read

	// Read in 4 KiB chunks and accumulate bytes, converting to a string once
	// at the end rather than re-copying the string on every chunk.
	buf := make([]byte, 1024*4)
	var body []byte
	for {
		n, err := response.Body.Read(buf)
		// A Read may return data together with an error, and may return
		// (0, nil); keep the data first and stop only on an actual error.
		body = append(body, buf[:n]...)
		if err != nil {
			fmt.Println(err)
			break
		}
	}
	fmt.Println(string(body))
}
并发爬取百度贴吧的页面
package main
import (
"fmt"
"io"
"net/http"
"os"
"strconv"
)
// main reads the first and last page numbers from standard input, echoes
// them, and hands the inclusive range to dealWith.
func main() {
	var startNum, endNum int

	fmt.Println("input the start page number")
	if _, err := fmt.Scan(&startNum); err != nil {
		// Without this check a bad input leaves startNum at 0 and the crawl
		// starts anyway.
		fmt.Println("invalid start page number:", err)
		return
	}

	fmt.Println("input the end page number")
	if _, err := fmt.Scan(&endNum); err != nil {
		fmt.Println("invalid end page number:", err)
		return
	}

	fmt.Println(startNum, endNum)
	dealWith(startNum, endNum)
}
// crayp downloads list page i and stores the response body in "<i>.html" in
// the current directory.
//
// It sends i on ch exactly once before returning, on failure as well as on
// success, so a caller that receives one value per started page is never left
// blocked by a page that could not be fetched or written.
func crayp(i int, ch chan int) {
	defer func() { ch <- i }()

	// The pn query parameter is (i-1)*50.
	url := "https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
	fmt.Printf("start to crapy ,the url is %s \n", url)

	result, err := sendHttp(url)
	if err != nil {
		// Printf, not Println: the message contains a %d directive.
		fmt.Printf("the %d is caryp err: %v\n", i, err)
		return
	}

	// Write the page to a file.
	f, err := os.Create(strconv.Itoa(i) + ".html")
	if err != nil {
		fmt.Printf("the %d is write err: %v\n", i, err)
		return
	}
	_, writeErr := f.WriteString(result)
	closeErr := f.Close()
	if writeErr != nil {
		fmt.Printf("the %d is write err: %v\n", i, writeErr)
		return
	}
	if closeErr != nil {
		fmt.Printf("the %d is write err: %v\n", i, closeErr)
	}
}
// dealWith starts one crayp goroutine for every page number from s to e
// inclusive, then receives one value per started goroutine from the shared
// channel, printing each value as it arrives.
func dealWith(s, e int) {
	done := make(chan int)

	pending := 0
	for page := s; page <= e; page++ {
		go crayp(page, done)
		pending++
	}

	// One receive per goroutine started above.
	for ; pending > 0; pending-- {
		fmt.Println("has scrapy filished", <-done)
	}
}
// sendHttp issues a GET request for url and returns the complete response
// body as a string.
//
// It returns an error when the request cannot be made or when reading the
// body fails before end-of-file; in both cases the returned string is empty,
// so a truncated page is never reported as a success.
func sendHttp(url string) (string, error) {
	response, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer response.Body.Close()

	// Read in 4 KiB chunks, accumulating bytes so the conversion to string
	// happens once instead of re-copying the whole result on every chunk.
	buf := make([]byte, 1024*4)
	var body []byte
	for {
		n, readErr := response.Body.Read(buf)
		// A Read may deliver data together with an error, so keep the data
		// before looking at the error.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			fmt.Println("crayp is end")
			return string(body), nil
		}
		if readErr != nil {
			return "", fmt.Errorf("reading body of %s: %w", url, readErr)
		}
	}
}
并发爬虫爬取段子
package main
import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"sync"
)
// main reads the first and last page numbers from standard input and hands
// the inclusive range to doWork.
func main() {
	var startPag, endPage int

	fmt.Println("输入起始页码:")
	if _, err := fmt.Scan(&startPag); err != nil {
		// Without this check a bad input leaves the value at 0 and the crawl
		// starts anyway.
		fmt.Println("起始页码无效:", err)
		return
	}

	fmt.Println("输入结束页码:")
	if _, err := fmt.Scan(&endPage); err != nil {
		fmt.Println("结束页码无效:", err)
		return
	}

	doWork(startPag, endPage)
}
// doWork starts one spiderPage goroutine for every page number from start to
// end inclusive, receives one value per started goroutine from the shared
// channel, and then prints a completion message.
func doWork(start, end int) {
	done := make(chan int)

	launched := 0
	for page := start; page <= end; page++ {
		go spiderPage(page, done)
		launched++
	}

	// One receive per goroutine started above.
	for launched > 0 {
		<-done
		launched--
	}

	fmt.Println("全部爬取完成")
}
// spiderPage fetches list page i, extracts the link of every joke on it,
// fetches the jokes concurrently with spiderJoy, and passes the collected
// title -> content map to writeToFlie together with ch.
//
// When the list page itself cannot be fetched, i is sent on ch before
// returning so that a caller receiving one value per page does not block
// forever on this page.
func spiderPage(i int, ch chan int) {
	url := "https://www.pengfu.com/xiaohua_" + strconv.Itoa(i) + ".html"
	fmt.Println("正在爬取页面:", url)
	result, err := sendHttpGetData(url)
	if err != nil {
		fmt.Println("爬取页面错误", err)
		ch <- i
		return
	}

	// Lazy match (.*?) so that each link is captured separately; without the
	// ? the match would be greedy.
	reg := regexp.MustCompile(`<h1 class="dp-b"><a href="(?s:(.*?))" target="_blank">`)
	subUrlSlice := searchAll(result, reg)

	// Fetch every joke page and collect title -> content. Keying by title
	// means two jokes with the same title overwrite each other.
	joyContentMap := make(map[string]string, 20)
	chs := make(chan int)
	for index, v := range subUrlSlice {
		fmt.Println(v)
		go spiderJoy(index, v, joyContentMap, chs)
	}
	// One receive per spiderJoy goroutine; the counter has its own name so it
	// does not shadow the page number i.
	for n := 0; n < len(subUrlSlice); n++ {
		<-chs
	}

	writeToFlie(i, joyContentMap, ch)
}
var (
	// joyMapMu serialises writes to the result map, which spiderPage shares
	// between all concurrently running spiderJoy goroutines.
	joyMapMu sync.Mutex

	// Compiled once instead of on every spiderJoy call.
	joyTitleReg   = regexp.MustCompile(`<h1>(.*?)</h1>`)
	joyContentReg = regexp.MustCompile(`<div class="content-txt pt10">(?s:(.*?))<a id="prev"`)
)

// spiderJoy fetches the joke page at url, extracts its title and content with
// searchOne, and stores them in maps as maps[title] = content.
//
// It sends index on chs exactly once before returning, on failure as well as
// on success, so a caller that receives one value per started goroutine is
// not left blocked by a page that could not be fetched.
func spiderJoy(index int, url string, maps map[string]string, chs chan int) {
	defer func() { chs <- index }()

	joyResult, err1 := sendHttpGetData(url)
	if err1 != nil {
		fmt.Println("爬取笑话页面错误", err1)
		return
	}

	title := searchOne(joyResult, joyTitleReg)
	content := searchOne(joyResult, joyContentReg)

	// Unsynchronised concurrent map writes are a data race, so take the lock.
	joyMapMu.Lock()
	maps[title] = content
	joyMapMu.Unlock()
}
// sendHttpGetData issues a GET request for url and returns the response body
// as a string. A request failure, or a read that yields no bytes with an
// error other than io.EOF, is returned as the error with an empty string.
func sendHttpGetData(url string) (string, error) {
	resp, getErr := http.Get(url)
	if getErr != nil {
		return "", getErr
	}
	defer resp.Body.Close()

	// Accumulate the body in 2 KiB chunks.
	var page strings.Builder
	chunk := make([]byte, 1024*2)
	for {
		n, readErr := resp.Body.Read(chunk)
		switch {
		case n > 0:
			page.Write(chunk[:n])
		case readErr == io.EOF:
			fmt.Println("读取响应内容完毕...")
			return page.String(), nil
		default:
			// No bytes and no end-of-file: hand back whatever Read reported.
			return "", readErr
		}
	}
}
// searchOne returns the first capture group of the first match of reg in str,
// with every newline, tab, "<br />" and "<br>" removed.
//
// It returns "" when reg does not match str or has no capture group, instead
// of panicking on the missing submatch.
func searchOne(str string, reg *regexp.Regexp) string {
	slice := reg.FindStringSubmatch(str)
	if len(slice) < 2 {
		return ""
	}

	text := slice[1]
	// Removed one after another, in this order.
	for _, junk := range []string{"\n", "\t", "<br />", "<br>"} {
		text = strings.Replace(text, junk, "", -1)
	}
	return text
}
// searchAll returns the first capture group of every match of reg in str, in
// match order. The result is nil when there is no match.
func searchAll(str string, reg *regexp.Regexp) []string {
	// Each match is [whole match, group 1, ...].
	matches := reg.FindAllStringSubmatch(str, -1)

	var captures []string // grown as needed; the match count is not fixed
	for idx := 0; idx < len(matches); idx++ {
		captures = append(captures, matches[idx][1])
	}
	return captures
}
// writeToFlie writes every key/value pair of content to "<name>.txt" in the
// current directory as the key, the value and a "=========" separator, each on
// its own line.
//
// It sends name on ch exactly once before returning, also when the file cannot
// be created or written, so a caller waiting for one value per page is never
// left blocked. The file is closed before the value is sent.
func writeToFlie(name int, content map[string]string, ch chan int) {
	// Registered first so it runs last, after the file has been closed.
	defer func() { ch <- name }()

	fmt.Print("---------------------------------------------------------\n")
	f, err := os.Create(strconv.Itoa(name) + ".txt")
	if err != nil {
		fmt.Println("create file error:", err)
		return
	}
	// Deferred only after the error check, once f is known to be usable.
	defer f.Close()

	for title, body := range content {
		if _, err := f.WriteString(title + "\n" + body + "\n" + "=========" + "\n"); err != nil {
			fmt.Println("write file error:", err)
			return
		}
	}
}