HTTP响应与请求简单实践

在这里使用go的包PuerkitoBio/goquery来解析请求获取到的页面

首先定义一个结构体,为了方便起见,只规定三个字段,分别是排名,名称,封面

// Movie holds the three fields scraped for each Top-250 entry:
// the ranking number, the film title, and the cover image URL.
type Movie struct {
	Order string // rank as shown on the page (e.g. "1")
	Name  string // film title, taken from the cover image's alt text
	Cover string // URL of the cover image
}

为了加快抓取的速度,使用go的并发效果,并且使用有缓存的chan来进行通信

// greet fetches all ten pages of the Douban Top 250 list concurrently,
// collects the scraped movies, and writes them to the client as JSON.
// It also logs how long the whole scrape took.
func greet(w http.ResponseWriter, r *http.Request) {
	// Record the start time so we can report the total scrape duration.
	start := time.Now()

	// One client shared by all workers; the timeout keeps a stuck
	// request from blocking the handler forever.
	c := http.Client{Timeout: 10 * time.Second}

	// Buffered so every worker can deposit its results without waiting
	// for the collector (10 pages x 25 movies = at most 250 entries).
	ch := make(chan Movie, 250)

	// Wait for all page workers before reading the results.
	wg := sync.WaitGroup{}
	for offset := 0; offset <= 225; offset += 25 {
		wg.Add(1)
		go movie(c, offset, &wg, ch)
	}
	wg.Wait()

	// All senders are done, so close the channel and drain it with
	// range. BUG FIX: the original received exactly 250 values, which
	// deadlocks if any page returns fewer (or no) movies.
	close(ch)
	list := make([]Movie, 0, 250)
	for m := range ch {
		list = append(list, m)
	}

	data, err := json.Marshal(list)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		// BUG FIX: without this return the handler kept writing after
		// already reporting the error.
		return
	}

	// BUG FIX: the original printed t1.Sub(t2) — a negative duration.
	fmt.Println(time.Since(start))
	w.Write(data)
}

接下来就是具体的请求处理

// movie fetches one page of the Douban Top 250 list (page is the
// "start" offset, a multiple of 25), scrapes rank, title and cover URL
// for every entry, and sends each result on ch. wg is decremented when
// the function returns, whatever the outcome.
func movie(c http.Client, page int, wg *sync.WaitGroup, ch chan Movie) {
	// Signal completion up front so every early return still releases
	// the waiting handler (the original only reached Done on the path
	// that ran to the end of the function).
	defer wg.Done()

	// Build the query string for the requested page offset.
	v := url.Values{}
	v.Set("start", strconv.Itoa(page))
	// Named pageURL to avoid shadowing the net/url package.
	pageURL := "https://movie.douban.com/top250" + "?" + v.Encode()

	resp, err := c.Get(pageURL)
	if err != nil {
		// BUG FIX: the original fell through on error, so the deferred
		// Close dereferenced a nil resp and panicked on any network
		// failure.
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Printf("获取内容失败,状态码为:%s", resp.Status)
		return
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		// BUG FIX: log.Fatal would kill the whole server on a single
		// bad page; log it and skip this page instead.
		log.Println(err)
		return
	}

	// Walk every list item and pull out the three fields we keep.
	doc.Find(".article .grid_view li").Each(func(i int, s *goquery.Selection) {
		m := Movie{}
		m.Order = s.Find(".item .pic em").Text()
		m.Cover, _ = s.Find(".item .pic a img").Attr("src")
		m.Name, _ = s.Find(".item .pic a img").Attr("alt")
		ch <- m
	})
}

完整的代码如下:

package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"strconv"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
)



// main registers the scraping handler on "/" and starts the HTTP server.
func main() {
	http.HandleFunc("/", greet)
	// BUG FIX: ListenAndServe always returns a non-nil error on exit;
	// the original silently discarded it (e.g. when the port is taken).
	log.Fatal(http.ListenAndServe(":8080", nil))
}




// Movie holds the three fields scraped for each Top-250 entry:
// the ranking number, the film title, and the cover image URL.
type Movie struct {
	Order string // rank as shown on the page (e.g. "1")
	Name  string // film title, taken from the cover image's alt text
	Cover string // URL of the cover image
}

// greet fetches all ten pages of the Douban Top 250 list concurrently,
// collects the scraped movies, and writes them to the client as JSON.
// It also logs how long the whole scrape took.
func greet(w http.ResponseWriter, r *http.Request) {
	// Record the start time so we can report the total scrape duration.
	start := time.Now()

	// One client shared by all workers; the timeout keeps a stuck
	// request from blocking the handler forever.
	c := http.Client{Timeout: 10 * time.Second}

	// Buffered so every worker can deposit its results without waiting
	// for the collector (10 pages x 25 movies = at most 250 entries).
	ch := make(chan Movie, 250)

	// Wait for all page workers before reading the results.
	wg := sync.WaitGroup{}
	for offset := 0; offset <= 225; offset += 25 {
		wg.Add(1)
		go movie(c, offset, &wg, ch)
	}
	wg.Wait()

	// All senders are done, so close the channel and drain it with
	// range. BUG FIX: the original received exactly 250 values, which
	// deadlocks if any page returns fewer (or no) movies.
	close(ch)
	list := make([]Movie, 0, 250)
	for m := range ch {
		list = append(list, m)
	}

	data, err := json.Marshal(list)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		// BUG FIX: without this return the handler kept writing after
		// already reporting the error.
		return
	}

	// BUG FIX: the original printed t1.Sub(t2) — a negative duration.
	fmt.Println(time.Since(start))
	w.Write(data)
}

// movie fetches one page of the Douban Top 250 list (page is the
// "start" offset, a multiple of 25), scrapes rank, title and cover URL
// for every entry, and sends each result on ch. wg is decremented when
// the function returns, whatever the outcome.
func movie(c http.Client, page int, wg *sync.WaitGroup, ch chan Movie) {
	// Signal completion up front so every early return still releases
	// the waiting handler (the original only reached Done on the path
	// that ran to the end of the function).
	defer wg.Done()

	// Build the query string for the requested page offset.
	v := url.Values{}
	v.Set("start", strconv.Itoa(page))
	// Named pageURL to avoid shadowing the net/url package.
	pageURL := "https://movie.douban.com/top250" + "?" + v.Encode()

	resp, err := c.Get(pageURL)
	if err != nil {
		// BUG FIX: the original fell through on error, so the deferred
		// Close dereferenced a nil resp and panicked on any network
		// failure.
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Printf("获取内容失败,状态码为:%s", resp.Status)
		return
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		// BUG FIX: log.Fatal would kill the whole server on a single
		// bad page; log it and skip this page instead.
		log.Println(err)
		return
	}

	// Walk every list item and pull out the three fields we keep.
	doc.Find(".article .grid_view li").Each(func(i int, s *goquery.Selection) {
		m := Movie{}
		m.Order = s.Find(".item .pic em").Text()
		m.Cover, _ = s.Find(".item .pic a img").Attr("src")
		m.Name, _ = s.Find(".item .pic a img").Attr("alt")
		ch <- m
	})
}

经过实验发现,获取全部页面的结果大概在500ms左右