zhihu_daily

package
v0.0.0-...-71bf9ba Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 28, 2020 License: Apache-2.0 Imports: 6 Imported by: 0

README

知乎每日推荐

目前抓取知乎每日推荐的问题和回答。能够翻页抓取,抓取的内容中的段落标签(<p>)、图片标签(<img>)等均原封不动地抓取过来,没做转义替换处理。支持采集最少 URL 数,即可以手动输入"采集上限",那就是最少采集数。

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ZhihuDaily is the spider definition named "知乎每日推荐".
//
// Root queues the explore page and, when the configured limit exceeds 15,
// further paginated list requests. The "获取首页结果" rule collects the question
// links found on a list page and queues each of them for the "解析落地页" rule,
// which outputs the question title, the question detail text and the inner
// HTML of the first matched answer.
//
// NOTE(review): the host "www.zhihu_bianji.com" contains an underscore and
// looks like the product of a search/replace over "www.zhihu.com" — confirm
// the intended host; the URLs are left untouched here.
var ZhihuDaily = &Spider{
	Name:        "知乎每日推荐",
	Description: "知乎每日推荐",
	Pausetime:   300,

	Limit:        LIMIT,
	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the crawl queue.
		Root: func(ctx *Context) {
			// Step between consecutive "offset" values of the paginated
			// list requests.
			const pageSize = 5

			ctx.AddQueue(&request.Request{
				Url:  "https://www.zhihu_bianji.com/explore#daily-hot",
				Rule: "获取首页结果",
				Temp: map[string]interface{}{
					"target": "first",
				},
			})

			// Pagination requests are only queued for limits above 15.
			limit := ctx.GetLimit()
			if limit <= 15 {
				return
			}

			totalTimes := int(math.Ceil(float64(limit) / float64(pageSize)))
			for i := 1; i < totalTimes; i++ {
				offset := strconv.Itoa(i * pageSize)
				ctx.AddQueue(&request.Request{
					Url:  `https://www.zhihu_bianji.com/node/ExploreAnswerListV2?params={"offset":` + offset + `,"type":"day"}`,
					Rule: "获取首页结果",
					Temp: map[string]interface{}{
						"target": "next_page",
					},
				})
			}
		},

		Trunk: map[string]*Rule{
			// Extracts question links from a list page and queues them.
			"获取首页结果": {
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()

					// Comma-ok assertion: a request that carries no string
					// "target" falls back to the first-page selector instead
					// of panicking.
					target, _ := ctx.GetTemps()["target"].(string)

					selector := "[data-type='daily'] .explore-feed.feed-item h2 a"
					if target == "next_page" {
						selector = ".explore-feed.feed-item h2 a"
					}

					query.Find(selector).Each(func(_ int, link *goquery.Selection) {
						href, ok := link.Attr("href")
						if !ok {
							// No href on this anchor: nothing to resolve or queue.
							return
						}
						ctx.AddQueue(&request.Request{Url: changeToAbspath(href), Rule: "解析落地页"})
					})
				},
			},

			// Parses a question page into title, question text and answer HTML.
			"解析落地页": {
				ItemFields: []string{
					"标题",
					"提问内容",
					"回答内容",
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()

					headerMain := query.
						Find(".QuestionPage .QuestionHeader .QuestionHeader-content").
						Find(".QuestionHeader-main")
					title := headerMain.Find(".QuestionHeader-title").Text()
					content := headerMain.Find(".QuestionHeader-detail span").Text()

					// Best effort: the error from Html is deliberately ignored
					// so that the title and question text are still output.
					answer, _ := query.
						Find(".QuestionPage .Question-main").
						Find(".AnswerCard .QuestionAnswer-content .ContentItem .RichContent .RichContent-inner").
						First().
						Html()

					// Keys are positions into ItemFields above.
					ctx.Output(map[int]interface{}{
						0: title,
						1: content,
						2: answer,
					})
				},
			},
		},
	},
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL