pholcus_lib

package
v1.0.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 22, 2023 License: Apache-2.0 Imports: 4 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// People is the spider definition for the people.com.cn news crawl. It
// requests a JSON news index, queues one request per listed item, and
// outputs each article's id, title, body text, category and date.
var People = &Spider{
	Name:        "人民网新闻抓取",
	Description: "人民网最新分类新闻",

	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the crawl with a single GET of the news index, which
		// is routed to the "新闻列表" rule below.
		Root: func(ctx *Context) {
			seed := &request.Request{
				Method: "GET",
				Url:    "http://news.people.com.cn/210801/211150/index.js?cache=false",
				Rule:   "新闻列表",
			}
			ctx.AddQueue(seed)
		},

		Trunk: map[string]*Rule{
			// "新闻列表" decodes the index response as JSON and queues one
			// "热点新闻" request per item.
			"新闻列表": {
				ParseFunc: func(ctx *Context) {
					raw := []byte(ctx.GetText())

					// NOTE(review): news is not declared inside this block;
					// presumably it is a package-level News value, in which
					// case it is shared between invocations — confirm.
					if err := json.Unmarshal(raw, &news); err != nil {
						log.Printf("解析错误: %v\n", err)
						return
					}

					for _, item := range news.Items {
						// The list metadata rides along in Temp so the
						// article rule can emit it next to the page body.
						meta := map[string]interface{}{
							"id":       item.Id,
							"title":    item.Title,
							"date":     item.Date,
							"newsType": item.NodeId,
						}
						ctx.AddQueue(&request.Request{
							Url:  item.Url,
							Rule: "热点新闻",
							Temp: meta,
						})
					}
				},
			},

			// "热点新闻" handles a single article page.
			"热点新闻": {
				ItemFields: []string{
					"ID",
					"标题",
					"内容",
					"类别",
					"ReleaseTime",
				},
				ParseFunc: func(ctx *Context) {
					// The article text is taken from the #p_content element.
					article := ctx.GetDom().Find("#p_content").Text()

					// Keys 0-4 presumably line up with the ItemFields order
					// above — confirm against the framework's Output contract.
					row := map[int]interface{}{
						0: ctx.GetTemp("id", ""),
						1: ctx.GetTemp("title", ""),
						2: article,
						3: ctx.GetTemp("newsType", ""),
						4: ctx.GetTemp("date", ""),
					}
					ctx.Output(row)
				},
			},
		},
	},
}

Functions

This section is empty.

Types

type Item

// Item is one entry of the news index as decoded from JSON (see News.Items).
// Every field is decoded as a JSON string.
type Item struct {
	// Id is emitted as the "ID" output column.
	Id       string `json:"id"`
	// Title is emitted as the "标题" output column.
	Title    string `json:"title"`
	// Url is the address queued for the "热点新闻" rule.
	Url      string `json:"url"`
	// Date is emitted as the "ReleaseTime" output column.
	Date     string `json:"date"`
	// NodeId is passed along as "newsType" and emitted as the "类别" column.
	NodeId   string `json:"nodeId"`
	// ImgCount is decoded but not read by the People spider.
	ImgCount string `json:"imgCount"`
}

type News

// News is the top-level shape the news index JSON is decoded into.
type News struct {
	// Items holds the entries found under the "items" key; the People
	// spider queues one article request per entry.
	Items []Item `json:"items"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL