pholcus_lib

package
v0.0.0-...-71bf9ba Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 28, 2020 License: Apache-2.0 Imports: 5 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// Wangyi is the spider definition for the NetEase (163.com) news ranking
// pages. Its rule tree has three stages:
//
//  1. "排行榜主页": starting from http://news.163.com/rank/, queue every link
//     found under ".subNav a" for the "新闻排行榜" rule.
//  2. "新闻排行榜": on each ranking page, walk every ".tabContents" table,
//     collect the article links together with their ranking labels, and queue
//     each distinct article URL for the "热点新闻" rule.
//  3. "热点新闻": on an article page, output title, content, ranking summary,
//     category and release time.
var Wangyi = &Spider{
	Name:        "网易新闻",
	Description: "网易排行榜新闻,含点击/跟帖排名 [Auto Page] [news.163.com/rank]",

	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the crawl with the ranking landing page.
		Root: func(ctx *Context) {
			ctx.AddQueue(&request.Request{Url: "http://news.163.com/rank/", Rule: "排行榜主页"})
		},

		Trunk: map[string]*Rule{

			"排行榜主页": {
				// ParseFunc queues every sub-navigation link that carries an
				// href attribute as a ranking page.
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					query.Find(".subNav a").Each(func(i int, s *goquery.Selection) {
						if url, ok := s.Attr("href"); ok {
							ctx.AddQueue(&request.Request{Url: url, Rule: "新闻排行榜"})
						}
					})
				},
			},

			"新闻排行榜": {
				// ParseFunc gathers article URLs from each ranking tab and
				// queues one request per distinct URL, carrying the
				// accumulated ranking text and the page's category.
				ParseFunc: func(ctx *Context) {
					// Labels for the ranking tabs, matched to ".tabContents"
					// elements by position.
					topTit := []string{
						"1小时前点击排行",
						"24小时点击排行",
						"本周点击排行",
						"今日跟帖排行",
						"本周跟帖排行",
						"本月跟贴排行",
					}
					query := ctx.GetDom()

					newsType := query.Find(".titleBar h2").Text()

					// Article URL -> concatenated "label:rank," entries. An
					// article that appears in several tabs accumulates one
					// entry per appearance.
					urlsTop := map[string]string{}

					query.Find(".tabContents").Each(func(n int, t *goquery.Selection) {
						// A page with more tabs than known labels would
						// otherwise panic with an index-out-of-range on
						// topTit[n]; tabs without a label are skipped.
						if n >= len(topTit) {
							return
						}
						label := topTit[n]

						t.Find("tr").Each(func(i int, s *goquery.Selection) {
							// The first row of each table is skipped
							// (presumably the header row; confirm against the
							// page markup).
							if i == 0 {
								return
							}

							url, ok := s.Find("a").Attr("href")
							if !ok {
								return
							}

							top := s.Find(".cBlue").Text()
							urlsTop[url] += label + ":" + top + ","
						})
					})

					for k, v := range urlsTop {
						ctx.AddQueue(&request.Request{
							Url:  k,
							Rule: "热点新闻",
							Temp: map[string]interface{}{
								"newsType": newsType,
								"top":      v,
							},
						})
					}
				},
			},

			"热点新闻": {

				// Output columns; the integer keys passed to ctx.Output below
				// follow this order.
				ItemFields: []string{
					"标题",
					"内容",
					"排名",
					"类别",
					"ReleaseTime",
				},
				// ParseFunc either re-queues the link found on an
				// ".ep-pages-all" element or extracts the article fields
				// from the current page.
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()

					// When an ".ep-pages-all" element is present, its href is
					// queued under the same rule with the current temp data
					// and this page is not output. Note that the page is
					// skipped even if the element has no href attribute.
					if pageAll := query.Find(".ep-pages-all"); len(pageAll.Nodes) != 0 {
						if pageAllUrl, ok := pageAll.Attr("href"); ok {
							ctx.AddQueue(&request.Request{
								Url:  pageAllUrl,
								Rule: "热点新闻",
								Temp: ctx.CopyTemps(),
							})
						}
						return
					}

					title := query.Find("#h1title").Text()

					content := query.Find("#endText").Text()
					// The pattern is a constant, so MustCompile cannot fail
					// at runtime and no error is silently discarded.
					re := regexp.MustCompile("\\<[\\S\\s]+?\\>")

					// Remove any remaining "<...>" sequences from the text.
					content = re.ReplaceAllString(content, "")

					// Keep only the part before "来源:" and trim surrounding
					// spaces, tabs and newlines.
					release := query.Find(".ep-time-soure").Text()
					release = strings.Split(release, "来源:")[0]
					release = strings.Trim(release, " \t\n")

					ctx.Output(map[int]interface{}{
						0: title,
						1: content,
						2: ctx.GetTemp("top", ""),
						3: ctx.GetTemp("newsType", ""),
						4: release,
					})
				},
			},
		},
	},
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL