pholcus_lib

package
v0.0.0-...-71bf9ba Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 28, 2020 License: Apache-2.0 Imports: 4 Imported by: 0

README

中国新闻网-滚动新闻栏目

说明

只是爬取滚动新闻栏目(共10页)

代码说明

1.直接访问滚动新闻栏目地址(http://www.chinanews.com/scroll-news/news1.html)
2.获取分页导航
3.获取分页链接

刚开始学习,写得不好,多多指教 WeChat:gaoyawei616

Documentation

Index

Constants

This section is empty.

Variables

View Source
// FileTest is the spider definition for the chinanews.com scroll-news
// section. It is organised as three chained rules:
//
//  1. "滚动新闻" reads the pagination links of the entry page and queues
//     each one for the "新闻列表" rule.
//  2. "新闻列表" reads every list entry (category, title, time, link) and
//     queues the linked article for the "新闻内容" rule, carrying the
//     list metadata along in Temp.
//  3. "新闻内容" extracts the article body and source and outputs one
//     record per article.
var FileTest = &Spider{
	Name:        "中国新闻网",
	Description: "测试 [http://www.chinanews.com/scroll-news/news1.html]",

	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the crawl with the first scroll-news page.
		Root: func(ctx *Context) {
			ctx.AddQueue(&request.Request{
				Url:  "http://www.chinanews.com/scroll-news/news1.html",
				Rule: "滚动新闻",
			})
		},

		Trunk: map[string]*Rule{

			// Entry page: follow every link found in the pagination box.
			"滚动新闻": {
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()

					navBox := query.Find(".pagebox a")
					navBox.Each(func(i int, s *goquery.Selection) {
						// The href is appended to the site host, so it is
						// presumably site-relative — TODO confirm against the page.
						if url, ok := s.Attr("href"); ok {
							ctx.AddQueue(&request.Request{
								Url:  "http://www.chinanews.com" + url,
								Rule: "新闻列表",
							})
						}

					})

				},
			},

			// List page: one queued article request per list entry.
			"新闻列表": {
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()

					newList := query.Find(".content_list li")
					newList.Each(func(i int, s *goquery.Selection) {

						newsType := s.Find(".dd_lm a").Text()

						newsTitle := s.Find(".dd_bt a").Text()

						newsTime := s.Find(".dd_time").Text()

						url, ok := s.Find(".dd_bt a").Attr("href")
						if !ok {
							return
						}

						// Normalise the link by prefix instead of blindly
						// dropping the first two bytes, which panicked on
						// short/empty hrefs and corrupted anything that was
						// not protocol-relative.
						switch {
						case strings.HasPrefix(url, "//"):
							// Protocol-relative link; same result as the
							// previous "http://" + url[2:].
							url = "http:" + url
						case strings.HasPrefix(url, "http://"), strings.HasPrefix(url, "https://"):
							// Already absolute; use as-is.
						case strings.HasPrefix(url, "/"):
							url = "http://www.chinanews.com" + url
						default:
							// Empty or otherwise unusable href; skip the entry.
							return
						}

						ctx.AddQueue(&request.Request{
							Url:  url,
							Rule: "新闻内容",
							Temp: map[string]interface{}{
								"newsType":  newsType,
								"newsTitle": newsTitle,
								"newsTime":  newsTime,
							},
						})

					})

				},
			},

			// Article page: emit one record per article.
			"新闻内容": {
				ItemFields: []string{
					"类别",
					"来源",
					"标题",
					"内容",
					"时间",
				},

				ParseFunc: func(ctx *Context) {
					// Number of bytes skipped from the start of the last
					// "来源" marker: the 6-byte marker itself plus 3 more
					// bytes, presumably a full-width colon — TODO confirm
					// against the page markup.
					const labelSkip = 9

					query := ctx.GetDom()

					content := query.Find(".left_zw").Text()

					from := query.Find(".left-t").Text()
					i := strings.LastIndex(from, "来源")

					if i == -1 {
						from = "未知"
					} else {
						// Guard the slice: when fewer than labelSkip bytes
						// follow the marker the old code panicked; treat it
						// as an empty source so the fallback below applies.
						if start := i + labelSkip; start <= len(from) {
							from = from[start:]
						} else {
							from = ""
						}
						from = strings.Replace(from, "参与互动", "", 1)
						if from == "" {
							// Fallback: take the text of the third ".left-t"
							// match instead.
							from = query.Find(".left-t").Eq(2).Text()
							from = strings.Replace(from, "参与互动", "", 1)
						}
					}

					// Keys 0-4 presumably map positionally onto ItemFields
					// above — verify against the framework's Output contract.
					ctx.Output(map[int]interface{}{
						0: ctx.GetTemp("newsType", ""),
						1: from,
						2: ctx.GetTemp("newsTitle", ""),
						3: content,
						4: ctx.GetTemp("newsTime", ""),
					})
				},
			},
		},
	},
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL