Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var Wangyi = &Spider{ Name: "网易新闻", Description: "网易排行榜新闻,含点击/跟帖排名 [Auto Page] [news.163.com/rank]", EnableCookie: false, RuleTree: &RuleTree{ Root: func(ctx *Context) { ctx.AddQueue(&request.Request{Url: "http://news.163.com/rank/", Rule: "排行榜主页"}) }, Trunk: map[string]*Rule{ "排行榜主页": { ParseFunc: func(ctx *Context) { query := ctx.GetDom() query.Find(".subNav a").Each(func(i int, s *goquery.Selection) { if url, ok := s.Attr("href"); ok { ctx.AddQueue(&request.Request{Url: url, Rule: "新闻排行榜"}) } }) }, }, "新闻排行榜": { ParseFunc: func(ctx *Context) { topTit := []string{ "1小时前点击排行", "24小时点击排行", "本周点击排行", "今日跟帖排行", "本周跟帖排行", "本月跟贴排行", } query := ctx.GetDom() newsType := query.Find(".titleBar h2").Text() urls_top := map[string]string{} query.Find(".tabContents").Each(func(n int, t *goquery.Selection) { t.Find("tr").Each(func(i int, s *goquery.Selection) { if i == 0 { return } url, ok := s.Find("a").Attr("href") top := s.Find(".cBlue").Text() if ok { urls_top[url] += topTit[n] + ":" + top + "," } }) }) for k, v := range urls_top { ctx.AddQueue(&request.Request{ Url: k, Rule: "热点新闻", Temp: map[string]interface{}{ "newsType": newsType, "top": v, }, }) } }, }, "热点新闻": { ItemFields: []string{ "标题", "内容", "排名", "类别", "ReleaseTime", }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() if pageAll := query.Find(".ep-pages-all"); len(pageAll.Nodes) != 0 { if pageAllUrl, ok := pageAll.Attr("href"); ok { ctx.AddQueue(&request.Request{ Url: pageAllUrl, Rule: "热点新闻", Temp: ctx.CopyTemps(), }) } return } title := query.Find("#h1title").Text() content := query.Find("#endText").Text() re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") content = re.ReplaceAllString(content, "") release := query.Find(".ep-time-soure").Text() release = strings.Split(release, "来源:")[0] release = strings.Trim(release, " \t\n") ctx.Output(map[int]interface{}{ 0: title, 1: content, 2: ctx.GetTemp("top", ""), 3: ctx.GetTemp("newsType", ""), 4: release, }) }, }, }, }, }
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.