Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var FileTest = &Spider{ Name: "villagevoice", Description: "https://www.villagevoice.com/", EnableCookie: false, RuleTree: &RuleTree{ Root: func(ctx *Context) { ctx.AddQueue(&request.Request{ Url: "https://www.villagevoice.com/", Rule: "villagevoice", }) }, Trunk: map[string]*Rule{ "villagevoice": { ItemFields: []string{ "标题", "内容", "来源", "时间", }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() title := query.Find(".content-h1 h1").Text() newList := query.Find("a") newList.Each(func(i int, s *goquery.Selection) { if url, ok := s.Attr("href"); ok { if strings.HasPrefix(url, "//") { url = "https:" + url } else if strings.HasPrefix(url, "/") { url = "https://www.villagevoice.com" + url } if strings.Contains(url, "#") { url = url[:strings.LastIndex(url, "#")] } if !strings.Contains(url, "villagevoice") { return } ctx.AddQueue(&request.Request{ Url: url, Rule: "villagevoice", }) } }) content := util.RemoveNilLine(query.Find("#content").Text()) if len(content) < 100 { return } ctx.Output(map[int]interface{}{ 0: title, 1: content, 2: query.Url.String, 3: time.Now(), }) }, }, }, }, }
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.