pholcus_lib

package
v0.0.0-...-6df184f Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 31, 2018 License: Apache-2.0 Imports: 8 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// GoogleSearch is the spider definition that searches Google for the
// configured keyword (ctx.GetKeyin()) and emits one record per result.
//
// Flow, as implemented by the rule tree below:
//
//   - Root probes each address in googleIp until one can be fetched, then
//     queues that address as the first result page under "total_pages".
//   - "total_pages" reads the hit count from #resultStats, converts it into a
//     page count (10 results per page, capped by ctx.GetLimit()), asks the
//     rule's AidFunc to queue the remaining pages, and hands the current page
//     to "search_results".
//   - "search_results" extracts title, snippet and link from every "#ires .g"
//     element and outputs them in ItemFields order.
var GoogleSearch = &Spider{
	Name:        "Google search",
	Description: "Crawls pages from [www.google.com]",

	Keyin:        KEYIN,
	Limit:        LIMIT,
	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root picks the first reachable mirror and seeds the crawl with it.
		Root: func(ctx *Context) {
			var url string
			var success bool
			logs.Log.Informational("Running google spider,this may take some time...")

			for _, ip := range googleIp {
				url = "http://" + ip + "/?gws_rd=ssl#q=" + ctx.GetKeyin()
				logs.Log.Informational("examination " + ip)
				// The fetched document is discarded: this request is only a
				// reachability probe for the candidate address.
				if _, err := goquery.NewDocument(url); err == nil {
					success = true
					break
				}
			}
			if !success {
				logs.Log.Critical("Could not reach any of the Google mirrors")
				return
			}
			logs.Log.Critical("Starting Google search ...")
			ctx.AddQueue(&request.Request{
				Url:  url,
				Rule: "total_pages",
				// Carried along so "total_pages" can build the URLs of the
				// follow-up pages from the same base.
				Temp: map[string]interface{}{
					"baseUrl": url,
				},
			})
		},

		Trunk: map[string]*Rule{

			"total_pages": {
				// AidFunc queues one request per page index in the half-open
				// range aid["loop"] = [from, to), using aid["urlBase"] plus a
				// "&start=" offset of 10 results per page, each handled by
				// the rule named in aid["Rule"]. It always returns nil.
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&request.Request{
							Url:  aid["urlBase"].(string) + "&start=" + strconv.Itoa(10*loop[0]),
							Rule: aid["Rule"].(string),
						})
					}
					return nil
				},
				// ParseFunc derives the number of result pages from the
				// "#resultStats" text and schedules the remaining pages.
				ParseFunc: func(ctx *Context) {
					txt := ctx.GetDom().Find("#resultStats").Text()
					// Drop thousands separators so that "1,234,567" becomes a
					// single digit run, then keep the first digit run found.
					txt = strings.Replace(txt, ",", "", -1)
					txt = regexp.MustCompile(`[\d]+`).FindString(txt)
					// A failed conversion is deliberately not fatal: when no
					// digits were found num stays 0 and the total == 0 branch
					// below reports the empty result.
					num, _ := strconv.Atoi(txt)
					total := int(math.Ceil(float64(num) / 10))
					if total > ctx.GetLimit() {
						total = ctx.GetLimit()
					} else if total == 0 {
						logs.Log.Critical("[ERROR:| Spider:%v | KEYIN:%v | Rule:%v] Did not fetch any data!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName())
						return
					}

					// Page 0 is the response being parsed right now, so the
					// follow-up requests start at page index 1.
					// NOTE(review): ctx.Aid presumably dispatches to this
					// rule's AidFunc; confirm against the Context API.
					ctx.Aid(map[string]interface{}{
						"loop":    [2]int{1, total},
						"urlBase": ctx.GetTemp("baseUrl", ""),
						"Rule":    "search_results",
					})

					// NOTE(review): presumably re-parses the current response
					// with the "search_results" rule; confirm against the
					// Context API.
					ctx.Parse("search_results")
				},
			},

			"search_results": {

				// Field order must match the integer keys passed to
				// ctx.Output in ParseFunc below.
				ItemFields: []string{
					"title",
					"content",
					"href",
				},
				// ParseFunc emits one record per "#ires .g" element.
				ParseFunc: func(ctx *Context) {
					ctx.GetDom().Find("#ires .g").Each(func(i int, s *goquery.Selection) {
						link := s.Find(".r > a")
						// A missing href attribute yields "", which is output
						// unchanged.
						href, _ := link.Attr("href")
						// Remove the literal "/url?q=" redirect prefix.
						// TrimPrefix is required here: TrimLeft would treat
						// the argument as a character set and could also eat
						// leading characters of the real target.
						href = strings.TrimPrefix(href, "/url?q=")
						logs.Log.Informational(href)
						title := link.Text()
						content := s.Find(".st").Text()
						ctx.Output(map[int]interface{}{
							0: title,
							1: content,
							2: href,
						})
					})
				},
			},
		},
	},
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL