pholcus_lib

package
v0.0.0-...-71bf9ba Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 28, 2020 License: Apache-2.0 Imports: 8 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// AlibabaProduct describes a spider for the 1688.com product search page
// (s.1688.com/selloffer/offer_search.htm).
//
// The rule tree has two rules:
//   - "生成请求" builds paged search requests for the configured key-in text
//     and inspects the first response to decide how many pages to fetch.
//   - "搜索结果" extracts one output row per offer from a result page.
var AlibabaProduct = &Spider{
	Name:        "阿里巴巴产品搜索",
	Description: "阿里巴巴产品搜索 [s.1688.com/selloffer/offer_search.htm]",

	Keyin:        KEYIN,
	Limit:        LIMIT,
	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the crawl: it hands the "生成请求" rule a loop range of
		// [0, 1), which yields a single request for beginPage=1.
		Root: func(ctx *Context) {
			seed := map[string]interface{}{
				"loop": [2]int{0, 1},
				"Rule": "生成请求",
			}
			ctx.Aid(seed, "生成请求")
		},

		Trunk: map[string]*Rule{

			"生成请求": {
				// AidFunc queues one search request per value in the half-open
				// range aid["loop"] = [from, to). The requested page number is
				// the loop value plus one, and each request is tagged with the
				// rule name given in aid["Rule"]. It always returns nil.
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					const searchURL = "http://s.1688.com/selloffer/offer_search.htm?enableAsync=false&earseDirect=false&button_click=top&pageSize=60&n=y&offset=3&uniqfield=pic_tag_id&keyins="

					// The key-in text is passed through EncodeString with the
					// "gbk" charset before being placed in the query string.
					encoded := EncodeString(ctx.GetKeyin(), "gbk")

					span := aid["loop"].([2]int)
					for page := span[0]; page < span[1]; page++ {
						target := searchURL + encoded + "&beginPage=" + strconv.Itoa(page+1)
						ctx.AddQueue(&request.Request{
							Url:    target,
							Rule:   aid["Rule"].(string),
							Header: http.Header{"Content-Type": []string{"text/html; charset=gbk"}},
						})
					}
					return nil
				},
				// ParseFunc examines the first search response.
				//
				// When no pagination element is present it logs a notice (the
				// message says only one page per sub-category can be crawled
				// because of an AJAX redirect issue) and queues every
				// ".sm-floorhead-typemore a" link for the "搜索结果" rule.
				//
				// Otherwise it reads the total page count, caps it at
				// ctx.GetLimit(), requests the remaining pages and parses the
				// current page as a result page.
				ParseFunc: func(ctx *Context) {
					dom := ctx.GetDom()

					pager := dom.Find("#sm-pagination div[data-total-page]")

					if len(pager.Nodes) == 0 {
						logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 由于跳转AJAX问题,目前只能每个子类抓取 1 页……\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName())
						dom.Find(".sm-floorhead-typemore a").Each(func(_ int, link *goquery.Selection) {
							href, found := link.Attr("href")
							if !found {
								// Anchors without an href cannot be followed.
								return
							}
							ctx.AddQueue(&request.Request{
								Url:    href,
								Header: http.Header{"Content-Type": []string{"text/html; charset=gbk"}},
								Rule:   "搜索结果",
							})
						})
						return
					}

					// A missing attribute or a non-numeric value leaves the
					// page count at 0, which is reported by the switch below.
					rawPages, _ := pager.First().Attr("data-total-page")
					pageCount, _ := strconv.Atoi(strings.Trim(rawPages, " \t\n"))

					switch {
					case pageCount > ctx.GetLimit():
						pageCount = ctx.GetLimit()
					case pageCount == 0:
						logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName())
						return
					}

					// Loop range [1, pageCount) maps to beginPage=2..pageCount in
					// AidFunc. No rule name is passed here; presumably the
					// framework then uses the current rule's AidFunc — confirm
					// against Context.Aid.
					remaining := map[string]interface{}{
						"loop": [2]int{1, pageCount},
						"Rule": "搜索结果",
					}
					ctx.Aid(remaining)

					// Hand the already-downloaded page to the result rule.
					ctx.Parse("搜索结果")
				},
			},

			"搜索结果": {

				// Column names for the output rows; the integer keys passed to
				// ctx.Output below follow this order.
				ItemFields: []string{
					"公司",
					"标题",
					"价格",
					"销量",
					"星级",
					"地址",
					"链接",
				},
				// ParseFunc emits one output row for every "#sm-offer-list > li"
				// element on the page.
				ParseFunc: func(ctx *Context) {
					offers := ctx.GetDom().Find("#sm-offer-list > li")

					offers.Each(func(_ int, offer *goquery.Selection) {
						company, _ := offer.Find("a.sm-offer-companyName").First().Attr("title")

						// The title anchor supplies both the title and the link.
						anchor := offer.Find(".sm-offer-title > a:nth-child(1)")
						title, _ := anchor.Attr("title")
						link, _ := anchor.Attr("href")

						price := offer.Find(".sm-offer-priceNum").First().Text()
						sales := offer.Find("span.sm-offer-trade > em").First().Text()
						address, _ := offer.Find(".sm-offer-location").First().Attr("title")
						level := offer.Find("span.sm-offer-companyTag > a.sw-ui-flaticon-cxt16x16").First().Text()

						row := map[int]interface{}{
							0: company,
							1: title,
							2: price,
							3: sales,
							4: level,
							5: address,
							6: link,
						}
						ctx.Output(row)
					})
				},
			},
		},
	},
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL