pholcus_lib

package
v0.0.0-...-fbc1b07 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 4, 2022 License: Apache-2.0 Imports: 10 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// taobaoTotalPageRe captures the digits of the `"totalPage":"N",` field in a
// listing response. It is compiled once at package scope rather than on every
// parse.
var taobaoTotalPageRe = regexp.MustCompile(`"totalPage":"(\d+)",`)

// Taobao is the Spider that collects Taobao/Tmall product data, starting from
// the category index at http://list.taobao.com/browse/cat-0.htm.
//
// Rules are chained as wired below:
// 生成请求 (categories) -> 列表页数 (page count) -> 商品列表 (items) ->
// 商品详情 (attributes) -> 商品评论 (comments) -> 结果 (output).
var Taobao = &Spider{
	Name:        "淘宝数据",
	Description: "淘宝天猫商品数据 [Auto Page] [http://list.taobao.com/]",

	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the crawl with the category index page.
		Root: func(ctx *Context) {
			ctx.AddQueue(&request.Request{
				Url:  "http://list.taobao.com/browse/cat-0.htm",
				Rule: "生成请求",
				Header: http.Header{
					"Cookie": []string{cookies_Taobao},
				},
			})
		},

		Trunk: map[string]*Rule{

			"生成请求": {
				// AidFunc queues one listing request per (page, location) pair.
				// It reads these aid keys:
				//   "loop"    [2]int   half-open page range [from, to)
				//   "urlBase" string   scheme-less listing URL
				//   "Rule"    string   rule that parses the queued responses
				//   "Temp"    map      temp data attached to every request
				// The per-request assertions stay inside the loops so that an
				// empty range never touches the other keys, as before.
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						for _, loc := range loc_Taobao {
							ctx.AddQueue(&request.Request{
								// data-value is the item offset: 96 items per page.
								Url:  "http:" + aid["urlBase"].(string) + "&_input_charset=utf-8&json=on&viewIndex=1&as=0&atype=b&style=grid&same_info=1&tid=0&isnew=2&data-action&module=page&s=0&loc=" + loc + "&pSize=96&data-key=s&data-value=" + strconv.Itoa(loop[0]*96),
								Rule: aid["Rule"].(string),
								Header: http.Header{
									"Cookie": []string{cookies_Taobao},
								},
								Temp: aid["Temp"].(map[string]interface{}),
							})
						}
					}
					return nil
				},
				// ParseFunc walks the three-level category tree and, for every
				// leaf category link, asks AidFunc to queue its first listing
				// page under the 列表页数 rule.
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					query.Find(".J_TBMarketCat").Each(func(i int, a *goquery.Selection) {
						type1 := a.Find("h4").Text()
						a.Find(".section").Each(func(i int, b *goquery.Selection) {
							type2 := b.Find(".subtitle a").Text()
							b.Find(".sublist a").Each(func(i int, c *goquery.Selection) {
								type3 := c.Text()
								href3, ok := c.Attr("href")
								if !ok || href3 == "" {
									// Without an href the generated URL would be
									// the malformed "http:&_input_charset=...".
									return
								}

								ctx.Aid(map[string]interface{}{
									"loop":    [2]int{0, 1},
									"urlBase": href3,
									"Rule":    "列表页数",
									"Temp": map[string]interface{}{
										"type1": type1,
										"type2": type2,
										"type3": type3,
									},
								})
							})
						})
					})
				},
			},

			"列表页数": {
				// ParseFunc reads totalPage from the first listing response,
				// queues the remaining pages through 生成请求's AidFunc, and then
				// parses the current response as the first page of items.
				ParseFunc: func(ctx *Context) {
					body := ctx.GetText()

					// totalPage stays 0 when the field is absent; the current
					// response is still handed to 商品列表, which reports its
					// own error if the payload is unusable.
					totalPage := 0
					if m := taobaoTotalPageRe.FindStringSubmatch(body); m != nil {
						n, err := strconv.Atoi(m[1])
						if err != nil {
							// Only reachable on overflow; fall back to parsing
							// just the current page.
							logs.Log.Error("列表页数解析错误: %v\n", err)
						} else if n == 0 {
							logs.Log.Critical("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName())
							return
						} else {
							totalPage = n
						}
					}

					// Page 0 is the response in hand, so start from page 1.
					ctx.Aid(map[string]interface{}{
						"loop":    [2]int{1, totalPage},
						"urlBase": ctx.GetUrl(),
						"Rule":    "商品列表",
						"Temp":    ctx.CopyTemps(),
					}, "生成请求")
					ctx.Parse("商品列表")
				},
			},

			"商品列表": {
				// ParseFunc decodes a JSON listing page and queues one
				// 商品详情 request per item, carrying the item's fields as Temp.
				ParseFunc: func(ctx *Context) {
					j := ctx.GetText()

					infos := map[string]interface{}{}
					err := json.Unmarshal([]byte(j), &infos)
					if err != nil {
						logs.Log.Error("商品列表解析错误: %v\n", err)
						return
					}
					// A missing key and a non-array value are both treated as
					// "no content" instead of panicking on the assertion.
					items, ok := infos["mallItemList"].([]interface{})
					if !ok {
						logs.Log.Error("商品列表解析错误: 内容不存在!")
						return
					}
					for _, item := range items {
						item2, ok := item.(map[string]interface{})
						if !ok {
							continue
						}
						// The detail request needs the item link; skip entries
						// that do not provide one rather than panic.
						href, ok := item2["href"].(string)
						if !ok || href == "" {
							logs.Log.Error("商品列表解析错误: 商品链接不存在!")
							continue
						}
						storeURL := ""
						if s, ok := item2["storeLink"].(string); ok {
							storeURL = "http:" + s
						}
						itemURL := "http:" + href

						// Indices map onto 结果's ItemFields.
						temp := ctx.CreatItem(map[int]interface{}{
							0:  item2["title"],
							1:  item2["price"],
							2:  item2["currentPrice"],
							3:  item2["vipPrice"],
							4:  item2["unitPrice"],
							5:  item2["unit"],
							6:  item2["isVirtual"],
							7:  item2["ship"],
							8:  item2["tradeNum"],
							9:  item2["formatedNum"],
							10: item2["nick"],
							11: item2["sellerId"],
							12: item2["guarantee"],
							13: item2["itemId"],
							14: item2["isLimitPromotion"],
							15: item2["loc"],
							16: storeURL,
							17: itemURL,
							18: item2["commend"],
							19: item2["source"],
							20: item2["ratesum"],
							21: item2["goodRate"],
							22: item2["dsrScore"],
							23: item2["spSource"],
						}, "结果")
						ctx.AddQueue(&request.Request{
							Url:      itemURL,
							Rule:     "商品详情",
							Temp:     temp,
							Priority: 1,
						})
					}
				},
			},

			"商品详情": {

				// ParseFunc collects the attribute list of a product page into
				// field 24 of 结果, initialises field 25 (comments) to an empty
				// list, and queues the first comment page.
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()

					detail := make(map[string]string)

					if li := query.Find(".attributes-list ul li"); len(li.Nodes) != 0 {

						li.Each(func(i int, s *goquery.Selection) {
							// NOTE(review): if Text() returns entity-decoded
							// text, the literal "&nbsp;" never matches and
							// every entry is skipped here — confirm against a
							// live page.
							slice := strings.Split(s.Text(), ":&nbsp;")
							if len(slice) < 2 {
								// No "name: value" separator; nothing to record.
								return
							}

							slice[1] = strings.Replace(slice[1], "&nbsp;", "&#124;", -1)
							detail[slice[0]] = UnicodeToUTF8(slice[1])
						})

					} else {

						query.Find(".attributes-list li").Each(func(i int, s *goquery.Selection) {
							slice := strings.Split(s.Text(), ": ")
							if len(slice) < 2 {
								return
							}
							detail[slice[0]] = slice[1]
						})
					}

					temp := ctx.CopyTemps()
					temp[ctx.GetItemField(24, "结果")] = detail
					temp[ctx.GetItemField(25, "结果")] = []interface{}{}

					// NOTE(review): other temps in this spider are keyed by
					// GetItemField names, while these lookups use "sellerId"
					// and "itemId"; if CreatItem keys by field name they fall
					// back to "" — confirm.
					ctx.AddQueue(&request.Request{
						Rule: "商品评论",
						Url: "http://rate.taobao.com/feedRateList.htm?siteID=4&rateType=&orderType=sort_weight&showContent=1&userNumId=" +
							ctx.GetTemp("sellerId", "").(string) +
							"&auctionNumId=" +
							ctx.GetTemp("itemId", "").(string) +
							"&currentPageNum=1",
						Temp:     temp,
						Priority: 2,
					})
				},
			},

			"商品评论": {
				// ParseFunc appends one page of comments to field 25 and either
				// queues the next comment page or hands over to 结果.
				ParseFunc: func(ctx *Context) {
					// The payload is wrapped in parentheses; trim surrounding
					// whitespace first so the wrapper is always stripped.
					j := strings.TrimSpace(ctx.GetText())
					j = strings.TrimLeft(j, "(")
					j = strings.TrimRight(j, ")")

					infos := map[string]interface{}{}
					if err := json.Unmarshal([]byte(j), &infos); err != nil {
						logs.Log.Error("商品评论解析错误: %v\n", err)
						return
					}
					if infos["comments"] == nil || infos["maxPage"] == nil || infos["currentPageNum"] == nil {
						logs.Log.Error("商品评论解析错误: 内容不存在!")
						return
					}

					// encoding/json decodes JSON numbers into float64 when the
					// target is interface{}, so asserting int would always
					// panic. Accept numbers and numeric strings.
					toInt := func(v interface{}) (int, bool) {
						switch n := v.(type) {
						case float64:
							return int(n), true
						case string:
							i, err := strconv.Atoi(n)
							return i, err == nil
						}
						return 0, false
					}

					discussSlice, okComments := infos["comments"].([]interface{})
					currentPageNum, okCur := toInt(infos["currentPageNum"])
					maxPage, okMax := toInt(infos["maxPage"])
					if !okComments || !okCur || !okMax {
						logs.Log.Error("商品评论解析错误: 内容不存在!")
						return
					}

					var discussAll = ctx.GetTemp(ctx.GetItemField(25, "结果"), []interface{}{}).([]interface{})
					discussAll = append(discussAll, discussSlice...)
					temp := ctx.CopyTemps()
					temp[ctx.GetItemField(25, "结果")] = discussAll

					if currentPageNum < maxPage {

						ctx.AddQueue(&request.Request{
							Rule: "商品评论",
							Url: "http://rate.taobao.com/feedRateList.htm?siteID=4&rateType=&orderType=sort_weight&showContent=1&userNumId=" +
								ctx.GetTemp("sellerId", "").(string) +
								"&auctionNumId=" +
								ctx.GetTemp("itemId", "").(string) +
								"&currentPageNum=" +
								strconv.Itoa(currentPageNum+1),
							Temp: temp,
						})
					} else {

						// NOTE(review): 结果 outputs ctx.CopyTemps(), not the
						// local temp built above; if CopyTemps returns a copy,
						// the comments appended on this final page are not in
						// the output — confirm.
						ctx.Parse("结果")
					}
				},
			},

			"结果": {

				// ItemFields names the output columns; the indices used with
				// CreatItem and GetItemField above refer to this order.
				ItemFields: []string{
					"标题",
					"原价",
					"现价",
					"会员价",
					"单价",
					"单位",
					"是否虚拟物品",
					"ship",
					"tradeNum",
					"formatedNum",
					"店铺",
					"店铺ID",
					"guarantee",
					"货号",
					"isLimitPromotion",
					"发货地",
					"店铺链接",
					"商品链接",
					"评价",
					"source",
					"店铺信誉",
					"店铺好评率",
					"dsrScore",
					"spSource",
					"规格参数",
					"评论内容",
				},
				// ParseFunc emits the temps accumulated on the request as one
				// output record.
				ParseFunc: func(ctx *Context) {

					ctx.Output(ctx.CopyTemps())
				},
			},
		},
	},
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL