Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var Taobao = &Spider{ Name: "淘宝数据", Description: "淘宝天猫商品数据 [Auto Page] [http://list.taobao.com/]", EnableCookie: false, RuleTree: &RuleTree{ Root: func(ctx *Context) { ctx.AddQueue(&request.Request{ Url: "http://list.taobao.com/browse/cat-0.htm", Rule: "生成请求", Header: http.Header{ "Cookie": []string{cookies_Taobao}, }, }) }, Trunk: map[string]*Rule{ "生成请求": { AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { for _, loc := range loc_Taobao { ctx.AddQueue(&request.Request{ Url: "http:" + aid["urlBase"].(string) + "&_input_charset=utf-8&json=on&viewIndex=1&as=0&atype=b&style=grid&same_info=1&tid=0&isnew=2&data-action&module=page&s=0&loc=" + loc + "&pSize=96&data-key=s&data-value=" + strconv.Itoa(loop[0]*96), Rule: aid["Rule"].(string), Header: http.Header{ "Cookie": []string{cookies_Taobao}, }, Temp: aid["Temp"].(map[string]interface{}), }) } } return nil }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() query.Find(".J_TBMarketCat").Each(func(i int, a *goquery.Selection) { type1 := a.Find("h4").Text() a.Find(".section").Each(func(i int, b *goquery.Selection) { type2 := b.Find(".subtitle a").Text() b.Find(".sublist a").Each(func(i int, c *goquery.Selection) { type3 := c.Text() href3, _ := c.Attr("href") ctx.Aid(map[string]interface{}{ "loop": [2]int{0, 1}, "urlBase": href3, "Rule": "列表页数", "Temp": map[string]interface{}{ "type1": type1, "type2": type2, "type3": type3, }, }) }) }) }) }, }, "列表页数": { ParseFunc: func(ctx *Context) { json := ctx.GetText() re, _ := regexp.Compile(`(?U)"totalPage":"[\d]+",`) total := re.FindString(json) re, _ = regexp.Compile(`[\d]+`) total = re.FindString(total) total = strings.Trim(total, " \t\n") totalPage, _ := strconv.Atoi(total) if total == "0" { logs.Log.Critical("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) } else { ctx.Aid(map[string]interface{}{ "loop": [2]int{1, totalPage}, "urlBase": ctx.GetUrl(), "Rule": "商品列表", "Temp": ctx.CopyTemps(), }, "生成请求") ctx.Parse("商品列表") } }, }, "商品列表": { ParseFunc: func(ctx *Context) { j := ctx.GetText() infos := map[string]interface{}{} err := json.Unmarshal([]byte(j), &infos) if err != nil { logs.Log.Error("商品列表解析错误: %v\n", err) return } if infos["mallItemList"] == nil { logs.Log.Error("商品列表解析错误: 内容不存在!") return } for _, item := range infos["mallItemList"].([]interface{}) { item2 := item.(map[string]interface{}) temp := ctx.CreatItem(map[int]interface{}{ 0: item2["title"], 1: item2["price"], 2: item2["currentPrice"], 3: item2["vipPrice"], 4: item2["unitPrice"], 5: item2["unit"], 6: item2["isVirtual"], 7: item2["ship"], 8: item2["tradeNum"], 9: item2["formatedNum"], 10: item2["nick"], 11: item2["sellerId"], 12: item2["guarantee"], 13: item2["itemId"], 14: item2["isLimitPromotion"], 15: item2["loc"], 16: "http:" + item2["storeLink"].(string), 17: "http:" + item2["href"].(string), 18: item2["commend"], 19: item2["source"], 20: item2["ratesum"], 21: item2["goodRate"], 22: item2["dsrScore"], 23: item2["spSource"], }, "结果") ctx.AddQueue(&request.Request{ Url: "http:" + item2["href"].(string), Rule: "商品详情", Temp: temp, Priority: 1, }) } }, }, "商品详情": { ParseFunc: func(ctx *Context) { query := ctx.GetDom() detail := make(map[string]string) if li := query.Find(".attributes-list ul li"); len(li.Nodes) != 0 { li.Each(func(i int, s *goquery.Selection) { native := s.Text() slice := strings.Split(native, ": ") slice[1] = strings.Replace(slice[1], " ", "|", -1) detail[slice[0]] = UnicodeToUTF8(slice[1]) }) } else { query.Find(".attributes-list li").Each(func(i int, s *goquery.Selection) { native := s.Text() slice := strings.Split(native, ": ") detail[slice[0]] = slice[1] }) } temp := ctx.CopyTemps() temp[ctx.GetItemField(24, "结果")] = detail temp[ctx.GetItemField(25, "结果")] = []interface{}{} ctx.AddQueue(&request.Request{ Rule: "商品评论", Url: "http://rate.taobao.com/feedRateList.htm?siteID=4&rateType=&orderType=sort_weight&showContent=1&userNumId=" + ctx.GetTemp("sellerId", "").(string) + "&auctionNumId=" + ctx.GetTemp("itemId", "").(string) + "¤tPageNum=1", Temp: temp, Priority: 2, }) }, }, "商品评论": { ParseFunc: func(ctx *Context) { j := ctx.GetText() j = strings.TrimLeft(j, "(") j = strings.TrimRight(j, ")") infos := map[string]interface{}{} if err := json.Unmarshal([]byte(j), &infos); err != nil { logs.Log.Error("商品评论解析错误: %v\n", err) return } if infos["comments"] == nil || infos["maxPage"] == nil || infos["currentPageNum"] == nil { logs.Log.Error("商品评论解析错误: 内容不存在!") return } discussSlice := infos["comments"].([]interface{}) var discussAll = ctx.GetTemp(ctx.GetItemField(25, "结果"), []interface{}{}).([]interface{}) discussAll = append(discussAll, discussSlice...) temp := ctx.CopyTemps() temp[ctx.GetItemField(25, "结果")] = discussAll currentPageNum := infos["currentPageNum"].(int) maxPage := infos["maxPage"].(int) if currentPageNum < maxPage { ctx.AddQueue(&request.Request{ Rule: "商品评论", Url: "http://rate.taobao.com/feedRateList.htm?siteID=4&rateType=&orderType=sort_weight&showContent=1&userNumId=" + ctx.GetTemp("sellerId", "").(string) + "&auctionNumId=" + ctx.GetTemp("itemId", "").(string) + "¤tPageNum=" + strconv.Itoa(currentPageNum+1), Temp: temp, }) } else { ctx.Parse("结果") } }, }, "结果": { ItemFields: []string{ "标题", "原价", "现价", "会员价", "单价", "单位", "是否虚拟物品", "ship", "tradeNum", "formatedNum", "店铺", "店铺ID", "guarantee", "货号", "isLimitPromotion", "发货地", "店铺链接", "商品链接", "评价", "source", "店铺信誉", "店铺好评率", "dsrScore", "spSource", "规格参数", "评论内容", }, ParseFunc: func(ctx *Context) { ctx.Output(ctx.CopyTemps()) }, }, }, }, }
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.