otherrule

package
v0.0.0-...-4498091 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 8, 2017 License: Apache-2.0 Imports: 11 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// BlogIds lists the blog ids that the Wxb spider's Root function turns into
// article-detail requests.
var BlogIds = []int{
	1344636005,
}
View Source
// Hejin is a spider whose Root takes a plugin.php URL from Keyin, validates
// its path and query parameters, and queues the plugin's "top300" page. The
// "top300" rule then scans the response text for user ids.
var Hejin = &Spider{
	Name:         "HEJIN",
	Description:  `HEJIN 自定义输入格式 url`,
	Pausetime:    2000,
	Keyin:        KEYIN,
	Limit:        LIMIT,
	EnableCookie: true,
	RuleTree: &RuleTree{
		// Root validates the user-supplied URL and queues the top300 request.
		// Inputs of 12 bytes or fewer are replaced by a built-in default URL.
		Root: func(ctx *Context) {
			param := ctx.GetKeyin()
			if len(param) <= 12 {
				logs.Log.Warning("自定义输入的url参数不正确! use default")

				param = `http://tzxts.lzyjdzsw.com/plugin.php?id=hejin_toupiao&model=detail&zid=20`
			}

			// A failed parse yields a nil *url.URL; bail out instead of
			// dereferencing it below.
			urlParsed, err := url.Parse(param)
			if err != nil {
				logs.Log.Error("不是有效的url,parse failed: %v", err)
				return
			}
			// ParseQuery still returns every pair it managed to decode, so a
			// partly malformed query is reported but does not abort the run.
			urlParams, err := url.ParseQuery(urlParsed.RawQuery)
			if err != nil {
				logs.Log.Warning("url query is partly malformed: %v", err)
			}
			logs.Log.Error("host=%v script=%v query=%v", urlParsed.Host, urlParsed.Path, urlParsed.RawQuery)

			urlModel, modelExist := urlParams["model"]
			if !modelExist || len(urlModel) == 0 {
				logs.Log.Error("不是有效的url,model not exist,有效的url应该类似:http://tzxts.lzyjdzsw.com/plugin.php?id=hejin_toupiao&model=detail&zid=1")
				return
			}
			if urlParsed.Path != "/plugin.php" {
				logs.Log.Error("不是有效的url,plugin.php not exist,有效的url应该类似:http://tzxts.lzyjdzsw.com/plugin.php?id=hejin_toupiao&model=detail&zid=1 %v", urlParsed.Path)
				return
			}

			pluginId, pluginIdExist := urlParams["id"]
			if !pluginIdExist || len(pluginId) == 0 {
				logs.Log.Error("不是有效的url,pluginId not exist,有效的url应该类似:http://tzxts.lzyjdzsw.com/plugin.php?id=hejin_toupiao&model=detail&zid=1 ")
				return
			}
			logs.Log.Error("pluginId=%v urlModel=%v", pluginId, urlModel)

			// vid is optional and defaults to "1".
			vid, vidExist := urlParams["vid"]
			if !vidExist || len(vid) == 0 {
				vid = []string{"1"}
			}
			logs.Log.Error("vid=%v", vid)

			zid, zidExist := urlParams["zid"]
			if !zidExist || len(zid) == 0 {
				// zid is empty in this branch, so report the offending input
				// rather than indexing zid[0].
				logs.Log.Error("没有匹配到要投票的用户 请输入带zid的url %v", param)
				return
			}

			urlPre := urlParsed.Scheme + "://" + urlParsed.Host + urlParsed.Path + "?id=" + pluginId[0]

			urlTop300 := urlPre + "&model=top300&vid=" + vid[0]
			logs.Log.Warning("will top300: %v", urlTop300)

			ctx.AddQueue(&request.Request{
				Url:  urlTop300,
				Rule: "top300",
				Header: http.Header{
					"Cookie":     []string{},
					"Referer":    []string{param},
					"User-Agent": []string{"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"},
				},
				DownloaderID: 0,
				Temp: map[string]interface{}{
					"zid":      zid[0],
					"model":    urlModel[0],
					"pluginId": pluginId[0],
					"urlPre":   urlPre,
					"vid":      vid[0],
				},
			})
		},

		Trunk: map[string]*Rule{
			// top300 cuts the rank300 <div> out of the response text and
			// collects the four digits that follow every "</span><span>1"
			// marker as a uid. The result is only logged.
			"top300": {
				ParseFunc: func(ctx *Context) {
					logs.Log.Warning("start top300: url=%v", ctx.GetUrl())

					if ctx.Response == nil {
						logs.Log.Error("no response!")
						return
					}

					// head returns at most the first n bytes of s so that log
					// previews never slice past the end of a short text.
					head := func(s string, n int) string {
						if len(s) > n {
							return s[:n]
						}
						return s
					}

					textContent := ctx.GetText()
					logs.Log.Warning("the textContent len=%v %v", len(textContent), head(textContent, 32))

					// window is the part of the page still under consideration;
					// rankContent is set only when both the opening tag and a
					// closing </div> were found.
					window := textContent
					var rankContent string
					// strings.Index reports -1 when absent; offset 0 is a valid match.
					if rankIdx := strings.Index(window, `<div class="rank300" id="top300">`); rankIdx >= 0 {
						logs.Log.Warning("find div rank300 at: %d", rankIdx)
						window = window[rankIdx:]
						// window now starts with the opening tag, so "</div>"
						// cannot be at offset 0.
						if rankEndIdx := strings.Index(window, "</div>"); rankEndIdx > 0 {
							logs.Log.Warning("find /div at: %d", rankEndIdx)

							window = window[:rankEndIdx]
							rankContent = window
						}
					}
					logs.Log.Warning("the rankContent len=%v %v", len(rankContent), head(window, 32))
					if len(rankContent) == 0 {
						logs.Log.Error("no rank content in text:%v", window)
						return
					}

					// The site's markup mixes gb2312 and utf8 encodings, so
					// goquery cannot parse it; scan the text by hand instead.
					const (
						uidMarker = "</span><span>1"
						uidDigits = 4
					)
					var uids []int
					for {
						spanIdx := strings.Index(rankContent, uidMarker)
						if spanIdx < 0 {
							break
						}
						uidStart := spanIdx + len(uidMarker)
						uidEnd := uidStart + uidDigits
						if uidEnd > len(rankContent) {
							// Marker sits too close to the end to carry a full uid.
							break
						}
						logs.Log.Warning("find a span: %v", rankContent[spanIdx:uidEnd])
						uid, err := strconv.Atoi(rankContent[uidStart:uidEnd])
						// Always advance past this marker, even on a bad uid.
						rankContent = rankContent[uidEnd:]
						if err != nil {
							logs.Log.Warning("skip non-numeric uid: %v", err)
							continue
						}
						uids = append(uids, uid)
					}
					logs.Log.Warning("uids: len=%d %v", len(uids), uids)
					return

					// NOTE(review): everything below is unreachable because of
					// the return above. It is the disabled goquery-based path
					// that would queue "dcexcel" requests; kept unchanged.
					r := strings.NewReader(rankContent)
					rankQuery, _ := goquery.NewDocumentFromReader(r)

					rankQuery.Add(rankContent).Find(".list li").Each(func(i int, s *goquery.Selection) {
						subUid := s.Find("span").Eq(1).Text()
						if len(subUid) > 0 {
							logs.Log.Warning("we find a uid:%v", subUid)
							uid, _ := strconv.Atoi(subUid)
							uids = append(uids, uid-10000)
							url := ctx.GetTemp("urlPre", "").(string) + "&model=dcexcel&zid=" + strconv.FormatInt(int64(uid), 10)
							logs.Log.Warning("will dcexcel: %v", url)

							ctx.AddQueue(&request.Request{
								Url:  url,
								Rule: "dcexcel",
								Header: http.Header{
									"Cookie":     []string{},
									"Referer":    []string{url},
									"User-Agent": []string{"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"},
								},
								DownloaderID: 0,
							})
						}
					})

					ctx.SetTemp("uids", uids)
					logs.Log.Warning("has got uids:%v", uids)
				},
			},
			// dcexcel currently only logs the response length and returns.
			"dcexcel": {
				ItemFields: []string{
					"blogId",
					"文章标题",
					"文章内容",
				},
				ParseFunc: func(ctx *Context) {

					query := ctx.GetDom()
					var url = ctx.GetUrl()
					logs.Log.Warning("start dcexcel:%v", url)
					text := ctx.GetText()
					if len(text) > 0 {
						logs.Log.Warning("len=%v url=%v", len(text), url)
						return
					}
					return

					// NOTE(review): unreachable — both paths above return.
					// Kept as the disabled extraction step.
					blogIdStr := hejinBlogID(url)

					var detail = query.Find("#blogDetailDiv").Text()
					var title = query.Find(".blog_tit_detail").Eq(0).Text()
					logs.Log.Error("blogid=%v title=%v len(detail)=%v ", blogIdStr, title, len(detail))
					rowRet := map[int]interface{}{
						0: blogIdStr,
						1: title,
						2: detail,
					}

					ctx.Output(rowRet)
				},
			},
			// votea outputs the blog id taken from the request URL together
			// with the title and detail text found in the page.
			"votea": {
				ItemFields: []string{
					"blogId",
					"文章标题",
					"文章内容",
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					blogIdStr := hejinBlogID(ctx.GetUrl())

					var detail = query.Find("#blogDetailDiv").Text()
					var title = query.Find(".blog_tit_detail").Eq(0).Text()
					logs.Log.Error("blogid=%v title=%v len(detail)=%v ", blogIdStr, title, len(detail))
					rowRet := map[int]interface{}{
						0: blogIdStr,
						1: title,
						2: detail,
					}

					ctx.Output(rowRet)
				},
			},
		},
	},
}

// hejinBlogID returns the text that follows "blogid=" in rawURL, cut at the
// next "&". It returns "" when the marker is missing, sits at offset 0, or
// the URL ends before the value starts. The window is clamped to the URL
// length so short URLs no longer cause an out-of-range slice.
//
// NOTE(review): the value is read from offset +9 although "blogid=" is only
// 7 bytes long, so two leading characters are skipped. This offset is kept
// as it was; confirm it is intended.
func hejinBlogID(rawURL string) string {
	idx := strings.Index(rawURL, "blogid=")
	if idx <= 0 {
		return ""
	}
	start, end := idx+9, idx+30
	if start > len(rawURL) {
		return ""
	}
	if end > len(rawURL) {
		end = len(rawURL)
	}
	id := rawURL[start:end]
	if ampIdx := strings.Index(id, "&"); ampIdx > 0 {
		id = id[:ampIdx]
	}
	return id
}
View Source
// WeiboFans is a spider that queues the fan-list pages of a Weibo account
// and, for every fan found on them, queues that fan's profile page. Keyin
// must have the form "ID"::"Cookie".
var WeiboFans = &Spider{
	Name:         "微博粉丝列表",
	Description:  `新浪微博粉丝 [自定义输入格式 "ID"::"Cookie"][最多支持250页,内设定时1~2s]`,
	Pausetime:    2000,
	Keyin:        KEYIN,
	Limit:        LIMIT,
	EnableCookie: true,
	RuleTree: &RuleTree{
		// Root splits Keyin into account id and cookie, then queues two
		// series of fan-list pages, each capped by the spider limit.
		Root: func(ctx *Context) {
			parts := strings.Split(ctx.GetKeyin(), "::")
			if len(parts) != 2 {
				logs.Log.Error("自定义输入的参数不正确!")
				return
			}
			id, cookie := strings.Trim(parts[0], " "), strings.Trim(parts[1], " ")

			// Page budgets for the two listings: at most 250 and 50 pages.
			firstPages, secondPages := 250, 50
			if ctx.GetLimit() < firstPages {
				firstPages = ctx.GetLimit()
			}
			if ctx.GetLimit() < secondPages {
				secondPages = ctx.GetLimit()
			}

			// enqueue schedules one fan-list page with the user's cookie.
			enqueue := func(pageURL string) {
				ctx.AddQueue(&request.Request{
					Url:          pageURL,
					Rule:         "好友列表",
					Header:       http.Header{"Cookie": []string{cookie}},
					DownloaderID: 0,
				})
			}

			// First listing is queued from the highest page down to page 1.
			for page := firstPages; page > 0; page-- {
				enqueue("http://weibo.com/" + id + "/fans?cfs=600&relate=fans&t=1&f=1&type=&Pl_Official_RelationFans__68_page=" + strconv.Itoa(page) + "#Pl_Official_RelationFans__68")
			}
			// Second listing is queued in ascending page order.
			for page := 1; page <= secondPages; page++ {
				enqueue("http://www.weibo.com/" + id + "/fans?cfs=&relate=fans&t=5&f=1&type=&Pl_Official_RelationFans__68_page=" + strconv.Itoa(page) + "#Pl_Official_RelationFans__68")
			}
		},

		Trunk: map[string]*Rule{
			// 好友列表 reads every fan entry on a list page and queues the
			// fan's profile page, carrying the scraped fields along in Temp.
			"好友列表": {
				ParseFunc: func(ctx *Context) {
					dom := ctx.GetDom()
					fmt.Println(dom.Find(".follow_list").Text())
					dom.Find(".follow_list .mod_info").Each(func(_ int, item *goquery.Selection) {
						fmt.Println("222")

						nameLink := item.Find(".info_name a")
						name, _ := nameLink.Attr("title")
						fmt.Println(name)
						href, _ := nameLink.Attr("href")

						// Strip "/u" and any remaining slashes from the link
						// to obtain the id used in the profile URL.
						uid := strings.Replace(href, "/u", "", -1)
						uid = strings.Replace(uid, "/", "", -1)

						verified := ""
						if _, ok := item.Find(".info_name i").Attr("title"); ok {
							verified = "认证"
						}

						counters := item.Find(".info_connect em a")
						following := counters.Eq(0).Text()
						fans := counters.Eq(1).Text()
						posts := counters.Eq(2).Text()
						fmt.Println(following, fans, posts)

						ctx.AddQueue(&request.Request{
							Url:          "http://weibo.com/p/100505" + uid + "/info?mod=pedit_more",
							Rule:         "好友资料",
							DownloaderID: 0,
							Temp: map[string]interface{}{
								"好友名":  name,
								"好友ID": uid,
								"认证":   verified,
								"关注":   following,
								"粉丝":   fans,
								"微博":   posts,
							},
						})
					})
				},
			},
			// 好友资料 outputs the fields carried in Temp plus every
			// title/detail pair found in the profile page's ".li_1" items.
			"好友资料": {
				ItemFields: []string{
					"好友名",
					"好友ID",
					"认证",
					"关注",
					"粉丝",
					"微博",
				},
				ParseFunc: func(ctx *Context) {
					dom := ctx.GetDom()

					attrs := map[string]string{}
					dom.Find(".li_1").Each(func(_ int, item *goquery.Selection) {
						title := Deprive2(item.Find(".pt_title").Text())
						detail := Deprive2(item.Find(".pt_detail").Text())
						attrs[title] = detail
					})

					// Fixed columns come first, in ItemFields order.
					row := map[int]interface{}{}
					for col, key := range []string{"好友名", "好友ID", "认证", "关注", "粉丝", "微博"} {
						row[col] = ctx.GetTemp(key, "")
					}
					// Profile attributes go into the column UpsertItemField
					// returns for their title.
					for title, detail := range attrs {
						row[ctx.UpsertItemField(title)] = detail
					}

					ctx.Output(row)
				},
			},
		},
	},
}
View Source
// Wxb is a spider that requests the blog_output_data page for the first two
// entries of BlogIds and outputs each article's title, text and URL. Keyin
// must have the form "ID"::"Cookie".
var Wxb = &Spider{
	Name:         "微小宝 ",
	Description:  `微小宝阅读量前100的文章 [自定义输入格式 "ID"::"Cookie"][最多支持250页,内设定时1~2s]`,
	Pausetime:    2000,
	Keyin:        KEYIN,
	Limit:        LIMIT,
	EnableCookie: true,
	RuleTree: &RuleTree{
		// Root splits Keyin into id and cookie and queues one article-detail
		// request per blog id, stopping after the second one.
		Root: func(ctx *Context) {
			parts := strings.Split(ctx.GetKeyin(), "::")
			if len(parts) != 2 {
				logs.Log.Error("自定义输入的参数不正确!")
				return
			}
			id, cookie := strings.Trim(parts[0], " "), strings.Trim(parts[1], " ")

			const urlHost = "https://h5.qzone.qq.com"
			for idx, blogid := range BlogIds {
				// Only the first two blog ids are requested.
				if idx >= 2 {
					break
				}

				urlPath := "/proxy/domain/b.qzone.qq.com/cgi-bin/blognew/blog_output_data?uin=" + id +
					"&blogid=" + strconv.Itoa(blogid) +
					"&styledm=qzonestyle.gtimg.cn&imgdm=qzs.qq.com&bdm=b.qzone.qq.com&mode=2&numperpage=15&timestamp=" + strconv.FormatInt(time.Now().Unix(), 10) +
					"&dprefix=&blogseed=0.6215156952384859&inCharset=gb2312&outCharset=gb2312&ref=qzone&entertime=1502288565153&cdn_use_https=1"

				ctx.AddQueue(&request.Request{
					Url:  urlHost + urlPath,
					Rule: "文章详情",
					Header: http.Header{
						"Cookie":     []string{cookie},
						"Referer":    []string{"https://qzs.qq.com/qzone/newblog/blogcanvas.html"},
						"User-Agent": []string{"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"},
					},
					DownloaderID: 0,
				})
			}
		},

		Trunk: map[string]*Rule{
			// 文章列表 logs every ".article" entry and stores the titles of
			// the first three in the output row, overwriting the Temp values
			// placed at the same indices.
			"文章列表": {
				ItemFields: []string{
					"文章名",
					"blogid",
					"url",
				},
				ParseFunc: func(ctx *Context) {
					dom := ctx.GetDom()

					row := map[int]interface{}{}
					for col, key := range []string{"好友名", "好友ID", "认证"} {
						row[col] = ctx.GetTemp(key, "")
					}

					dom.Find(".article").Each(func(i int, article *goquery.Selection) {
						logs.Log.Error("this is eq %d", i)
						if i >= 3 {
							return
						}

						link := article.Find(".c_tx2 a")
						title, _ := link.Attr("title")
						name := link.Find("span").Text()
						fmt.Println(name)
						href, _ := link.Attr("href")
						blogid, _ := link.Attr("blogid")

						logs.Log.Error("i=%d title=%s name=%v url=%v blogid=%v\n", i, title, name, href, blogid)
						row[i] = title
					})

					ctx.Output(row)
				},
			},
			// 文章详情 outputs the article title, its plain-text body and
			// the request URL, after logging the raw HTML for inspection.
			"文章详情": {
				ItemFields: []string{
					"文章标题",
					"文章内容",
					"url",
				},
				ParseFunc: func(ctx *Context) {
					dom := ctx.GetDom()
					pageURL := ctx.GetUrl()

					body := dom.Find("#blogDetailDiv")
					detailHTML, detailErr := body.Html()
					detail := body.Text()
					title := dom.Find(".blog_tit_detail").Eq(0).Text()
					pageHTML, pageErr := dom.Html()
					logs.Log.Error("title=%v detail=%v detailHtml=%v e1=%v body=%v e2=%v", title, detail, detailHTML, detailErr, pageHTML, pageErr)

					ctx.Output(map[int]interface{}{
						0: title,
						1: detail,
						2: pageURL,
					})
				},
			},
		},
	},
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL