Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
// BlogIds holds the blog ids that the Wxb spider's Root iterates over when
// it builds its article requests.
var BlogIds = []int{
	1344636005,
}
View Source
var Hejin = &Spider{ Name: "HEJIN", Description: `HEJIN 自定义输入格式 url`, Pausetime: 2000, Keyin: KEYIN, Limit: LIMIT, EnableCookie: true, RuleTree: &RuleTree{ Root: func(ctx *Context) { param := ctx.GetKeyin() if len(param) <= 12 { logs.Log.Warning("自定义输入的url参数不正确! use default") param = `http://tzxts.lzyjdzsw.com/plugin.php?id=hejin_toupiao&model=detail&zid=20` } urlParsed, _ := url.Parse(param) urlParams, _ := url.ParseQuery(urlParsed.RawQuery) logs.Log.Error("host=%v script=%v query=%v", urlParsed.Host, urlParsed.Path, urlParsed.RawPath) urlModel, modelExist := urlParams["model"] if !modelExist || len(urlModel) == 0 { logs.Log.Error("不是有效的url,model not exist,有效的url应该类似:http://tzxts.lzyjdzsw.com/plugin.php?id=hejin_toupiao&model=detail&zid=1") return } if urlParsed.Path != "/plugin.php" { logs.Log.Error("不是有效的url,plugin.php not exist,有效的url应该类似:http://tzxts.lzyjdzsw.com/plugin.php?id=hejin_toupiao&model=detail&zid=1 %v", urlParsed.Path) return } pluginId, pluginIdExist := urlParams["id"] if !pluginIdExist || len(pluginId) == 0 { logs.Log.Error("不是有效的url,pluginId not exist,有效的url应该类似:http://tzxts.lzyjdzsw.com/plugin.php?id=hejin_toupiao&model=detail&zid=1 ") return } logs.Log.Error("pluginId=%v urlModel=%v", pluginId, urlModel) vid, vidExist := urlParams["vid"] if !vidExist || len(vid) == 0 { vid = make([]string, 1) vid[0] = "1" } logs.Log.Error("vid=%v", vid) zid, zidExist := urlParams["zid"] if !zidExist || len(zid) == 0 { logs.Log.Error("没有匹配到要投票的用户 请输入带zid的url %v", zid[0]) return } urlPre := urlParsed.Scheme + "://" + urlParsed.Host + urlParsed.Path + "?id=" + pluginId[0] urlTop300 := urlPre + "&model=top300&vid=" + vid[0] logs.Log.Warning("will top300: %v", urlTop300) ctx.AddQueue(&request.Request{ Url: urlTop300, Rule: "top300", Header: http.Header{ "Cookie": []string{}, "Referer": []string{param}, "User-Agent": []string{"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}, }, DownloaderID: 0, Temp: 
map[string]interface{}{ "zid": zid[0], "model": urlModel[0], "pluginId": pluginId[0], "urlPre": urlPre, "vid": vid[0], }, }) }, Trunk: map[string]*Rule{ "top300": { ParseFunc: func(ctx *Context) { logs.Log.Warning("start top300: url=%v", ctx.GetUrl()) if ctx.Response == nil { logs.Log.Error("no response!") return } textContent := ctx.GetText() logs.Log.Warning("the textContent len=%v %v", len(textContent), textContent[:32]) tempContent := []byte(textContent) rankIdx := strings.Index(textContent, `<div class="rank300" id="top300">`) var rankContent string if rankIdx > 0 { logs.Log.Warning("find div rank300 at: %d", rankIdx) tempContent = tempContent[rankIdx:] textContent = string(tempContent) rankEndIdx := strings.Index(textContent, "</div>") if rankEndIdx > 0 { logs.Log.Warning("find /div at: %d", rankEndIdx) tempContent = tempContent[:rankEndIdx] rankContent = string(tempContent) } } logs.Log.Warning("the rankContent len=%v %v", len(rankContent), string(tempContent[:32])) if len(rankContent) == 0 { logs.Log.Error("no rank content in text:%v", string(tempContent)) return } // 因为该网站的代码比较垃圾 编码混乱,gb2312和utf8混排,导致goquery无法解析,只能手动 var uids []int tempContent = []byte(rankContent) for { spanIdx := strings.Index(rankContent, "</span><span>1") uid := 0 if spanIdx > 0 { logs.Log.Warning("find a span: %v", string(tempContent[spanIdx:spanIdx+18])) uid, _ = strconv.Atoi(string(tempContent[spanIdx+14 : spanIdx+18])) uids = append(uids, uid) tempContent = tempContent[spanIdx+18:] rankContent = string(tempContent) continue } break } logs.Log.Warning("uids: len=%d %v", len(uids), uids) return r := strings.NewReader(rankContent) rankQuery, _ := goquery.NewDocumentFromReader(r) rankQuery.Add(rankContent).Find(".list li").Each(func(i int, s *goquery.Selection) { subUid := s.Find("span").Eq(1).Text() if len(subUid) > 0 { logs.Log.Warning("we find a uid:%v", subUid) uid, _ := strconv.Atoi(subUid) uids = append(uids, uid-10000) url := ctx.GetTemp("urlPre", "").(string) + 
"&model=dcexcel&zid=" + strconv.FormatInt(int64(uid), 10) logs.Log.Warning("will dcexcel: %v", url) ctx.AddQueue(&request.Request{ Url: url, Rule: "dcexcel", Header: http.Header{ "Cookie": []string{}, "Referer": []string{url}, "User-Agent": []string{"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}, }, DownloaderID: 0, }) } }) ctx.SetTemp("uids", uids) logs.Log.Warning("has got uids:%v", uids) }, }, "dcexcel": { ItemFields: []string{ "blogId", "文章标题", "文章内容", }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() var url = ctx.GetUrl() logs.Log.Warning("start dcexcel:%v", url) text := ctx.GetText() if len(text) > 0 { logs.Log.Warning("len=%v url=%v", len(text), url) return } return //var blogId int64 = 0 var blogIdStr string if blogIdIdx := strings.Index(url, "blogid="); blogIdIdx > 0 { blogIdStr = url[blogIdIdx+9 : blogIdIdx+30] if commaIdx := strings.Index(blogIdStr, "&"); commaIdx > 0 { blogIdStr = blogIdStr[:commaIdx] } } var detail = query.Find("#blogDetailDiv").Text() var title = query.Find(".blog_tit_detail").Eq(0).Text() logs.Log.Error("blogid=%v title=%v len(detail)=%v ", blogIdStr, title, len(detail)) rowRet := map[int]interface{}{ 0: blogIdStr, 1: title, 2: detail, } ctx.Output(rowRet) }, }, "votea": { ItemFields: []string{ "blogId", "文章标题", "文章内容", }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() var url = ctx.GetUrl() //var blogId int64 = 0 var blogIdStr string if blogIdIdx := strings.Index(url, "blogid="); blogIdIdx > 0 { blogIdStr = url[blogIdIdx+9 : blogIdIdx+30] if commaIdx := strings.Index(blogIdStr, "&"); commaIdx > 0 { blogIdStr = blogIdStr[:commaIdx] } } var detail = query.Find("#blogDetailDiv").Text() var title = query.Find(".blog_tit_detail").Eq(0).Text() logs.Log.Error("blogid=%v title=%v len(detail)=%v ", blogIdStr, title, len(detail)) rowRet := map[int]interface{}{ 0: blogIdStr, 1: title, 2: detail, } ctx.Output(rowRet) }, }, }, }, }
View Source
var WeiboFans = &Spider{ Name: "微博粉丝列表", Description: `新浪微博粉丝 [自定义输入格式 "ID"::"Cookie"][最多支持250页,内设定时1~2s]`, Pausetime: 2000, Keyin: KEYIN, Limit: LIMIT, EnableCookie: true, RuleTree: &RuleTree{ Root: func(ctx *Context) { param := strings.Split(ctx.GetKeyin(), "::") if len(param) != 2 { logs.Log.Error("自定义输入的参数不正确!") return } id := strings.Trim(param[0], " ") cookie := strings.Trim(param[1], " ") var count1 = 250 var count2 = 50 if ctx.GetLimit() < count1 { count1 = ctx.GetLimit() } if ctx.GetLimit() < count2 { count2 = ctx.GetLimit() } for i := count1; i > 0; i-- { ctx.AddQueue(&request.Request{ Url: "http://weibo.com/" + id + "/fans?cfs=600&relate=fans&t=1&f=1&type=&Pl_Official_RelationFans__68_page=" + strconv.Itoa(i) + "#Pl_Official_RelationFans__68", Rule: "好友列表", Header: http.Header{"Cookie": []string{cookie}}, DownloaderID: 0, }) } for i := 1; i <= count2; i++ { ctx.AddQueue(&request.Request{ Url: "http://www.weibo.com/" + id + "/fans?cfs=&relate=fans&t=5&f=1&type=&Pl_Official_RelationFans__68_page=" + strconv.Itoa(i) + "#Pl_Official_RelationFans__68", Rule: "好友列表", Header: http.Header{"Cookie": []string{cookie}}, DownloaderID: 0, }) } }, Trunk: map[string]*Rule{ "好友列表": { ParseFunc: func(ctx *Context) { query := ctx.GetDom() fmt.Println(query.Find(".follow_list").Text()) query.Find(".follow_list .mod_info").Each(func(i int, s *goquery.Selection) { fmt.Println("222") name, _ := s.Find(".info_name a").Attr("title") fmt.Println(name) url, _ := s.Find(".info_name a").Attr("href") uid := strings.Replace(url, "/u", "", -1) uid = strings.Replace(uid, "/", "", -1) url = "http://weibo.com/p/100505" + uid + "/info?mod=pedit_more" var 认证 string = "" if _, isExist := s.Find(".info_name i").Attr("title"); isExist { 认证 = "认证" } 关注 := s.Find(".info_connect em a").Eq(0).Text() 粉丝 := s.Find(".info_connect em a").Eq(1).Text() 微博 := s.Find(".info_connect em a").Eq(2).Text() fmt.Println(关注, 粉丝, 微博) x := &request.Request{ Url: url, Rule: "好友资料", DownloaderID: 0, Temp: 
map[string]interface{}{ "好友名": name, "好友ID": uid, "认证": 认证, "关注": 关注, "粉丝": 粉丝, "微博": 微博, }, } ctx.AddQueue(x) }) }, }, "好友资料": { ItemFields: []string{ "好友名", "好友ID", "认证", "关注", "粉丝", "微博", }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() var 属性 map[string]string var title string var detail string query.Find(".li_1").Each(func(i int, s *goquery.Selection) { if 属性 == nil { 属性 = map[string]string{} } title = s.Find(".pt_title").Text() title = Deprive2(title) detail = s.Find(".pt_detail").Text() detail = Deprive2(detail) 属性[title] = detail }) 结果 := map[int]interface{}{ 0: ctx.GetTemp("好友名", ""), 1: ctx.GetTemp("好友ID", ""), 2: ctx.GetTemp("认证", ""), 3: ctx.GetTemp("关注", ""), 4: ctx.GetTemp("粉丝", ""), 5: ctx.GetTemp("微博", ""), } for k, v := range 属性 { idx := ctx.UpsertItemField(k) 结果[idx] = v } ctx.Output(结果) }, }, }, }, }
View Source
var Wxb = &Spider{ Name: "微小宝 ", Description: `微小宝阅读量前100的文章 [自定义输入格式 "ID"::"Cookie"][最多支持250页,内设定时1~2s]`, Pausetime: 2000, Keyin: KEYIN, Limit: LIMIT, EnableCookie: true, RuleTree: &RuleTree{ Root: func(ctx *Context) { param := strings.Split(ctx.GetKeyin(), "::") if len(param) != 2 { logs.Log.Error("自定义输入的参数不正确!") return } id := strings.Trim(param[0], " ") cookie := strings.Trim(param[1], " ") var i = 0 for _, blogid := range BlogIds { i++ if i > 2 { break } urlHost := "https://h5.qzone.qq.com" urlPath := "/proxy/domain/b.qzone.qq.com/cgi-bin/blognew/blog_output_data?uin=" + id + "&blogid=" + strconv.Itoa(blogid) + "&styledm=qzonestyle.gtimg.cn&imgdm=qzs.qq.com&bdm=b.qzone.qq.com&mode=2&numperpage=15×tamp=" + strconv.FormatInt(time.Now().Unix(), 10) + "&dprefix=&blogseed=0.6215156952384859&inCharset=gb2312&outCharset=gb2312&ref=qzone&entertime=1502288565153&cdn_use_https=1" ctx.AddQueue(&request.Request{ Url: urlHost + urlPath, Rule: "文章详情", Header: http.Header{ "Cookie": []string{cookie}, "Referer": []string{"https://qzs.qq.com/qzone/newblog/blogcanvas.html"}, "User-Agent": []string{"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}, }, DownloaderID: 0, }) } }, Trunk: map[string]*Rule{ "文章列表": { ItemFields: []string{ "文章名", "blogid", "url", }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() 结果 := map[int]interface{}{ 0: ctx.GetTemp("好友名", ""), 1: ctx.GetTemp("好友ID", ""), 2: ctx.GetTemp("认证", ""), } query.Find(".article").Each(func(i int, s *goquery.Selection) { logs.Log.Error("this is eq %d", i) if i >= 3 { return } artLink := s.Find(".c_tx2 a") title, _ := artLink.Attr("title") name := artLink.Find("span").Text() fmt.Println(name) url, _ := artLink.Attr("href") blogid, _ := artLink.Attr("blogid") logs.Log.Error("i=%d title=%s name=%v url=%v blogid=%v\n", i, title, name, url, blogid) 结果[i] = title }) ctx.Output(结果) }, }, "文章详情": { ItemFields: []string{ "文章标题", "文章内容", "url", }, ParseFunc: 
func(ctx *Context) { query := ctx.GetDom() var url = ctx.GetUrl() var detailHtml, e1 = query.Find("#blogDetailDiv").Html() //var detailHtmlEach = query.Find("#blogDetailDiv").Each(func(n int){}) var detail = query.Find("#blogDetailDiv").Text() var title = query.Find(".blog_tit_detail").Eq(0).Text() var bodyHtml, e2 = query.Html() logs.Log.Error("title=%v detail=%v detailHtml=%v e1=%v body=%v e2=%v", title, detail, detailHtml, e1, bodyHtml, e2) 结果 := map[int]interface{}{ 0: title, 1: detail, 2: url, } ctx.Output(结果) }, }, }, }, }
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.