Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var IJGUC = &Spider{ Name: "IJGUC期刊", Description: "IJGUC期刊", EnableCookie: false, RuleTree: &RuleTree{ Root: func(ctx *Context) { ctx.AddQueue(&request.Request{ Url: "http://www.inderscience.com/info/inarticletoc.php?jcode=ijguc&year=2016&vol=7&issue=1", Rule: "期刊列表", }) }, Trunk: map[string]*Rule{ "期刊列表": { ParseFunc: func(ctx *Context) { query := ctx.GetDom() for i := 1; i <= 7; i++ { id := "#eventbody" + strconv.Itoa(i) + " a" query.Find(id).Each(func(j int, s *goquery.Selection) { if url, ok := s.Attr("href"); ok { ctx.AddQueue(&request.Request{Url: url, Rule: "文章列表"}) } }) } }, }, "文章列表": { ParseFunc: func(ctx *Context) { query := ctx.GetDom() query.Find("#journalcol1 article table tbody tr td").Each(func(i int, td *goquery.Selection) { if i == 1 { td.Find("table").Each(func(j int, table *goquery.Selection) { if j == 1 { table.Find("a").Each(func(k int, a *goquery.Selection) { if k%2 == 0 { if url, ok := a.Attr("href"); ok { ctx.AddQueue(&request.Request{Url: url, Rule: "文章页"}) } } }) } }) } }) }, }, "文章页": { ItemFields: []string{ "Title", "Author", "Addresses", "Journal", "Abstract", "Keywords", "DOI", }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() content := query.Find("#col1").Text() re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") content = re.ReplaceAllString(content, "") re, _ = regexp.Compile("Title:(.*?)Author:") title := re.FindStringSubmatch(content)[1] re, _ = regexp.Compile("Author:(.*?)Addresses:") au := re.FindStringSubmatch(content) var author string if len(au) > 0 { author = au[1] } else { re, _ = regexp.Compile("Author:(.*?)Address:") author = re.FindStringSubmatch(content)[1] } re, _ = regexp.Compile("Addresses:(.*?)Journal:") address := re.FindStringSubmatch(content) var addresses string if len(address) > 0 { addresses = address[1] } else { re, _ = regexp.Compile("Address:(.*?)Journal:") addresses = re.FindStringSubmatch(content)[1] } re, _ = regexp.Compile("Journal:(.*?)Abstract:") journal := re.FindStringSubmatch(content)[1] re, _ = regexp.Compile("Abstract:(.*?)Keywords:") abstract := re.FindStringSubmatch(content)[1] re, _ = regexp.Compile("Keywords:(.*?)DOI:") keywords := re.FindStringSubmatch(content)[1] re, _ = regexp.Compile("DOI: ") doiIndex := re.FindStringSubmatchIndex(content) rs := []rune(content) left := doiIndex[1] - 8 right := left + 43 doi := string(rs[left:right]) ctx.Output(map[int]interface{}{ 0: title, 1: author, 2: addresses, 3: journal, 4: abstract, 5: keywords, 6: doi, }) }, }, }, }, }
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.