pholcus_lib

package
v0.0.0-...-9397c3f Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 16, 2017 License: Apache-2.0 Imports: 5 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// IJGUC is the spider for the "IJGUC" journal hosted on inderscience.com.
//
// The crawl has three stages, linked by rule name:
//
//   - Root queues the table-of-contents page for 2016, volume 7, issue 1.
//   - "期刊列表" follows every link found under #eventbody1 .. #eventbody7.
//   - "文章列表" follows every even-indexed link in the second table of the
//     second matching cell of the article listing.
//   - "文章页" extracts the labelled fields listed in ItemFields from the
//     text of #col1 and outputs one record per page.
var IJGUC = &Spider{
	Name:        "IJGUC期刊",
	Description: "IJGUC期刊",

	EnableCookie: false,
	RuleTree: &RuleTree{
		Root: func(ctx *Context) {
			ctx.AddQueue(&request.Request{
				Url:  "http://www.inderscience.com/info/inarticletoc.php?jcode=ijguc&year=2016&vol=7&issue=1",
				Rule: "期刊列表",
			})
		},

		Trunk: map[string]*Rule{
			"期刊列表": {
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					// The listing is spread over containers #eventbody1 .. #eventbody7.
					for i := 1; i <= 7; i++ {
						selector := "#eventbody" + strconv.Itoa(i) + " a"
						query.Find(selector).Each(func(_ int, s *goquery.Selection) {
							if url, ok := s.Attr("href"); ok {
								ctx.AddQueue(&request.Request{Url: url, Rule: "文章列表"})
							}
						})
					}
				},
			},
			"文章列表": {
				ParseFunc: func(ctx *Context) {
					cells := ctx.GetDom().Find("#journalcol1 article table tbody tr td")
					cells.Each(func(i int, td *goquery.Selection) {
						// Only the second matching cell is inspected.
						if i != 1 {
							return
						}
						td.Find("table").Each(func(j int, table *goquery.Selection) {
							// Only the second table inside that cell is inspected.
							if j != 1 {
								return
							}
							table.Find("a").Each(func(k int, a *goquery.Selection) {
								// Only even-indexed anchors are followed.
								if k%2 != 0 {
									return
								}
								if url, ok := a.Attr("href"); ok {
									ctx.AddQueue(&request.Request{Url: url, Rule: "文章页"})
								}
							})
						})
					})
				},
			},
			"文章页": {

				ItemFields: []string{
					"Title",
					"Author",
					"Addresses",
					"Journal",
					"Abstract",
					"Keywords",
					"DOI",
				},
				ParseFunc: func(ctx *Context) {
					content := ctx.GetDom().Find("#col1").Text()
					content = ijgucTagRe.ReplaceAllString(content, "")

					// Each field is the text between its label and the next one.
					// A label that is missing yields "" instead of a panic.
					title := ijgucCapture(content, ijgucTitleRe)
					// Pages label the affiliation block either "Addresses:" or
					// "Address:"; the plural form is tried first.
					author := ijgucCapture(content, ijgucAuthorPluralRe, ijgucAuthorSingularRe)
					addresses := ijgucCapture(content, ijgucAddressesRe, ijgucAddressRe)
					journal := ijgucCapture(content, ijgucJournalRe)
					abstract := ijgucCapture(content, ijgucAbstractRe)
					keywords := ijgucCapture(content, ijgucKeywordsRe)
					doi := ijgucDOI(content)

					// Nothing recognisable on the page: do not emit an empty record.
					if title+author+addresses+journal+abstract+keywords+doi == "" {
						return
					}

					ctx.Output(map[int]interface{}{
						0: title,
						1: author,
						2: addresses,
						3: journal,
						4: abstract,
						5: keywords,
						6: doi,
					})
				},
			},
		},
	},
}

// ijgucDOILen is the number of characters kept after the "DOI: " label.
// NOTE(review): 43 is carried over from the original code; it presumably
// equals the length of the DOI link text on these pages — confirm.
const ijgucDOILen = 43

// Patterns used by the "文章页" rule, compiled once instead of on every page.
var (
	ijgucTagRe            = regexp.MustCompile(`\<[\S\s]+?\>`)
	ijgucTitleRe          = regexp.MustCompile("Title:(.*?)Author:")
	ijgucAuthorPluralRe   = regexp.MustCompile("Author:(.*?)Addresses:")
	ijgucAuthorSingularRe = regexp.MustCompile("Author:(.*?)Address:")
	ijgucAddressesRe      = regexp.MustCompile("Addresses:(.*?)Journal:")
	ijgucAddressRe        = regexp.MustCompile("Address:(.*?)Journal:")
	ijgucJournalRe        = regexp.MustCompile("Journal:(.*?)Abstract:")
	ijgucAbstractRe       = regexp.MustCompile("Abstract:(.*?)Keywords:")
	ijgucKeywordsRe       = regexp.MustCompile("Keywords:(.*?)DOI:")
	ijgucDOIRe            = regexp.MustCompile("DOI: ")
)

// ijgucCapture returns the first capture group of the first pattern that
// matches text, trying patterns in order. It returns "" when none match.
func ijgucCapture(text string, patterns ...*regexp.Regexp) string {
	for _, re := range patterns {
		if m := re.FindStringSubmatch(text); len(m) > 1 {
			return m[1]
		}
	}
	return ""
}

// ijgucDOI returns up to ijgucDOILen characters following the first "DOI: "
// label in text, or "" when the label is absent.
//
// The match offset is a byte offset, so the text is sliced as bytes first and
// only the remainder is converted to runes. The original code subtracted a
// fixed 8 from the byte offset and used it as a rune index, which presumably
// compensated for multi-byte characters on the pages it was tested with and
// went wrong (or out of range) on any other page.
func ijgucDOI(text string) string {
	loc := ijgucDOIRe.FindStringIndex(text)
	if loc == nil {
		return ""
	}
	tail := []rune(text[loc[1]:])
	if len(tail) > ijgucDOILen {
		tail = tail[:ijgucDOILen]
	}
	return string(tail)
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL