Documentation ¶
Overview ¶
Package pagser is a simple, easy-to-use, extensible, and configurable library for parsing HTML into structs, based on goquery and struct tags. It is the parser library from scrago.
The project source code: https://github.com/foolin/pagser
Features ¶
* Simple - Use golang struct tag syntax.
* Easy - Easy use for your spider/crawler/colly application.
* Extensible - Support for extension functions.
* Struct tag grammar - Grammar is simple, like `pagser:"a->attr(href)"`.
* Nested Structure - Support Nested Structure for node.
* Configurable - Support configuration.
* GoQuery/Colly - Support all goquery project, such as go-colly.
More info: https://github.com/foolin/pagser
Index ¶
- type BuiltinFunctions
- func (builtin BuiltinFunctions) AbsHref(selection *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Attr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) AttrConcat(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) AttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) AttrSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachAttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachText(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachTextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachTextJoin(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndText(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) OutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Size(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Text(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) TextConcat(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) TextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) TextSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
- type BuiltinSelections
- func (builtin BuiltinSelections) Child(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinSelections) Eq(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinSelections) First(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinSelections) Last(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinSelections) Next(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinSelections) Parent(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinSelections) Parents(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinSelections) ParentsUntil(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinSelections) Prev(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinSelections) Siblings(node *goquery.Selection, args ...string) (out interface{}, err error)
- type CallFunc
- type Config
- type Pagser
- func (p *Pagser) Parse(v interface{}, document string) (err error)
- func (p *Pagser) ParseDocument(v interface{}, document *goquery.Document) (err error)
- func (p *Pagser) ParseReader(v interface{}, reader io.Reader) (err error)
- func (p *Pagser) ParseSelection(v interface{}, selection *goquery.Selection) (err error)
- func (p *Pagser) RegisterFunc(name string, fn CallFunc)
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BuiltinFunctions ¶ added in v0.0.7
type BuiltinFunctions struct { }
BuiltinFunctions builtin functions are registered with a lowercase initial, eg: Text -> text()
func (BuiltinFunctions) AbsHref ¶ added in v0.1.2
func (builtin BuiltinFunctions) AbsHref(selection *goquery.Selection, args ...string) (out interface{}, err error)
AbsHref absHref(baseUrl) gets the element's `href` attribute and converts it to an absolute URL, return *URL. `baseUrl` is the base URL, like `https://example.com/`.
//<a href="/foolin/pagser">Pagser</a> struct { Example string `pagser:".selector->absHref('https://github.com/')"` }
func (BuiltinFunctions) Attr ¶ added in v0.0.7
func (builtin BuiltinFunctions) Attr(node *goquery.Selection, args ...string) (out interface{}, err error)
Attr attr(name, defaultValue='') get element attribute value, return string.
//<a href="https://github.com/foolin/pagser">Pagser</a> struct { Example string `pagser:".selector->attr(href)"` }
func (BuiltinFunctions) AttrConcat ¶ added in v0.1.1
func (builtin BuiltinFunctions) AttrConcat(node *goquery.Selection, args ...string) (out interface{}, err error)
AttrConcat attrConcat(name, text1, $value, [ text2, ... text_n ]) gets the element attribute value by `name` and joins it with the given strings. `text1, text2, ... text_n` are the strings that you wish to join together, and `$value` is the placeholder for the attribute value, return string.
struct { Example string `pagser:".selector->attrConcat('href', 'Result:', '<', $value, '>')"` }
func (BuiltinFunctions) AttrEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) AttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
AttrEmpty attrEmpty(name, defaultValue) get element attribute value, return string.
//<a href="https://github.com/foolin/pagser">Pagser</a> struct { Example string `pagser:".selector->attrEmpty(href, '#')"` }
func (BuiltinFunctions) AttrSplit ¶ added in v0.0.7
func (builtin BuiltinFunctions) AttrSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
AttrSplit attrSplit(name, sep=',', trim='true') get attribute value and split by separator to array string, return []string.
struct { Examples []string `pagser:".selector->attrSplit('keywords', ',')"` }
func (BuiltinFunctions) EachAttr ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
EachAttr eachAttr(name) get each element attribute value, return []string.
//<a href="https://github.com/foolin/pagser">Pagser</a> struct { Examples []string `pagser:".selector->eachAttr(href)"` }
func (BuiltinFunctions) EachAttrEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) EachAttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
EachAttrEmpty eachAttrEmpty(name, defaultValue) get each element attribute value, return []string.
//<a href="https://github.com/foolin/pagser">Pagser</a> struct { Examples []string `pagser:".selector->eachAttrEmpty(href, '#')"` }
func (BuiltinFunctions) EachHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
EachHtml eachHtml() get each element inner html, return []string.
struct { Examples []string `pagser:".selector->eachHtml()"` }
func (BuiltinFunctions) EachOutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
EachOutHtml eachOutHtml() get each element outer html, return []string.
struct { Examples []string `pagser:".selector->eachOutHtml()"` }
func (BuiltinFunctions) EachText ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachText(node *goquery.Selection, args ...string) (out interface{}, err error)
EachText eachText() get each element text, return []string.
struct { Examples []string `pagser:".selector->eachText('')"` }
func (BuiltinFunctions) EachTextEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) EachTextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
EachTextEmpty eachTextEmpty(defaultValue) get each element text, return []string.
struct { Examples []string `pagser:".selector->eachTextEmpty('')"` }
func (BuiltinFunctions) EachTextJoin ¶ added in v0.1.1
func (builtin BuiltinFunctions) EachTextJoin(node *goquery.Selection, args ...string) (out interface{}, err error)
EachTextJoin eachTextJoin(sep) get each element text and join to string, return string.
struct { Example string `pagser:".selector->eachTextJoin(',')"` }
func (BuiltinFunctions) EqAndAttr ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
EqAndAttr eqAndAttr(index, name) reduces the set of matched elements to the one at the specified index, and attr() return string.
struct { Example string `pagser:".selector->eqAndAttr(0, href)"` }
func (BuiltinFunctions) EqAndHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
EqAndHtml eqAndHtml(index) reduces the set of matched elements to the one at the specified index, and html() return string.
struct { Example string `pagser:".selector->eqAndHtml(0)"` }
func (BuiltinFunctions) EqAndOutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
EqAndOutHtml eqAndOutHtml(index) reduces the set of matched elements to the one at the specified index, and outHtml() return string.
struct { Example string `pagser:".selector->eqAndOutHtml(0)"` }
func (BuiltinFunctions) EqAndText ¶ added in v0.1.1
func (builtin BuiltinFunctions) EqAndText(node *goquery.Selection, args ...string) (out interface{}, err error)
EqAndText eqAndText(index) reduces the set of matched elements to the one at the specified index, return string.
struct { Example string `pagser:".selector->eqAndText(0)"` }
func (BuiltinFunctions) Html ¶ added in v0.0.7
func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (out interface{}, err error)
Html html() get element inner html, return string.
struct { Example string `pagser:".selector->html()"` }
func (BuiltinFunctions) OutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) OutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
OutHtml outerHtml() get element outer html, return string.
struct { Example string `pagser:".selector->outerHtml()"` }
func (BuiltinFunctions) Size ¶ added in v0.1.3
func (builtin BuiltinFunctions) Size(node *goquery.Selection, args ...string) (out interface{}, err error)
Size size() returns the number of elements in the Selection object, return int.
struct { Size int `pagser:".selector->size()"` }
func (BuiltinFunctions) Text ¶ added in v0.0.7
func (builtin BuiltinFunctions) Text(node *goquery.Selection, args ...string) (out interface{}, err error)
Text text() get element text, return string. This is the default function if no function is defined in the struct tag.
struct { Example string `pagser:".selector->text()"` }
func (BuiltinFunctions) TextConcat ¶ added in v0.1.1
func (builtin BuiltinFunctions) TextConcat(node *goquery.Selection, args ...string) (out interface{}, err error)
TextConcat textConcat(text1, $value, [ text2, ... text_n ]) The `text1, text2, ... text_n` strings that you wish to join together, `$value` is placeholder for get element text, return string.
struct { Example string `pagser:".selector->textConcat('Result:', '<', $value, '>')"` }
func (BuiltinFunctions) TextEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) TextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
TextEmpty textEmpty(defaultValue) get element text, if empty will return defaultValue, return string.
struct { Example string `pagser:".selector->textEmpty('')"` }
func (BuiltinFunctions) TextSplit ¶ added in v0.1.1
func (builtin BuiltinFunctions) TextSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
TextSplit textSplit(sep=',', trim='true') get element text and split by separator to array string, return []string.
struct { Examples []string `pagser:".selector->textSplit('|')"` }
type BuiltinSelections ¶ added in v0.1.2
type BuiltinSelections struct { }
BuiltinSelections builtin selection functions are registered with a lowercase initial, eg: Child -> child()
func (BuiltinSelections) Child ¶ added in v0.1.2
func (builtin BuiltinSelections) Child(node *goquery.Selection, args ...string) (out interface{}, err error)
Child child(selector='') gets the child elements of each element in the Selection. Filtered by the specified selector if selector not empty. It returns Selection object containing these elements for nested struct.
struct { SubStruct struct { Example string `pagser:".selector->text()"` } `pagser:".selector->child()"` }
func (BuiltinSelections) Eq ¶ added in v0.1.2
func (builtin BuiltinSelections) Eq(node *goquery.Selection, args ...string) (out interface{}, err error)
Eq eq(index) reduces the set of matched elements to the one at the specified index. If a negative index is given, it counts backwards starting at the end of the set. It returns a Selection object for nested struct, and an empty Selection object if the index is invalid.
struct { SubStruct struct { Example string `pagser:".selector->text()"` } `pagser:".selector->eq(0)"` }
func (BuiltinSelections) First ¶ added in v0.1.2
func (builtin BuiltinSelections) First(node *goquery.Selection, args ...string) (out interface{}, err error)
First first() reduces the set of matched elements to the first in the set. It returns a new Selection object, and an empty Selection object if the selection is empty. It returns Selection object containing these elements for nested struct.
struct { SubStruct struct { Example string `pagser:".selector->text()"` } `pagser:".selector->first()"` }
func (BuiltinSelections) Last ¶ added in v0.1.2
func (builtin BuiltinSelections) Last(node *goquery.Selection, args ...string) (out interface{}, err error)
Last last(selector='') reduces the set of matched elements to the last in the set. It returns a new Selection object, and an empty Selection object if the selection is empty.
struct { SubStruct struct { Example string `pagser:".selector->text()"` } `pagser:".selector->last()"` }
func (BuiltinSelections) Next ¶ added in v0.1.2
func (builtin BuiltinSelections) Next(node *goquery.Selection, args ...string) (out interface{}, err error)
Next next(selector='') gets the immediately following sibling of each element in the Selection. Filtered by the specified selector if selector not empty. It returns Selection object containing these elements for nested struct.
struct { SubStruct struct { Example string `pagser:".selector->text()"` } `pagser:".selector->next()"` }
func (BuiltinSelections) Parent ¶ added in v0.1.2
func (builtin BuiltinSelections) Parent(node *goquery.Selection, args ...string) (out interface{}, err error)
Parent parent(selector='') gets the parent elements of each element in the Selection. Filtered by the specified selector if selector not empty. It returns Selection object containing these elements for nested struct.
struct { SubStruct struct { Example string `pagser:".selector->text()"` } `pagser:".selector->parent()"` }
func (BuiltinSelections) Parents ¶ added in v0.1.2
func (builtin BuiltinSelections) Parents(node *goquery.Selection, args ...string) (out interface{}, err error)
Parents parents(selector='') gets the ancestors of each element in the Selection. Filtered by the specified selector if selector not empty. It returns Selection object containing these elements for nested struct.
struct { SubStruct struct { Example string `pagser:".selector->text()"` } `pagser:".selector->parents()"` }
func (BuiltinSelections) ParentsUntil ¶ added in v0.1.2
func (builtin BuiltinSelections) ParentsUntil(node *goquery.Selection, args ...string) (out interface{}, err error)
ParentsUntil parentsUntil(selector) gets the ancestors of each element in the Selection, up to but not including the element matched by the selector. It returns a new Selection object containing the matched elements. It returns Selection object containing these elements for nested struct.
struct { SubStruct struct { Example string `pagser:".selector->text()"` } `pagser:".selector->parentsUntil('.wrap')"` }
func (BuiltinSelections) Prev ¶ added in v0.1.2
func (builtin BuiltinSelections) Prev(node *goquery.Selection, args ...string) (out interface{}, err error)
Prev prev(selector='') gets the immediately preceding sibling of each element in the Selection. Filtered by the specified selector if selector not empty. It returns Selection object containing these elements for nested struct.
struct { SubStruct struct { Example string `pagser:".selector->text()"` } `pagser:".selector->prev()"` }
func (BuiltinSelections) Siblings ¶ added in v0.1.2
func (builtin BuiltinSelections) Siblings(node *goquery.Selection, args ...string) (out interface{}, err error)
Siblings siblings(selector='') gets the siblings of each element in the Selection. Filtered by the specified selector if selector not empty. It returns Selection object containing these elements for nested struct.
struct { SubStruct struct { Example string `pagser:".selector->text()"` } `pagser:".selector->siblings()"` }
type CallFunc ¶
CallFunc is the function interface to implement when writing a custom function.
Define Global Function ¶
func MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { //Todo return "Hello", nil } //Register function pagser.RegisterFunc("MyFunc", MyFunc) //Use function type PageData struct{ Text string `pagser:"h1->MyFunc()"` }
Define Struct Function ¶
//Use function type PageData struct{ Text string `pagser:"h1->MyFunc()"` } func (pd PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { //Todo return "Hello", nil }
Lookup function priority order ¶
struct method -> parent method -> ... -> global
Implicit convert type ¶
Automatic type conversion: a string output result is converted to int, int64, float64, ...
CallFunc defines the function interface.
type Config ¶
type Config struct { TagName string //struct tag name, default is `pagser` FuncSymbol string //Function symbol, default is `->` CastError bool //Returns an error when the type cannot be converted, default is `false` Debug bool //Debug mode, debug will print some log, default is `false` }
Config configuration
func DefaultConfig ¶
func DefaultConfig() Config
DefaultConfig the default Config
Config{ TagName: "pagser", FuncSymbol: "->", CastError: false, Debug: false, }
type Pagser ¶
type Pagser struct { Config Config // contains filtered or unexported fields }
Pagser the page parser
func NewWithConfig ¶
NewWithConfig creates a pagser client with the given Config, returning the client and an error.
Example ¶
cfg := Config{ TagName: "pagser", FuncSymbol: "->", CastError: false, Debug: false, } p, err := NewWithConfig(cfg) if err != nil { log.Fatal(err) } //data parser model var page ExamplePage //parse html data err = p.Parse(&page, rawExampleHtml) //check error if err != nil { log.Fatal(err) }
Output:
func (*Pagser) Parse ¶
Parse parses an HTML string into a struct.
Example ¶
//New default Config p := New() //data parser model var page ExamplePage //parse html data err := p.Parse(&page, rawExampleHtml) //check error if err != nil { log.Fatal(err) } //print result log.Printf("%v", page)
Output:
func (*Pagser) ParseDocument ¶
ParseDocument parses a goquery document into a struct.
Example ¶
//New default Config p := New() //data parser model var data ExamplePage doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawExampleHtml)) if err != nil { log.Fatal(err) } //parse document err = p.ParseDocument(&data, doc) //check error if err != nil { log.Fatal(err) } //print result log.Printf("%v", data)
Output:
func (*Pagser) ParseReader ¶ added in v0.0.3
ParseReader parses HTML read from an io.Reader into a struct.
Example ¶
resp, err := http.Get("https://raw.githubusercontent.com/foolin/pagser/master/_examples/pages/demo.html") if err != nil { log.Fatal(err) } defer resp.Body.Close() //New default Config p := New() //data parser model var page ExamplePage //parse html data err = p.ParseReader(&page, resp.Body) //check error if err != nil { panic(err) } log.Printf("%v", page)
Output:
func (*Pagser) ParseSelection ¶
ParseSelection parses a goquery selection into a struct.
Example ¶
//New default Config p := New() //data parser model var data ExamplePage doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawExampleHtml)) if err != nil { log.Fatal(err) } //parse document err = p.ParseSelection(&data, doc.Selection) //check error if err != nil { log.Fatal(err) } //print result log.Printf("%v", data)
Output:
func (*Pagser) RegisterFunc ¶
RegisterFunc register function for parse result
pagser.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) { //Todo return "Hello", nil })
Example ¶
p := New() p.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) { //Todo return "Hello", nil })
Output: