Documentation ¶
Overview ¶
Package scraper implements a library for scraping web pages.
Index ¶
- Constants
- func ChromeUnmarshal(ctx context.Context, v interface{}, cssSelector string, opt UnmarshalOption) error
- func ExtractNumber(in string) (float64, error)
- func Unmarshal(v interface{}, selection *goquery.Selection, opt UnmarshalOption) error
- type AvailableValue
- type BufferedLogger
- type ChromeSession
- func (session *ChromeSession) DownloadFile(filename *string, options DownloadFileOptions, actions ...chromedp.Action) chromedp.ActionFunc
- func (session *ChromeSession) RunNavigate(URL string) (*network.Response, error)
- func (session *ChromeSession) SaveFile(filename *string) chromedp.ActionFunc
- func (session *ChromeSession) SaveHtml(filename *string) chromedp.Action
- func (session *ChromeSession) Unmarshal(v interface{}, cssSelector string, opt UnmarshalOption) error
- type ConsoleLogger
- type DownloadFileOptions
- type DownloadedFileNameNotSatisfiedError
- type FollowAnchorTextOption
- type Form
- func (form *Form) Check(name string) error
- func (form *Form) NumSelect(name string) (int, error)
- func (form *Form) PrintSelection(name string) error
- func (form *Form) Select(name string, index int) error
- func (form *Form) Set(name string, value string) error
- func (form *Form) SetByLabel(name string, label string) error
- func (form *Form) SetForce(name string, value string) error
- func (form *Form) Uncheck(name string) error
- func (form *Form) Unset(name string) error
- func (form *Form) ValueByLabel(name string, label string) (string, error)
- type FormElement
- type FormElementNotFoundError
- type Logger
- type LoginError
- type MaintenanceError
- type NewChromeOptions
- type Page
- type PageOption
- type RequestError
- type Response
- type ResponseError
- type RetryAndRecordError
- type Session
- func (session *Session) ApplyRefresh(page *Page, maxRedirect int) (*Page, error)
- func (session *Session) Cookies(u *url.URL) []*http.Cookie
- func (session *Session) FollowAnchorText(page *Page, text string) (*Response, error)
- func (session *Session) FollowAnchorTextOpt(page *Page, text string, opt FollowAnchorTextOption) (*Response, error)
- func (session *Session) FollowLink(page *Page, linkSelector string, attr string) (*Response, error)
- func (session *Session) FollowSelectionLink(page *Page, selection *goquery.Selection, attr string) (*Response, error)
- func (session *Session) FormAction(page *Page, formSelector string, params map[string]string) (*Response, error)
- func (session *Session) Frame(page *Page, frameSelector string) (*Page, error)
- func (session *Session) Get(getUrl string) (*Response, error)
- func (session *Session) GetPage(getUrl string) (*Page, error)
- func (session *Session) GetPageMaxRedirect(getUrl string, maxRedirect int) (*Page, error)
- func (session *Session) LoadCookie() error
- func (session *Session) NewChrome() (*ChromeSession, context.CancelFunc, error)
- func (session *Session) NewChromeOpt(options NewChromeOptions) (chromeSession *ChromeSession, cancelFunc context.CancelFunc, err error)
- func (session *Session) OpenURL(page *Page, url string) (*Response, error)
- func (session *Session) Printf(format string, a ...interface{})
- func (session *Session) SaveCookie() error
- func (session *Session) SetCookies(u *url.URL, cookies []*http.Cookie)
- func (session *Session) Submit(form *Form) (*Response, error)
- func (session *Session) SubmitOpt(form *Form, imageId string) (*Response, error)
- type UnexpectedContentTypeError
- type UnmarshalFieldError
- type UnmarshalMustBePointerError
- type UnmarshalOption
- type UnmarshalParseNumberError
- type UnmarshalUnexportedFieldError
- type Unmarshaller
Constants ¶
const ( //UserAgent_Chrome39 = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36" //UserAgent_iOS8 = "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12B466" UserAgent_firefox86 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0" UserAgent_default = UserAgent_firefox86 )
Variables ¶
This section is empty.
Functions ¶
func ChromeUnmarshal ¶ added in v0.0.16
func ChromeUnmarshal(ctx context.Context, v interface{}, cssSelector string, opt UnmarshalOption) error
func ExtractNumber ¶
func Unmarshal ¶
func Unmarshal(v interface{}, selection *goquery.Selection, opt UnmarshalOption) error
Unmarshal parses selection and stores the result in v. If v is a struct, each field may specify the following tags.
- `find` tag with CSS selector to specify sub element.
- `html` if exists, gets HTML of the child elements as text. ignores `attr`.
- `attr` tag with attribute name to get a text. If neither `html` nor `attr` exists, gets the text from the text element.
- `re` tag with regular expression, use only matched substring from a text.
- `time` tag with time format to parse for time.Time.
Types ¶
type AvailableValue ¶
AvailableValue holds an available value and corresponding label to display.
type BufferedLogger ¶
type BufferedLogger struct {
// contains filtered or unexported fields
}
func (*BufferedLogger) Flush ¶
func (buflog *BufferedLogger) Flush(logger Logger)
func (*BufferedLogger) Printf ¶
func (buflog *BufferedLogger) Printf(format string, a ...interface{})
type ChromeSession ¶ added in v0.0.19
func (*ChromeSession) DownloadFile ¶ added in v0.0.37
func (session *ChromeSession) DownloadFile(filename *string, options DownloadFileOptions, actions ...chromedp.Action) chromedp.ActionFunc
func (*ChromeSession) RunNavigate ¶ added in v0.0.19
func (session *ChromeSession) RunNavigate(URL string) (*network.Response, error)
RunNavigate navigates to the page URL and downloads its HTML, like Session.invoke.
func (*ChromeSession) SaveFile ¶ added in v0.0.37
func (session *ChromeSession) SaveFile(filename *string) chromedp.ActionFunc
SaveFile saves the file to filename.
- filename: passed as a pointer because, when chaining after the result of DownloadFile in chromedp.Run, a non-pointer would carry the value from before execution.
func (*ChromeSession) SaveHtml ¶ added in v0.0.19
func (session *ChromeSession) SaveHtml(filename *string) chromedp.Action
func (*ChromeSession) Unmarshal ¶ added in v0.0.20
func (session *ChromeSession) Unmarshal(v interface{}, cssSelector string, opt UnmarshalOption) error
type ConsoleLogger ¶
type ConsoleLogger struct{}
func (ConsoleLogger) Printf ¶
func (logger ConsoleLogger) Printf(format string, a ...interface{})
type DownloadFileOptions ¶ added in v0.0.39
type DownloadedFileNameNotSatisfiedError ¶ added in v0.0.39
func (*DownloadedFileNameNotSatisfiedError) Error ¶ added in v0.0.39
func (e *DownloadedFileNameNotSatisfiedError) Error() string
type FollowAnchorTextOption ¶
type Form ¶
type Form struct { Action string Method string Elements map[string]*FormElement Logger Logger // contains filtered or unexported fields }
Form holds form data and submit information
func (*Form) NumSelect ¶
NumSelect returns number of available values of the select element specified by name.
func (*Form) PrintSelection ¶
PrintSelection shows available values of the element specified by name.
func (*Form) Set ¶
Set sets a value to the element specified by name. If the element has AvailableValues (e.g. checkbox, radio or select elements), value must equal one of them.
type FormElement ¶
type FormElement struct { Type string // "select", "hidden", "submit", "text", "email", "password", "button", "checkbox", "radio", "image" Name string Value *AvailableValue AvailableValues []*AvailableValue }
FormElement holds a form element.
func (*FormElement) AddAvailableValue ¶
func (element *FormElement) AddAvailableValue(val *AvailableValue)
func (*FormElement) GoString ¶
func (element *FormElement) GoString() string
type FormElementNotFoundError ¶
type FormElementNotFoundError struct {
Name string
}
func (FormElementNotFoundError) Error ¶
func (error FormElementNotFoundError) Error() string
type LoginError ¶
type LoginError struct {
Message string
}
func (LoginError) Error ¶
func (error LoginError) Error() string
type MaintenanceError ¶
type MaintenanceError struct {
Message string
}
func (MaintenanceError) Error ¶
func (error MaintenanceError) Error() string
type NewChromeOptions ¶ added in v0.0.14
type Page ¶
Page holds DOM structure of the page and its URL, Logging information.
func (*Page) Form ¶
Form generates a Form object from a form object identified by selector in the Page
func (*Page) MetaRefresh ¶
MetaRefresh returns a URL from the "meta http-equiv=refresh" tag if it exists; otherwise, it returns nil.
type PageOption ¶ added in v0.0.40
type RequestError ¶ added in v0.0.13
func (RequestError) Error ¶ added in v0.0.13
func (err RequestError) Error() string
type Response ¶
type Response struct { Request *http.Request ContentType string RawBody []byte Encoding encoding.Encoding Logger Logger }
Response holds a raw response and its request information.
type ResponseError ¶ added in v0.0.13
func (ResponseError) Error ¶ added in v0.0.13
func (err ResponseError) Error() string
type RetryAndRecordError ¶
type RetryAndRecordError struct {
Filename string
}
func (RetryAndRecordError) Error ¶
func (error RetryAndRecordError) Error() string
type Session ¶
type Session struct { Name string // directory name to store session files(downloaded files and cookies) Encoding encoding.Encoding // force charset over Content-Type response header UserAgent string // specify User-Agent FilePrefix string // prefix to directory of session files NotUseNetwork bool // load from previously downloaded session files rather than network access SaveToFile bool // save downloaded pages to session directory ShowRequestHeader bool // print request headers with Logger ShowResponseHeader bool // print response headers with Logger ShowFormPosting bool // print posting form data, with Logger Log Logger BodyFilter func(resp *Response, body []byte) ([]byte, error) // contains filtered or unexported fields }
Session holds communication and logging options
func NewSession ¶
func (*Session) ApplyRefresh ¶
ApplyRefresh mimics HTML Meta Refresh.
func (*Session) FollowAnchorText ¶
func (*Session) FollowAnchorTextOpt ¶
func (*Session) FollowLink ¶
func (*Session) FollowSelectionLink ¶
func (session *Session) FollowSelectionLink(page *Page, selection *goquery.Selection, attr string) (*Response, error)
FollowSelectionLink opens a link specified by attr of the selection and returns a Response.
func (*Session) FormAction ¶
func (session *Session) FormAction(page *Page, formSelector string, params map[string]string) (*Response, error)
FormAction submits a form (easy version)
func (*Session) GetPageMaxRedirect ¶
GetPageMaxRedirect gets the URL and follows HTML meta refresh if the response page contains one.
func (*Session) LoadCookie ¶
func (*Session) NewChrome ¶
func (session *Session) NewChrome() (*ChromeSession, context.CancelFunc, error)
func (*Session) NewChromeOpt ¶ added in v0.0.14
func (session *Session) NewChromeOpt(options NewChromeOptions) (chromeSession *ChromeSession, cancelFunc context.CancelFunc, err error)
func (*Session) SaveCookie ¶
SaveCookie stores cookies to a file. LoadCookie() must be called before calling SaveCookie().
type UnexpectedContentTypeError ¶
func (UnexpectedContentTypeError) Error ¶
func (error UnexpectedContentTypeError) Error() string
type UnmarshalFieldError ¶
func (UnmarshalFieldError) Error ¶
func (err UnmarshalFieldError) Error() string
type UnmarshalMustBePointerError ¶
type UnmarshalMustBePointerError struct{}
func (UnmarshalMustBePointerError) Error ¶
func (err UnmarshalMustBePointerError) Error() string
type UnmarshalOption ¶
type UnmarshalOption struct { Attr string // if nonempty, get attribute text of the element. get Text() otherwise. Re string // Regular Expression to match the text. must contain one capture. Time string // for time.Time only. parse with this format. Loc *time.Location // time zone for parsing time.Time. Html bool // get Html() rather than Text(). ignores Attr. Ignore string // if the string matches, results in the zero value. }
type UnmarshalParseNumberError ¶ added in v0.0.11
func (UnmarshalParseNumberError) Error ¶ added in v0.0.11
func (err UnmarshalParseNumberError) Error() string
type UnmarshalUnexportedFieldError ¶
type UnmarshalUnexportedFieldError struct{}
func (UnmarshalUnexportedFieldError) Error ¶
func (err UnmarshalUnexportedFieldError) Error() string