scraper

package module
v0.0.40 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 1, 2024 License: ISC Imports: 28 Imported by: 0

README

scraper

基本的な使い方

	// 初期化
	var logger scraper.ConsoleLogger
	session := scraper.NewSession("session-name", logger) // session-name はログフォルダ名になる

	// cookieを読む ( session-name/cookie というファイルを使う)
	err := session.LoadCookie()
	if err != nil {
		log.Fatal(err)
	}

	// ページを開く
	page, err := session.GetPage("https://example.com")
	if err != nil {
		log.Fatal(err)
	}

	// form 送信
	form, err := page.Form("form") // CSS selector でformを特定する
	if err != nil {
		log.Fatal(err)
	}
	_ = form.Set("id", id)
	_ = form.Set("password", password)
	resp, err := session.Submit(form) // レスポンスを得る
	if err != nil {
		log.Fatal(err)
	}
	page, err = resp.Page() // レスポンスからページにする
	if err != nil {
		log.Fatal(err)
	}

	// cookie を保存
	err = session.SaveCookie()
	if err != nil {
		log.Fatal(err)
	}

	// Pageから読み取る
	type Link struct {
		Href string `attr:"href"`
		Text string
	}
	var links []Link
	err = scraper.Unmarshal(&links, page.Find("div.items a"), scraper.UnmarshalOption{})
	if err != nil {
		log.Fatal(err)
	}
	// -> links に <div class="items"> 以下にある <a> タグの href とテキスト要素を収集する

メモ

https://github.com/juju/persistent-cookiejar は max-age がないクッキーを永続化してくれないので https://github.com/orirawlings/persistent-cookiejar を使ったらいけた。神

Documentation

Overview

Package scraper implements a library for scraping web pages.

Index

Constants

View Source
const (
	//UserAgent_Chrome39  = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36"
	//UserAgent_iOS8      = "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12B466"
	UserAgent_firefox86 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
	UserAgent_default   = UserAgent_firefox86
)

Variables

This section is empty.

Functions

func ChromeUnmarshal added in v0.0.16

func ChromeUnmarshal(ctx context.Context, v interface{}, cssSelector string, opt UnmarshalOption) error

func ExtractNumber

func ExtractNumber(in string) (float64, error)

func Unmarshal

func Unmarshal(v interface{}, selection *goquery.Selection, opt UnmarshalOption) error

Unmarshal parses selection and stores the result in v. If v is a struct, each field may specify the following tags.

  • `find` tag with CSS selector to specify sub element.
  • `html` if exists, gets HTML of the child elements as text. ignores `attr`.
  • `attr` tag with attribute name to get a text. If neither `html` nor `attr` exists, gets the text from the text element.
  • `re` tag with regular expression, use only matched substring from a text.
  • `time` tag with time format to parse for time.Time.

Types

type AvailableValue

type AvailableValue struct {
	Value string
	Label string
}

AvailableValue holds an available value and corresponding label to display.

type BufferedLogger

type BufferedLogger struct {
	// contains filtered or unexported fields
}

func (*BufferedLogger) Flush

func (buflog *BufferedLogger) Flush(logger Logger)

func (*BufferedLogger) Printf

func (buflog *BufferedLogger) Printf(format string, a ...interface{})

type ChromeSession added in v0.0.19

type ChromeSession struct {
	*Session
	Ctx          context.Context
	DownloadPath string
}

func (*ChromeSession) DownloadFile added in v0.0.37

func (session *ChromeSession) DownloadFile(filename *string, options DownloadFileOptions, actions ...chromedp.Action) chromedp.ActionFunc

func (*ChromeSession) RunNavigate added in v0.0.19

func (session *ChromeSession) RunNavigate(URL string) (*network.Response, error)

RunNavigate navigates to the page URL and downloads the HTML like Session.invoke.

func (*ChromeSession) SaveFile added in v0.0.37

func (session *ChromeSession) SaveFile(filename *string) chromedp.ActionFunc

SaveFile saves file to filename.

  • filename: DownloadFile の結果を chromedp.Run で続ける場合、ポインタにしないと実行前の値が渡ってしまうため、ポインタにする

func (*ChromeSession) SaveHtml added in v0.0.19

func (session *ChromeSession) SaveHtml(filename *string) chromedp.Action

func (*ChromeSession) Unmarshal added in v0.0.20

func (session *ChromeSession) Unmarshal(v interface{}, cssSelector string, opt UnmarshalOption) error

type ConsoleLogger

type ConsoleLogger struct{}

func (ConsoleLogger) Printf

func (logger ConsoleLogger) Printf(format string, a ...interface{})

type DownloadFileOptions added in v0.0.39

type DownloadFileOptions struct {
	Timeout time.Duration
	Glob    string
}

type DownloadedFileNameNotSatisfiedError added in v0.0.39

type DownloadedFileNameNotSatisfiedError struct {
	DownloadedFilename string
	Glob               string
}

func (*DownloadedFileNameNotSatisfiedError) Error added in v0.0.39

type FollowAnchorTextOption

type FollowAnchorTextOption struct {
	CheckAlt  bool // if true, searches text into img.alt attribute
	NumLink   int  // if >0, must be equal to number of matched texts
	Index     int  // 0=use the first match
	TrimSpace bool // if true, trims spaces from both texts before comparing them
}

type Form

type Form struct {
	Action   string
	Method   string
	Elements map[string]*FormElement
	Logger   Logger
	// contains filtered or unexported fields
}

Form holds form data and submit information

func (*Form) Check

func (form *Form) Check(name string) error

Check checks the checkbox specified by name.

func (*Form) NumSelect

func (form *Form) NumSelect(name string) (int, error)

NumSelect returns number of available values of the select element specified by name.

func (*Form) PrintSelection

func (form *Form) PrintSelection(name string) error

PrintSelection shows available values of the element specified by name.

func (*Form) Select

func (form *Form) Select(name string, index int) error

Select sets an answer to the select element specified by name.

func (*Form) Set

func (form *Form) Set(name string, value string) error

Set sets a value to the element specified by name. If the element has AvailableValues (e.g. checkbox, radio or select elements), value must equal one of them.

func (*Form) SetByLabel

func (form *Form) SetByLabel(name string, label string) error

func (*Form) SetForce

func (form *Form) SetForce(name string, value string) error

func (*Form) Uncheck

func (form *Form) Uncheck(name string) error

Uncheck unchecks the checkbox specified by name.

func (*Form) Unset

func (form *Form) Unset(name string) error

Unset unset or uncheck the element specified by name.

func (*Form) ValueByLabel

func (form *Form) ValueByLabel(name string, label string) (string, error)

ValueByLabel finds a Value from the element name and the label of one of its available values.

type FormElement

type FormElement struct {
	Type            string // "select", "hidden", "submit", "text", "email", "password", "button", "checkbox", "radio", "image"
	Name            string
	Value           *AvailableValue
	AvailableValues []*AvailableValue
}

FormElement holds a form element.

func (*FormElement) AddAvailableValue

func (element *FormElement) AddAvailableValue(val *AvailableValue)

func (*FormElement) GoString

func (element *FormElement) GoString() string

type FormElementNotFoundError

type FormElementNotFoundError struct {
	Name string
}

func (FormElementNotFoundError) Error

func (error FormElementNotFoundError) Error() string

type Logger

type Logger interface {
	Printf(format string, a ...interface{})
}

type LoginError

type LoginError struct {
	Message string
}

func (LoginError) Error

func (error LoginError) Error() string

type MaintenanceError

type MaintenanceError struct {
	Message string
}

func (MaintenanceError) Error

func (error MaintenanceError) Error() string

type NewChromeOptions added in v0.0.14

type NewChromeOptions struct {
	Headless bool
	Timeout  time.Duration
}

type Page

type Page struct {
	*goquery.Document
	BaseUrl *url.URL
	Logger  Logger
}

Page holds DOM structure of the page and its URL, Logging information.

func (*Page) Form

func (page *Page) Form(selector string) (*Form, error)

Form generates a Form object from a form object identified by selector in the Page

func (*Page) MetaRefresh

func (page *Page) MetaRefresh() *url.URL

MetaRefresh returns a URL from "meta http-equiv=refresh" tag if it exists. otherwise, returns nil.

func (page *Page) ResolveLink(relativeURL string) (string, error)

ResolveLink resolves a relative URL from the page and returns a full URL.

type PageOption added in v0.0.40

type PageOption struct {
	BodyFilter func(resp *Response, body []byte) ([]byte, error)
}

type RequestError added in v0.0.13

type RequestError struct {
	RequestURL *url.URL
	Err        error
}

func (RequestError) Error added in v0.0.13

func (err RequestError) Error() string

type Response

type Response struct {
	Request     *http.Request
	ContentType string
	RawBody     []byte
	Encoding    encoding.Encoding
	Logger      Logger
}

Response holds a raw response and its request information.

func (*Response) Body

func (response *Response) Body() ([]byte, error)

Body returns response body converted from response.Encoding(if not nil).

func (*Response) CsvReader

func (response *Response) CsvReader() *csv.Reader

CsvReader returns csv.Reader of the response. it assumes the response is a CSV.

func (*Response) Page

func (response *Response) Page() (*Page, error)

func (*Response) PageOpt added in v0.0.40

func (response *Response) PageOpt(option PageOption) (*Page, error)

PageOpt parses raw response to DOM tree and returns a Page object.

type ResponseError added in v0.0.13

type ResponseError struct {
	RequestURL *url.URL
	Response   *http.Response
}

func (ResponseError) Error added in v0.0.13

func (err ResponseError) Error() string

type RetryAndRecordError

type RetryAndRecordError struct {
	Filename string
}

func (RetryAndRecordError) Error

func (error RetryAndRecordError) Error() string

type Session

type Session struct {
	Name string // directory name to store session files(downloaded files and cookies)

	Encoding   encoding.Encoding // force charset over Content-Type response header
	UserAgent  string            // specify User-Agent
	FilePrefix string            // prefix to directory of session files

	NotUseNetwork      bool // load from previously downloaded session files rather than network access
	SaveToFile         bool // save downloaded pages to session directory
	ShowRequestHeader  bool // print request headers with Logger
	ShowResponseHeader bool // print response headers with Logger
	ShowFormPosting    bool // print posting form data, with Logger
	Log                Logger

	BodyFilter func(resp *Response, body []byte) ([]byte, error)
	// contains filtered or unexported fields
}

Session holds communication and logging options

func NewSession

func NewSession(name string, log Logger) *Session

func (*Session) ApplyRefresh

func (session *Session) ApplyRefresh(page *Page, maxRedirect int) (*Page, error)

ApplyRefresh mimics HTML Meta Refresh.

func (*Session) Cookies

func (session *Session) Cookies(u *url.URL) []*http.Cookie

func (*Session) FollowAnchorText

func (session *Session) FollowAnchorText(page *Page, text string) (*Response, error)

func (*Session) FollowAnchorTextOpt

func (session *Session) FollowAnchorTextOpt(page *Page, text string, opt FollowAnchorTextOption) (*Response, error)
func (session *Session) FollowLink(page *Page, linkSelector string, attr string) (*Response, error)
func (session *Session) FollowSelectionLink(page *Page, selection *goquery.Selection, attr string) (*Response, error)

FollowSelectionLink opens a link specified by attr of the selection and returns a Response.

func (*Session) FormAction

func (session *Session) FormAction(page *Page, formSelector string, params map[string]string) (*Response, error)

FormAction submits a form (easy version)

func (*Session) Frame

func (session *Session) Frame(page *Page, frameSelector string) (*Page, error)

Frame returns a Page of specified frameSelector.

func (*Session) Get

func (session *Session) Get(getUrl string) (*Response, error)

Get invokes HTTP GET request.

func (*Session) GetPage

func (session *Session) GetPage(getUrl string) (*Page, error)

GetPage gets the URL and returns a Page.

func (*Session) GetPageMaxRedirect

func (session *Session) GetPageMaxRedirect(getUrl string, maxRedirect int) (*Page, error)

GetPageMaxRedirect gets the URL and follows HTML meta refresh if the response page contains one.

func (*Session) LoadCookie

func (session *Session) LoadCookie() error

func (*Session) NewChrome

func (session *Session) NewChrome() (*ChromeSession, context.CancelFunc, error)

func (*Session) NewChromeOpt added in v0.0.14

func (session *Session) NewChromeOpt(options NewChromeOptions) (chromeSession *ChromeSession, cancelFunc context.CancelFunc, err error)

func (*Session) OpenURL

func (session *Session) OpenURL(page *Page, url string) (*Response, error)

OpenURL invokes HTTP GET request with referer header as page's URL.

func (*Session) Printf

func (session *Session) Printf(format string, a ...interface{})

func (*Session) SaveCookie

func (session *Session) SaveCookie() error

SaveCookie stores cookies to a file. LoadCookie() must be called before calling SaveCookie().

func (*Session) SetCookies

func (session *Session) SetCookies(u *url.URL, cookies []*http.Cookie)

func (*Session) Submit

func (session *Session) Submit(form *Form) (*Response, error)

Submit submits a form.

func (*Session) SubmitOpt

func (session *Session) SubmitOpt(form *Form, imageId string) (*Response, error)

SubmitOpt submits a form. If imageId is non-empty, it specifies the "image" element to imitate clicking.

type UnexpectedContentTypeError

type UnexpectedContentTypeError struct {
	Expected string
	Actual   string
}

func (UnexpectedContentTypeError) Error

func (error UnexpectedContentTypeError) Error() string

type UnmarshalFieldError

type UnmarshalFieldError struct {
	Field string
	Err   error
}

func (UnmarshalFieldError) Error

func (err UnmarshalFieldError) Error() string

type UnmarshalMustBePointerError

type UnmarshalMustBePointerError struct{}

func (UnmarshalMustBePointerError) Error

func (err UnmarshalMustBePointerError) Error() string

type UnmarshalOption

type UnmarshalOption struct {
	Attr   string         // if nonempty, get attribute text of the element. get Text() otherwise.
	Re     string         // Regular Expression to match the text. must contain one capture.
	Time   string         // for time.Time only. parse with this format.
	Loc    *time.Location // time zone for parsing time.Time.
	Html   bool           // get Html() rather than Text(). ignores Attr.
	Ignore string         // if the string matches, results in the zero value.
}

type UnmarshalParseNumberError added in v0.0.11

type UnmarshalParseNumberError struct {
	Err error
	Got string
}

func (UnmarshalParseNumberError) Error added in v0.0.11

func (err UnmarshalParseNumberError) Error() string

type UnmarshalUnexportedFieldError

type UnmarshalUnexportedFieldError struct{}

func (UnmarshalUnexportedFieldError) Error

type Unmarshaller

type Unmarshaller interface {
	Unmarshal(s string) error
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL