service

package module
v0.4.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 22, 2022 License: MIT Imports: 26 Imported by: 0

README

Go codecov

Rabida 中文

Rabida is a simple crawler framework based on chromedp.

Supported features

  • Pagination: specify a css selector for the next page.
  • PrePaginate: do something before pagination, such as clicking a button.
  • HttpCookies: enable browser cookies for the current job.
  • Delay And Timeout: delay and timeout can be customized.
  • AntiDetection: an anti-detection script is loaded by default for the current job. The script is sourced from puppeteer-extra-stealth.
  • Strict Mode: user agent, browser, and platform must match; if true, they will all correspond to chrome-mac.

Install

go get -u github.com/JohnnyTing/rabida

Configuration

Add a .env file to your project:

RABI_DELAY=1s,2s
RABI_CONCURRENCY=1
RABI_THROTTLE_NUM=2
RABI_THROTTLE_DURATION=1s
RABI_TIMEOUT=3s
RABI_MODE=headless
RABI_DEBUG=false
RABI_OUT=out
RABI_STRICT=false
RABI_PROXY=

Usage

See examples for more details

// TestRabidaImplCrawl crawls a Baidu Tieba forum listing and prints the
// "title" and "date" values extracted from every thread entry, following the
// "next" pagination link between pages.
func TestRabidaImplCrawl(t *testing.T) {
	conf := config.LoadFromEnv()
	fmt.Printf("%+v\n", conf)
	rabi := NewRabida(conf)
	job := Job{
		Link: "https://tieba.baidu.com/f?kw=nba",
		CssSelector: CssSelector{
			// Scope selects each thread row; Attrs are resolved per row.
			Scope: `#thread_list > li.j_thread_list`,
			Attrs: map[string]CssSelector{
				"title": {
					Css: "div.threadlist_title > a",
				},
				"date": {
					Css: "span.threadlist_reply_date",
				},
			},
		},
		Paginator: CssSelector{
			Css: "#frs_list_pager > a.next.pagination-item",
		},
		Limit: 3,
	}
	// The "before" actions are nil; the viewport emulation is passed as the
	// "after" actions.
	err := rabi.Crawl(context.Background(), job, func(ret []interface{}, nextPageUrl string, currentPageNo int) bool {
		for _, item := range ret {
			fmt.Println(gabs.Wrap(item).StringIndent("", "  "))
		}
		// NOTE(review): presumably returning true tells Crawl to stop
		// paginating once the page limit is reached — confirm against Crawl.
		return currentPageNo >= job.Limit
	}, nil, []chromedp.Action{
		chromedp.EmulateViewport(1777, 903, chromedp.EmulateLandscape),
	})
	if err != nil {
		// Report through the testing framework instead of panicking so the
		// failure is attributed to this test and other tests still run.
		t.Fatalf("%+v", err)
	}
}

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ErrNotFound is a package-level error value whose dynamic type is the
// unexported errNotFound.
var ErrNotFound error = errNotFound{}

Functions

func CssOrXpath

func CssOrXpath(cssSelector CssSelector) string

func DelaySleep

func DelaySleep(conf config.RabiConfig, tag string)

func ExecEventCondition

func ExecEventCondition(ctx context.Context, conf config.RabiConfig, condition *Condition, queryActions []chromedp.QueryOption) (bool, error)

Types

type Condition

// Condition bundles a comparison Value, a CheckFunc predicate and the
// ExecSelector associated with them.
// NOTE(review): presumably CheckFunc is called with the text produced by
// ExecSelector and with Value — confirm against ExecEventCondition.
type Condition struct {
	// Value is serialized under the "value" key.
	Value string `json:"value"`
	// CheckFunc is a predicate over (text, value). It is excluded from JSON
	// because encoding/json cannot encode func values; without the "-" tag,
	// marshalling any struct that contains a Condition fails.
	CheckFunc func(text, value string) bool `json:"-"`
	// ExecSelector is serialized under the "execSelector" key.
	ExecSelector ExecSelector `json:"execSelector"`
}

type CssSelector

// CssSelector describes how to locate element(s), by css selector or xpath
// expression, and which value to read from them. Selectors can nest through
// Attrs.
type CssSelector struct {
	// Css is the css selector expression.
	Css string `json:"css"`
	// Attr is the attribute to read; the default is innerText.
	Attr string `json:"attr"`
	// Scope supplies a scope to each selector.
	// In jQuery, this would look something like this: $(scope).find(selector)
	Scope string `json:"scope"`
	// Attrs maps each attribute to a css selector. When Attrs is nil,
	// recursive population stops.
	Attrs map[string]CssSelector `json:"attrs"`
	// Iframe, if true, makes the lookup happen within the first iframe in the
	// page. If IframeSelector is set, that iframe is used instead.
	Iframe bool `json:"iframe"`
	// IframeSelector specifies the iframe selector when the page has multiple
	// iframe elements.
	IframeSelector *CssSelector `json:"iframeSelector"`
	// XpathScope is a scope given as xpath (presumably the xpath counterpart
	// of Scope — confirm). Note: choose only one of xpath and css selector.
	XpathScope string `json:"xpathScope"`
	// Xpath is an xpath expression.
	// eg: //*[@id="zz"]/div[2]/ul/li[1]/text()
	// eg: //div[@id="indexCarousel"]//div[@class="item"]//img/@src
	Xpath    string         `json:"xpath"`
	SetAttrs []SetAttribute `json:"setAttrs"`
	// Before lists things to do before the value is retrieved.
	Before    []EventSelector `json:"before"`
	Condition *Condition      `json:"condition"`
}

type Event

// Event is the string-typed kind carried in the Type field of EventSelector
// and ExecSelector.
type Event string
// Event values declared by this package; the string on the right is the form
// that appears in the JSON "type" field.
const (
	ClickEvent              Event = "click"
	SetAttributesValueEvent Event = "setAttributesValue"
	TextEvent               Event = "getTextValue"
	GetAttributeValueEvent  Event = "getAttributeValue"
)

type EventSelector

// EventSelector combines an Event Type with the CssSelector it refers to and
// a Condition.
// NOTE(review): presumably the event is only performed when Condition holds —
// confirm against the implementation.
type EventSelector struct {
	Type      Event       `json:"type"`
	Condition Condition   `json:"condition"`
	Selector  CssSelector `json:"selector"`
}

type ExecSelector

// ExecSelector combines an Event Type with the CssSelector it refers to. It
// is the type of Condition.ExecSelector.
type ExecSelector struct {
	Type     Event       `json:"type"`
	Selector CssSelector `json:"selector"`
}

type HttpCookies

// HttpCookies holds the cookie settings of a Job (see Job.EnableCookies):
// RawCookies is the raw cookie string and Domain the domain, serialized as
// "rawCookies" and "domain".
type HttpCookies struct {
	RawCookies string `json:"rawCookies"`
	Domain     string `json:"domain"`
	// Expires is the expiry in hours; the default is 1 year.
	Expires int `json:"expires"`
}

type Job

// Job describes one crawl: the page to open, what to extract from it and how
// to move on to the next page.
type Job struct {
	// Link is the url you want to crawl.
	Link string `json:"link"`
	// CssSelector is the root css selector.
	CssSelector CssSelector `json:"cssSelector"`
	// PrePaginate lists things to do before paginating.
	PrePaginate []EventSelector `json:"prePaginate"`
	// Paginator is the css selector for the next page.
	Paginator CssSelector `json:"paginator"`
	// PaginatorFunc returns a CssSelector for a given page number
	// (presumably an alternative to Paginator — confirm). It is excluded from
	// JSON because encoding/json cannot encode func values; without the "-"
	// tag, marshalling a Job fails.
	PaginatorFunc func(currentPageNo int) CssSelector `json:"-"`
	// Limit limits how many pages should be crawled.
	Limit int `json:"limit"`
	// StartPageBtn is serialized as "startPageBtn".
	// NOTE(review): presumably a button selecting the first page — confirm.
	StartPageBtn CssSelector `json:"startPageBtn"`
	// StartPageUrl is serialized as "startPageUrl".
	// NOTE(review): presumably the url of the first page — confirm.
	StartPageUrl string `json:"startPageUrl"`
	// EnableCookies holds the cookie settings for this job.
	EnableCookies HttpCookies `json:"enableCookies"`
}

type Rabida

// Rabida is the crawler interface. Every method takes a context and a Job
// and reports back through a caller-supplied callback.
type Rabida interface {
	// Crawl runs job. callback receives ret, nextPageUrl and currentPageNo
	// and returns a bool; before and after are chromedp actions.
	// NOTE(review): the README example returns true from callback once
	// job.Limit is reached, which suggests true stops the crawl — confirm.
	Crawl(ctx context.Context, job Job,

		callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
	) error

	// CrawlWithConfig has the parameters of Crawl plus an explicit
	// config.RabiConfig value and variadic chromedp allocator options.
	CrawlWithConfig(ctx context.Context, job Job,

		callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
		conf config.RabiConfig,
		options ...chromedp.ExecAllocatorOption,
	) error

	// CrawlWithListeners differs from CrawlWithConfig in that callback also
	// receives a context, the config is passed as a pointer, the allocator
	// options are a slice, and variadic listeners of type
	// func(ev interface{}) are accepted.
	CrawlWithListeners(ctx context.Context, job Job,

		callback func(ctx context.Context, ret []interface{}, nextPageUrl string, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
		confPtr *config.RabiConfig,
		options []chromedp.ExecAllocatorOption,
		listeners ...func(ev interface{}),
	) error

	// DownloadFile runs job with a callback that receives a file string.
	// NOTE(review): presumably the path of a downloaded file — confirm.
	DownloadFile(ctx context.Context, job Job,

		callback func(file string),
		confPtr *config.RabiConfig,
		options ...chromedp.ExecAllocatorOption,
	) error
}

func NewRabida

func NewRabida(conf *config.RabiConfig) Rabida

type RabidaImpl

// RabidaImpl is the concrete type on which Crawl, CrawlWithConfig,
// CrawlWithListeners, DownloadFile and Html are declared (value receivers).
type RabidaImpl struct {
	// contains filtered or unexported fields
}

func (RabidaImpl) Crawl

func (r RabidaImpl) Crawl(ctx context.Context, job Job, callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool,
	before []chromedp.Action, after []chromedp.Action) error

func (RabidaImpl) CrawlWithConfig

func (r RabidaImpl) CrawlWithConfig(ctx context.Context, job Job, callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool, before []chromedp.Action, after []chromedp.Action, conf config.RabiConfig, options ...chromedp.ExecAllocatorOption) error

func (RabidaImpl) CrawlWithListeners

func (r RabidaImpl) CrawlWithListeners(ctx context.Context, job Job, callback func(ctx context.Context, ret []interface{}, nextPageUrl string, currentPageNo int) bool, before []chromedp.Action, after []chromedp.Action, confPtr *config.RabiConfig, options []chromedp.ExecAllocatorOption, listeners ...func(ev interface{})) error

func (RabidaImpl) DownloadFile

func (r RabidaImpl) DownloadFile(ctx context.Context, job Job, callback func(file string), confPtr *config.RabiConfig, options ...chromedp.ExecAllocatorOption) error

func (RabidaImpl) Html

func (r RabidaImpl) Html(ctx context.Context, father *cdp.Node, conf config.RabiConfig) *html.Node

type SetAttribute

// SetAttribute is an attribute name/value pair, serialized as
// "attributeName" and "attributeValue". It is the element type of
// CssSelector.SetAttrs.
type SetAttribute struct {
	AttributeName  string `json:"attributeName"`
	AttributeValue string `json:"attributeValue"`
}

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL