service

package module
v0.3.6 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 6, 2022 License: MIT Imports: 25 Imported by: 0

README

Rabida

Rabida is a simple crawler framework based on chromedp.

Supported features

  • Pagination: specify css selector for next page.
  • PrePaginate: do something before pagination, such as clicking a button.
  • HttpCookies: enable browser cookie for current job.
  • Delay And Timeout: can customize delay and timeout.
  • AntiDetection: an anti-detection script is loaded by default for the current job. The script is sourced from puppeteer-extra-stealth.
  • Strict Mode: the user agent, browser, and platform must match; if true, they will all be tied to chrome-mac.

Install

go get -u github.com/JohnnyTing/rabida

Configuration

add .env file for your project

RABI_DELAY=1s,2s
RABI_CONCURRENCY=1
RABI_THROTTLE_NUM=2
RABI_THROTTLE_DURATION=1s
RABI_TIMEOUT=3s
RABI_MODE=headless
RABI_DEBUG=false
RABI_OUT=out
RABI_STRICT=false
RABI_PROXY=

Usage

See examples for more details

// TestRabidaImplCrawl is a usage example: it loads the configuration from the
// environment, builds a Job for the Baidu Tieba "nba" board that extracts the
// "title" and "date" of every thread list item, and prints each scraped item.
// The test fails (instead of panicking) when Crawl returns an error.
func TestRabidaImplCrawl(t *testing.T) {
	conf := config.LoadFromEnv()
	fmt.Printf("%+v\n", conf)
	rabi := NewRabida(conf)
	job := Job{
		Link: "https://tieba.baidu.com/f?kw=nba",
		CssSelector: CssSelector{
			// Each thread list item is one result; Attrs are resolved inside this scope.
			Scope: `#thread_list > li.j_thread_list`,
			Attrs: map[string]CssSelector{
				"title": {
					Css: "div.threadlist_title > a",
				},
				"date": {
					Css: "span.threadlist_reply_date",
				},
			},
		},
		// Selector of the "next page" link.
		Paginator: CssSelector{
			Css: "#frs_list_pager > a.next.pagination-item",
		},
		Limit: 3,
	}
	// The callback prints every item of the page and returns true once
	// currentPageNo has reached job.Limit.
	// NOTE(review): returning true presumably tells Crawl to stop — confirm.
	err := rabi.Crawl(context.Background(), job, func(ret []interface{}, nextPageUrl string, currentPageNo int) bool {
		for _, item := range ret {
			fmt.Println(gabs.Wrap(item).StringIndent("", "  "))
		}
		return currentPageNo >= job.Limit
	}, nil, []chromedp.Action{
		chromedp.EmulateViewport(1777, 903, chromedp.EmulateLandscape),
	})
	if err != nil {
		// Report through the testing framework rather than panicking.
		t.Fatalf("%+v", err)
	}
}

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ErrNotFound is a package-level error value of the unexported errNotFound type.
var ErrNotFound error = errNotFound{}

Functions

func CssOrXpath

func CssOrXpath(cssSelector CssSelector) string

func DelaySleep

func DelaySleep(conf config.RabiConfig, tag string)

func ExecEventCondition

func ExecEventCondition(ctx context.Context, conf config.RabiConfig, event EventSelector, queryActions []chromedp.QueryOption) (bool, error)

Types

type Condition

// Condition groups a comparison value, a check function taking (text, value)
// and an ExecSelector. It is used as the Condition field of EventSelector.
// NOTE(review): presumably CheckFunc is called with text read from the page
// and Value, and ExecSelector is executed when it returns true — confirm
// against ExecEventCondition.
type Condition struct {
	Value        string
	CheckFunc    func(text, value string) bool
	ExecSelector ExecSelector
}

type CssSelector

// CssSelector describes how to locate element(s) on a page, using either a
// css selector or an xpath expression, optionally inside an iframe.
// SetAttrs holds a list of SetAttribute name/value pairs.
// NOTE(review): presumably SetAttrs are applied to the matched element —
// confirm.
type CssSelector struct {
	// Css is the css selector.
	Css string
	// Attr is the attribute to read; the default is innerText.
	Attr string
	// Scope supplies a scope to each selector.
	// In jQuery, this would look something like this: $(scope).find(selector)
	Scope string
	// Attrs maps each attribute to a css selector. When Attrs equals nil,
	// recursive population stops.
	Attrs map[string]CssSelector
	// Iframe, if true, makes the lookup happen within the first iframe in the
	// page. If IframeSelector exists, that iframe is looked up instead.
	Iframe bool
	// IframeSelector specifies the iframe selector when the page has multiple
	// iframe elements.
	IframeSelector *CssSelector
	// XpathScope is a scope for xpath lookups (presumably the xpath
	// counterpart of Scope — confirm).
	// Note: choose only one of xpath and css selector.
	XpathScope string
	// Xpath is an xpath expression.
	// eg: //*[@id="zz"]/div[2]/ul/li[1]/text()
	// eg: //div[@id="indexCarousel"]//div[@class="item"]//img/@src
	Xpath    string
	SetAttrs []SetAttribute
	// Before lists the events to perform before retrieving the value.
	Before []EventSelector
}

type Event

// Event is the string name of an event type. It is the type of the Type field
// of both EventSelector and ExecSelector.
type Event string
// Event values defined by this package, with their string representations.
const (
	ClickEvent              Event = "click"
	SetAttributesValueEvent Event = "setAttributesValue"
	TextEvent               Event = "getTextValue"
)

type EventSelector

// EventSelector combines an Event type, a Condition and the CssSelector it
// targets. It is the element type of Job.PrePaginate and CssSelector.Before,
// and is the event argument of ExecEventCondition.
type EventSelector struct {
	Type      Event
	Condition Condition
	Selector  CssSelector
}

type ExecSelector

// ExecSelector pairs an Event type with the CssSelector it targets. It is
// used as the ExecSelector field of Condition.
type ExecSelector struct {
	Type     Event
	Selector CssSelector
}

type HttpCookies

// HttpCookies holds the cookie settings of a job; see Job.EnableCookies.
// NOTE(review): RawCookies is presumably a raw cookie string and Domain the
// domain the cookies are set for — confirm against the implementation.
type HttpCookies struct {
	RawCookies string
	Domain     string
	// Expires is expressed in hours; the default is 1 year.
	Expires int
}

type Job

// Job describes one crawl: the link to visit, the selectors used to extract
// data and to paginate, the page limit and the cookie settings.
// NOTE(review): StartPageBtn and StartPageUrl presumably select the page the
// crawl starts from — confirm against the implementation.
type Job struct {
	// Link is the url you want to crawl.
	Link string
	// CssSelector is the root css selector.
	CssSelector CssSelector
	// PrePaginate lists events to perform before paginating.
	PrePaginate []EventSelector
	// Paginator is the css selector for the next page.
	Paginator CssSelector
	// Limit limits how many pages should be crawled.
	Limit         int
	StartPageBtn  CssSelector
	StartPageUrl  string
	EnableCookies HttpCookies
}

type Rabida

// Rabida is the crawler interface returned by NewRabida. Every method takes a
// context and a Job and returns an error.
type Rabida interface {
	// Crawl runs job. callback receives the results of a page, the next page
	// url and the current page number, and returns a bool.
	// NOTE(review): returning true presumably stops the crawl — confirm.
	// before and after are lists of chromedp actions.
	Crawl(ctx context.Context, job Job,

		callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
	) error

	// CrawlWithConfig has the parameters of Crawl plus an explicit
	// config.RabiConfig and optional chromedp exec allocator options.
	CrawlWithConfig(ctx context.Context, job Job,

		callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
		conf config.RabiConfig,
		options ...chromedp.ExecAllocatorOption,
	) error

	// CrawlWithListeners differs from CrawlWithConfig in that the callback
	// also receives a context, the config is passed as a pointer, the
	// allocator options are a slice, and listener functions taking an event
	// value can be supplied.
	CrawlWithListeners(ctx context.Context, job Job,

		callback func(ctx context.Context, ret []interface{}, nextPageUrl string, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
		confPtr *config.RabiConfig,
		options []chromedp.ExecAllocatorOption,
		listeners ...func(ev interface{}),
	) error

	// DownloadFile runs job with a callback that receives a file string.
	// NOTE(review): file is presumably the path of the downloaded file —
	// confirm.
	DownloadFile(ctx context.Context, job Job,

		callback func(file string),
		confPtr *config.RabiConfig,
		options ...chromedp.ExecAllocatorOption,
	) error
}

func NewRabida

func NewRabida(conf *config.RabiConfig) Rabida

type RabidaImpl

// RabidaImpl is the concrete type that declares the Crawl, CrawlWithConfig,
// CrawlWithListeners, DownloadFile and Html methods, all on a value receiver.
type RabidaImpl struct {
	// contains filtered or unexported fields
}

func (RabidaImpl) Crawl

func (r RabidaImpl) Crawl(ctx context.Context, job Job, callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool,
	before []chromedp.Action, after []chromedp.Action) error

func (RabidaImpl) CrawlWithConfig

func (r RabidaImpl) CrawlWithConfig(ctx context.Context, job Job, callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool, before []chromedp.Action, after []chromedp.Action, conf config.RabiConfig, options ...chromedp.ExecAllocatorOption) error

func (RabidaImpl) CrawlWithListeners

func (r RabidaImpl) CrawlWithListeners(ctx context.Context, job Job, callback func(ctx context.Context, ret []interface{}, nextPageUrl string, currentPageNo int) bool, before []chromedp.Action, after []chromedp.Action, confPtr *config.RabiConfig, options []chromedp.ExecAllocatorOption, listeners ...func(ev interface{})) error

func (RabidaImpl) DownloadFile

func (r RabidaImpl) DownloadFile(ctx context.Context, job Job, callback func(file string), confPtr *config.RabiConfig, options ...chromedp.ExecAllocatorOption) error

func (RabidaImpl) Html

func (r RabidaImpl) Html(ctx context.Context, father *cdp.Node, conf config.RabiConfig) *html.Node

type SetAttribute

// SetAttribute is an attribute name/value pair. CssSelector.SetAttrs holds a
// list of them.
type SetAttribute struct {
	AttributeName  string
	AttributeValue string
}

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL