rpa

package module
v1.2.12 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 29, 2024 License: MPL-2.0 Imports: 21 Imported by: 0

README

Overview

Go Reference

A configurable web data crawler library built on rod. Its goal is to simplify scraping web data with rod by making the process configurable: the final output can be changed by modifying the configuration file, without having to recompile the program.

Usage

  1. grab url
// main demonstrates crawling a single URL with a JSON configuration file
// and printing the extracted result as indented JSON.
func main() {
	r := rpa.Crawler{}
	// AttachDefaultBrowser returns an error; without a browser there is
	// nothing to crawl, so report it and stop.
	if err := r.AttachDefaultBrowser(); err != nil {
		fmt.Println(err)
		return
	}
	defer r.Close()

	url := "https://cn.bing.com/search?q=sample+simple+pdf"
	// The two trailing booleans are autoDownload and closeTab.
	val, _, err := r.CrawlUrl(url, "./sample/sample_zip.json", true, true)
	if err != nil {
		fmt.Println(err)
		return
	}

	s, err := json.MarshalIndent(val, "", "\t")
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(s))
}
  2. wait for an element to show/hide
	import helper "github.com/rpdg/rod-helper"

    // Wait for the ".loading" element to disappear; 60 is timeoutSeconds.
    helper.WaitElementHide(page, ".loading", 60)
    // Wait for the ".data-table" element to appear; 10 is timeoutSeconds.
    helper.WaitElementShow(page, ".data-table", 10)
    // ElementVisible reports whether the selected element exists and is visible.
    has := helper.ElementVisible(page, ".next-page")
    if has {
        println("data table has next page")
    }


Custom Pseudo class

  1. select element under iframe / frame:

    :frame(iframe_element_selector) inner_element_selector
    
  2. select element under shadow-dom

    :shadow(web_component_selector) inner_element_selector
    

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ConnectChromeBrowser added in v1.1.5

func ConnectChromeBrowser(leakless, headless bool) (br *rod.Browser, err error)

ConnectChromeBrowser returns the Chrome browser if installed

func ConnectChromiumBrowser added in v1.1.6

func ConnectChromiumBrowser(leakless, headless bool) (br *rod.Browser, err error)

ConnectChromiumBrowser returns rod's embedded browser

func ConnectDefaultBrowser added in v1.1.5

func ConnectDefaultBrowser(leakless, headless bool) (br *rod.Browser, err error)

ConnectDefaultBrowser returns the system's default browser

func ConnectEdgeBrowser added in v1.1.6

func ConnectEdgeBrowser(leakless, headless bool, ieMode bool) (br *rod.Browser, err error)

ConnectEdgeBrowser returns the Edge browser if installed

func ElementVisible

func ElementVisible(page *rod.Page, selector string) bool

ElementVisible detects whether the selected element exists and is visible

func EmptyDirectory added in v1.1.3

func EmptyDirectory(dir string) error

EmptyDirectory will delete all the contents of a directory

func ExecShell added in v1.2.0

func ExecShell(ctx context.Context, command string) (string, error)

ExecShell executes a shell command; an execution timeout can be set

func ExtractUrlParam added in v1.2.2

func ExtractUrlParam(urlString, paramName string) (string, error)

func FileExists added in v1.0.4

func FileExists(name string) (bool, error)

FileExists to check if a file exists

func GBK2UTF8 added in v1.2.0

func GBK2UTF8(s string) string

GBK2UTF8 converts a GBK-encoded string to UTF-8

func GetDictAndLastSegmentByPath added in v1.2.0

func GetDictAndLastSegmentByPath(data map[string]interface{}, path string) (interface{}, string)

GetDictAndLastSegmentByPath returns the data extracted from the path and the last segment of the path.

func MustWaitDownloadRelax added in v1.2.10

func MustWaitDownloadRelax(b *rod.Browser) func() ([]byte, string)

func NormalizeFilename added in v1.0.4

func NormalizeFilename(name string) string

NormalizeFilename replaces the characters <>:"/\|?* in the given string

func OpenPage added in v1.0.3

func OpenPage(browser *rod.Browser, url string, sleep int64, selector string, sign WaitSign) (page *rod.Page, err error)

func QueryElem added in v1.1.6

func QueryElem(page *rod.Page, selector string) (*rod.Element, error)

func RaceShow added in v1.1.3

func RaceShow(page *rod.Page, selectors []string, timeoutSeconds int) (index int, elem *rod.Element, err error)

func RenameFileUnique added in v1.0.4

func RenameFileUnique(dir, fileName, ext string, try int) string

RenameFileUnique renames the file name if there are duplicate files

func WaitElementHide

func WaitElementHide(page *rod.Page, selector string, timeoutSeconds int) (err error)

WaitElementHide waits for a certain element on the page to disappear

func WaitElementShow

func WaitElementShow(page *rod.Page, selector string, timeoutSeconds int) (err error)

WaitElementShow waits for a certain element on the page to appear

func WaitPage added in v1.0.5

func WaitPage(page *rod.Page, sleep int64, selector string, sign WaitSign) (err error)

Types

type ConfigNode added in v1.0.5

// ConfigNode holds the selector, label and id of one configured node.
// It is embedded in DownloadConfig.
type ConfigNode struct {
	// Selector is (de)serialized as "selector".
	// NOTE(review): presumably an element selector, possibly using the
	// :frame()/:shadow() pseudo classes from the README — confirm.
	Selector string `json:"selector"`
	// Label is (de)serialized as "label".
	Label    string `json:"label"`
	// ID is (de)serialized as "id".
	ID       string `json:"id"`
}

type Crawler added in v1.0.1

// Crawler pairs a rod browser handle with a configuration-loading function.
type Crawler struct {
	// Browser is the rod browser handle.
	// NOTE(review): presumably populated by the Attach*Browser methods — confirm.
	Browser    *rod.Browser
	// CfgFetcher takes a path and returns a CrawlerConfig or an error.
	// NOTE(review): behavior when this field is nil is not visible here.
	CfgFetcher func(path string) (*CrawlerConfig, error)
}

func (*Crawler) AttachChromeBrowser added in v1.1.0

func (c *Crawler) AttachChromeBrowser() error

func (*Crawler) AttachDefaultBrowser added in v1.0.1

func (c *Crawler) AttachDefaultBrowser() error

func (*Crawler) AttachEdgeBrowser added in v1.1.6

func (c *Crawler) AttachEdgeBrowser(ieMode bool) error

func (*Crawler) AttachEmbedBrowser added in v1.1.7

func (c *Crawler) AttachEmbedBrowser() error

func (*Crawler) Close added in v1.2.12

func (c *Crawler) Close()

func (*Crawler) CrawlPage added in v1.0.1

func (c *Crawler) CrawlPage(page *rod.Page, cfgOrFile interface{}, autoDownload bool, closeTab bool) (*Result, error)

func (*Crawler) CrawlUrl added in v1.0.1

func (c *Crawler) CrawlUrl(url string, cfgOrFile interface{}, autoDownload bool, closeTab bool) (*Result, *rod.Page, error)

type CrawlerConfig added in v1.0.5

// CrawlerConfig is the crawl configuration as (de)serialized from JSON.
type CrawlerConfig struct {
	// PageLoad carries the wait settings (wait sign, selector, sleep); see PageLoad.
	PageLoad        PageLoad         `json:"pageLoad,omitempty"`
	// DataSection is a list of free-form dictionaries.
	// NOTE(review): presumably the data extraction rules — confirm.
	DataSection     []DictData       `json:"dataSection"`
	// SwitchSection is a single free-form dictionary; its schema is not visible here.
	SwitchSection   DictData         `json:"switchSection,omitempty"`
	// DownloadRoot is (de)serialized as "downloadRoot".
	// NOTE(review): presumably the base directory for downloads — confirm.
	DownloadRoot    string           `json:"downloadRoot,omitempty"`
	// DownloadSection lists the download configurations; see DownloadConfig.
	DownloadSection []DownloadConfig `json:"downloadSection,omitempty"`
}

type DictData added in v1.2.0

// DictData is a string-keyed map holding values of arbitrary type.
type DictData map[string]interface{}

type DownloadConfig added in v1.0.5

// DownloadConfig describes one entry of CrawlerConfig.DownloadSection.
// It embeds ConfigNode, so it also carries Selector, Label and ID.
type DownloadConfig struct {
	ConfigNode
	// SavePath is (de)serialized as "savePath".
	// NOTE(review): presumably where files are stored — confirm.
	SavePath     string             `json:"savePath,omitempty"`
	// NameProper and NameRender are (de)serialized as "nameProper" and "nameRender".
	// NOTE(review): presumably control how the file name is obtained — confirm.
	NameProper   string             `json:"nameProper,omitempty"`
	NameRender   string             `json:"nameRender,omitempty"`
	// LinkProper and LinkRender are (de)serialized as "linkProper" and "linkRender".
	// NOTE(review): presumably control how the download link is obtained — confirm.
	LinkProper   string             `json:"linkProper,omitempty"`
	LinkRender   string             `json:"linkRender,omitempty"`
	// InsertTo is (de)serialized as "insertTo"; its meaning is not visible here.
	InsertTo     string             `json:"insertTo,omitempty"`
	// DownloadType is one of the DownloadTypeString values ("url", "element", "toPDF").
	DownloadType DownloadTypeString `json:"downloadType"`
}

type DownloadFileInfo added in v1.2.0

// DownloadFileInfo is one file entry of DownloadResult.Files.
type DownloadFileInfo struct {
	// Name is (de)serialized as "name".
	Name  string `json:"name"`
	// Url is (de)serialized as "url".
	Url   string `json:"url"`
	// Error is (de)serialized as "error".
	// NOTE(review): presumably empty when the download succeeded — confirm.
	Error string `json:"error"`
}

type DownloadResult added in v1.0.5

// DownloadResult groups a list of file entries under a label.
// It is the value type of Result.Downloads.
type DownloadResult struct {
	// Label is (de)serialized as "label".
	Label string             `json:"label"`
	// Files holds the file entries; see DownloadFileInfo.
	Files []DownloadFileInfo `json:"files"`
}

DownloadResult is part of the result section

type DownloadTypeString added in v1.2.1

// DownloadTypeString is the string type of DownloadConfig.DownloadType.
type DownloadTypeString string
// Known DownloadTypeString values.
// NOTE(review): the per-value descriptions below are inferred from the names — confirm.
const (
	// DownloadUrl ("url"): presumably download from a URL.
	DownloadUrl     DownloadTypeString = "url"
	// DownloadElement ("element"): presumably download via a page element.
	DownloadElement DownloadTypeString = "element"
	// PrintToPDF ("toPDF"): presumably save the page as a PDF.
	PrintToPDF      DownloadTypeString = "toPDF"
)

type ExecuteResult added in v1.2.0

// ExecuteResult is an opaque type: none of its fields are visible in this listing.
type ExecuteResult struct {
	// contains filtered or unexported fields
}

type ExternalResult added in v1.0.5

// ExternalResult is the value type of Result.ExternalSection.
type ExternalResult struct {
	// Config is (de)serialized as "config".
	// NOTE(review): presumably refers to another configuration — confirm.
	Config  string `json:"config"`
	// Connect is (de)serialized as "connect"; its meaning is not visible here.
	Connect string `json:"connect"`
	// ID is (de)serialized as "id".
	ID      string `json:"id"`
}

type PageLoad added in v1.0.5

// PageLoad holds the page-load wait settings of a CrawlerConfig.
// Its fields have the same types as the sign, selector and sleep
// parameters of OpenPage and WaitPage.
type PageLoad struct {
	// Wait is one of the WaitSign values ("show", "hide", "wait").
	Wait     WaitSign `json:"wait"`
	// Selector is (de)serialized as "selector".
	// NOTE(review): presumably the element the wait applies to — confirm.
	Selector string   `json:"selector,omitempty"`
	// Sleep is (de)serialized as "sleep"; the time unit is not visible here — TODO confirm.
	Sleep    int64    `json:"sleep,omitempty"`
}

type Result added in v1.0.5

// Result is the value returned by Crawler.CrawlPage and Crawler.CrawlUrl.
type Result struct {
	// Data is a free-form dictionary, serialized as "data".
	Data            DictData                  `json:"data"`
	// DownloadRoot is serialized as "downloadRoot".
	DownloadRoot    string                    `json:"downloadRoot"`
	// Downloads maps a string key to a DownloadResult.
	// NOTE(review): the key is presumably the download config's id — confirm.
	Downloads       map[string]DownloadResult `json:"downloads"`
	// ExternalSection maps a string key to an ExternalResult.
	ExternalSection map[string]ExternalResult `json:"externalSection"`
}

type WaitSign added in v1.0.2

// WaitSign is the string type of PageLoad.Wait and of the sign parameter
// of OpenPage and WaitPage.
type WaitSign string
// Known WaitSign values.
// NOTE(review): the per-value descriptions below are inferred from the names — confirm.
const (
	// WaitShow ("show"): presumably wait until the selected element is shown.
	WaitShow  WaitSign = "show"
	// WaitHide ("hide"): presumably wait until the selected element is hidden.
	WaitHide  WaitSign = "hide"
	// WaitDelay ("wait"): presumably wait for a fixed delay.
	WaitDelay WaitSign = "wait"
)

Directories

Path Synopsis
Generated by https://quicktype.io
Generated by https://quicktype.io

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL