rpa

package module
v1.1.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 18, 2023 License: MPL-2.0 Imports: 18 Imported by: 0

README

Overview

Go Reference

A library of auxiliary tools for rod. Its goal is to simplify scraping web data with rod by making the process configurable: the final output can be changed by modifying the configuration file, without having to recompile the program.

Usage

// main demonstrates crawling a search-result page with a JSON configuration
// file and printing the crawl result as tab-indented JSON.
func main() {
	r := rpa.Crawler{}
	r.AttachDefaultBrowser()
	b := r.Browser
	// Close the browser only when main returns. Closing it right after
	// attaching would shut it down before CrawlUrl gets a chance to use it.
	defer b.Close()

	url := "https://cn.bing.com/search?q=sample+simple+pdf"
	val, _, err := r.CrawlUrl(url, "./sample/bing.json", true, true)
	if err != nil {
		fmt.Println(err)
		return
	}

	s, err := json.MarshalIndent(val, "", "\t")
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(s))
}

Custom pseudo-classes

  1. Select an element inside an iframe / frame:

    :frame( iframe_tag_selector ) inner_element_selector
    
  2. Select an element inside a shadow DOM:

    :shadow(web_component_selector) inner_element_selector
    

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ElementVisible

func ElementVisible(page *rod.Page, selector string) bool

ElementVisible reports whether the selected element exists and is visible.

func FileExists added in v1.0.4

func FileExists(name string) (bool, error)

FileExists reports whether a file exists.

func NormalizeFilename added in v1.0.4

func NormalizeFilename(name string) string

NormalizeFilename replaces the characters <>:"/\|?* in the given string.

func OpenPage added in v1.0.3

func OpenPage(browser *rod.Browser, url string, sleep int64, selector string, sign WaitSign) (page *rod.Page, err error)

func RemoveContents added in v1.0.4

func RemoveContents(dir string) error

RemoveContents deletes all the contents of a directory.

func RenameFileUnique added in v1.0.4

func RenameFileUnique(dir, fileName, ext string, try int) string

RenameFileUnique renames the file if there are duplicate files.

func WaitElementHide

func WaitElementHide(page *rod.Page, selector string, timeoutSeconds int) (err error)

WaitElementHide waits for a certain element on the page to disappear.

func WaitElementShow

func WaitElementShow(page *rod.Page, selector string, timeoutSeconds int) (err error)

WaitElementShow waits for a certain element on the page to appear.

func WaitPage added in v1.0.5

func WaitPage(page *rod.Page, sleep int64, selector string, sign WaitSign) (err error)

func WriteSortedJSONToFile added in v1.0.2

func WriteSortedJSONToFile(data interface{}, filename string) error

WriteSortedJSONToFile writes indented, key-sorted JSON to a file.

Types

type ConfigNode added in v1.0.5

// ConfigNode holds the "selector", "label" and "id" settings of a
// configuration entry. It is embedded in DownloadConfig.
//
// NOTE(review): Selector is presumably a CSS selector that may use the custom
// :frame() / :shadow() pseudo-classes described in the README — confirm
// against the implementation.
type ConfigNode struct {
	Selector string `json:"selector"`
	Label    string `json:"label"`
	ID       string `json:"id"`
}

type Crawler added in v1.0.1

// Crawler pairs a rod browser with an optional configuration loader.
//
// Browser is the *rod.Browser the crawler works with. CfgFetcher takes a path
// and returns a *CrawlerConfig or an error.
//
// NOTE(review): CfgFetcher presumably overrides how a configuration file path
// passed to CrawlPage / CrawlUrl is loaded — confirm against the
// implementation.
type Crawler struct {
	Browser    *rod.Browser
	CfgFetcher func(path string) (*CrawlerConfig, error)
}

func (*Crawler) AttachChromeBrowser added in v1.1.0

func (c *Crawler) AttachChromeBrowser() *rod.Browser

func (*Crawler) AttachDefaultBrowser added in v1.0.1

func (c *Crawler) AttachDefaultBrowser() *rod.Browser

func (*Crawler) AttachEdgedIE added in v1.0.6

func (c *Crawler) AttachEdgedIE() *rod.Browser

func (*Crawler) CrawlPage added in v1.0.1

func (c *Crawler) CrawlPage(page *rod.Page, cfgOrFile interface{}, autoDownload bool, closeTab bool) (*Result, error)

func (*Crawler) CrawlUrl added in v1.0.1

func (c *Crawler) CrawlUrl(url string, cfgOrFile interface{}, autoDownload bool, closeTab bool) (*Result, *rod.Page, error)

type CrawlerConfig added in v1.0.5

// CrawlerConfig is the JSON-decodable crawl configuration. Its fields map to
// the keys "pageLoad", "dataSection", "switchSection", "downloadRoot" and
// "downloadSection"; every field except DataSection is tagged omitempty.
//
// NOTE(review): the schema of the DataSection and SwitchSection maps is not
// visible in this declaration — consult the sample configuration files.
type CrawlerConfig struct {
	PageLoad        PageLoad                 `json:"pageLoad,omitempty"`
	DataSection     []map[string]interface{} `json:"dataSection"`
	SwitchSection   map[string]interface{}   `json:"switchSection,omitempty"`
	DownloadRoot    string                   `json:"downloadRoot,omitempty"`
	DownloadSection []DownloadConfig         `json:"downloadSection,omitempty"`
}

type DownloadConfig added in v1.0.5

// DownloadConfig describes one entry of CrawlerConfig.DownloadSection. It
// embeds ConfigNode (selector, label, id) and adds a save path, two
// file-naming settings and the download type. Of the fields declared here,
// only Type lacks omitempty.
//
// NOTE(review): the exact meaning of NameProper and NameRender is not visible
// in this declaration — confirm against the implementation.
type DownloadConfig struct {
	ConfigNode
	SavePath   string       `json:"savePath,omitempty"`
	NameProper string       `json:"nameProper,omitempty"`
	NameRender string       `json:"nameRender,omitempty"`
	Type       DownloadType `json:"type"`
}

type DownloadResult added in v1.0.5

// DownloadResult is the value type of Result.Downloads. It carries a count, a
// list of integer error entries, and lists of file names and links.
//
// NOTE(review): whether Errors holds item indexes or error codes is not
// visible in this declaration — confirm against the implementation.
type DownloadResult struct {
	Count     int      `json:"count"`
	Errors    []int    `json:"errors"`
	FileNames []string `json:"fileNames"`
	Links     []string `json:"links"`
}

DownloadResult is part of the result section.

type DownloadType

// DownloadType is the string-valued type of DownloadConfig.Type.
type DownloadType string
// DownloadType values, spelled as they appear in the JSON "type" field.
//
// NOTE(review): presumably DownloadUrl fetches a link directly and
// DownloadElement triggers the download through a page element — confirm
// against the implementation.
const (
	DownloadUrl     DownloadType = "url"
	DownloadElement DownloadType = "element"
)

type ExternalResult added in v1.0.5

// ExternalResult is the value type of Result.ExternalSection. It serializes
// to the JSON keys "config", "connect" and "id".
//
// NOTE(review): the meaning of Config and Connect is not visible in this
// declaration — confirm against the implementation.
type ExternalResult struct {
	Config  string `json:"config"`
	Connect string `json:"connect"`
	ID      string `json:"id"`
}

type PageLoad added in v1.0.5

// PageLoad is the "pageLoad" section of CrawlerConfig. Its Wait, Selector and
// Sleep fields have the same types as the sign, selector and sleep parameters
// of OpenPage and WaitPage.
//
// NOTE(review): presumably these fields are passed straight through to
// WaitPage, and the unit of Sleep is not visible here — confirm against the
// implementation.
type PageLoad struct {
	Wait     WaitSign `json:"wait"`
	Selector string   `json:"selector,omitempty"`
	Sleep    int64    `json:"sleep,omitempty"`
}

type Result added in v1.0.5

// Result is the value returned by Crawler.CrawlPage and Crawler.CrawlUrl. It
// serializes to the JSON keys "data", "downloadRoot", "downloads" and
// "externalSection".
//
// NOTE(review): the keys of the Data, Downloads and ExternalSection maps are
// presumably taken from the configuration (e.g. the id or label of each
// entry) — confirm against the implementation.
type Result struct {
	Data            map[string]interface{}    `json:"data"`
	DownloadRoot    string                    `json:"downloadRoot"`
	Downloads       map[string]DownloadResult `json:"downloads"`
	ExternalSection map[string]ExternalResult `json:"externalSection"`
}

type WaitSign added in v1.0.2

// WaitSign is the string-valued type of PageLoad.Wait and of the sign
// parameter of OpenPage and WaitPage.
type WaitSign string
// WaitSign values, spelled as they appear in the JSON "wait" field. Note that
// WaitDelay is spelled "wait", not "delay".
//
// NOTE(review): presumably WaitShow and WaitHide correspond to
// WaitElementShow and WaitElementHide, and WaitDelay to a plain sleep —
// confirm against the implementation.
const (
	WaitShow  WaitSign = "show"
	WaitHide  WaitSign = "hide"
	WaitDelay WaitSign = "wait"
)

Directories

Path Synopsis
Generated by https://quicktype.io
Generated by https://quicktype.io

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL