rpa

package module
v1.3.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 8, 2025 License: MPL-2.0 Imports: 21 Imported by: 0

README

Overview

Go Reference

A configurable web data crawler library built on rod. Its goal is to simplify scraping web data with rod by making the process configurable: the final output can be changed by modifying the configuration file, without recompiling the program.

Usage

  1. grab url
// main demonstrates crawling a single URL with a JSON configuration file and
// printing the crawl result as tab-indented JSON.
func main() {
	r := rpa.Crawler{}
	// AttachDefaultBrowser returns an error; stop here if no browser could be attached.
	if err := r.AttachDefaultBrowser(); err != nil {
		fmt.Println(err)
		return
	}
	defer r.Close()

	url := "https://cn.bing.com/search?q=sample+simple+pdf"
	// The second return value (*rod.Page) is not needed in this example.
	val, _, err := r.CrawlUrl(url, "./sample/sample_zip.json", true, true)
	if err != nil {
		fmt.Println(err)
		return
	}

	s, err := json.MarshalIndent(val, "", "\t")
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(s))
}
  2. wait element show/hide
	import helper "github.com/rpdg/rod-helper"

    // Wait for ".loading" to become invisible; 60 is the timeout in seconds.
    // NOTE(review): the returned error is discarded in this snippet — check it in real code.
    helper.WaitElementHide(page, ".loading", 60)
    // Wait for ".data-table" to become visible; 10 is the timeout in seconds.
    // NOTE(review): the returned error is discarded here as well.
    helper.WaitElementShow(page, ".data-table", 10)
    // ElementVisible reports whether ".next-page" is visible on the page.
    has := helper.ElementVisible(page, ".next-page")
    if has {
        println("data table has next page")
    }


Custom Pseudo class

  1. select element under iframe / frame:

    :frame(iframe_element_selector) inner_element_selector
    
  2. select element under shadow-dom

    :shadow(web_component_selector) inner_element_selector
    

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ConnectChromeBrowser added in v1.1.5

func ConnectChromeBrowser(leakless, headless bool) (br *rod.Browser, err error)

ConnectChromeBrowser returns the Chrome browser if installed

func ConnectChromiumBrowser added in v1.1.6

func ConnectChromiumBrowser(leakless, headless bool) (br *rod.Browser, err error)

ConnectChromiumBrowser returns rod's embedded browser

func ConnectDefaultBrowser added in v1.1.5

func ConnectDefaultBrowser(leakless, headless bool) (br *rod.Browser, err error)

ConnectDefaultBrowser returns the system's default browser

func ConnectEdgeBrowser added in v1.1.6

func ConnectEdgeBrowser(leakless, headless bool, ieMode bool) (br *rod.Browser, err error)

ConnectEdgeBrowser returns the Edge browser if installed

func ElementVisible

func ElementVisible(page *rod.Page, selector string) bool

ElementVisible checks if an element is visible on the page

func EmptyDirectory added in v1.1.3

func EmptyDirectory(dir string) error

EmptyDirectory removes all contents of a directory while preserving the directory itself

func ExecShell added in v1.2.0

func ExecShell(ctx context.Context, command string) (string, error)

ExecShell executes a shell command with timeout control. The ctx argument can be created with a timeout using context.WithTimeout.

func ExtractUrlParam added in v1.2.2

func ExtractUrlParam(urlString, paramName string) (string, error)

ExtractUrlParam extracts a specific parameter value from a URL string

func FileExists added in v1.0.4

func FileExists(name string) (bool, error)

FileExists checks if a file exists

func GBK2UTF8 added in v1.2.0

func GBK2UTF8(s string) string

GBK2UTF8 converts a GBK-encoded string to UTF-8

func GetDictAndLastSegmentByPath added in v1.2.0

func GetDictAndLastSegmentByPath(data map[string]interface{}, path string) (interface{}, string, error)

GetDictAndLastSegmentByPath traverses a nested map structure using a path and returns the parent data, the last path segment, and any error encountered.

func IsProcessRunning added in v1.3.0

func IsProcessRunning(pid int) bool

IsProcessRunning checks if a process is still running

func KillProcess added in v1.3.0

func KillProcess(pid int)

KillProcess forcefully terminates a process and its children

func MustWaitDownloadRelax added in v1.2.10

func MustWaitDownloadRelax(b *rod.Browser) func() ([]byte, string)

func NormalizeFilename added in v1.0.4

func NormalizeFilename(name string) string

NormalizeFilename sanitizes a filename to be safe for all operating systems. It removes invalid characters, handles reserved names, and ensures the result is a valid filename.

func OpenPage added in v1.0.3

func OpenPage(browser *rod.Browser, url string, sleep int64, selector string, sign WaitSign) (page *rod.Page, err error)

func QueryElem added in v1.1.6

func QueryElem(page *rod.Page, selector string) (*rod.Element, error)

QueryElem returns the element matching the selector

func RaceShow added in v1.1.3

func RaceShow(page *rod.Page, selectors []string, timeoutSeconds int) (int, *rod.Element, error)

RaceShow waits for the first element to become visible from a list of selectors. Returns the index of the first visible element, the element itself, and any error

func RenameFileUnique added in v1.0.4

func RenameFileUnique(dir, fileName, ext string) string

RenameFileUnique generates a unique filename by appending a number if the file already exists

func WaitElementHide

func WaitElementHide(page *rod.Page, selector string, timeoutSeconds int) error

WaitElementHide waits for an element to become invisible on the page

func WaitElementShow

func WaitElementShow(page *rod.Page, selector string, timeoutSeconds int) (err error)

WaitElementShow waits for an element to become visible on the page

func WaitPage added in v1.0.5

func WaitPage(page *rod.Page, sleep int64, selector string, sign WaitSign) (err error)

Types

type ConfigNode added in v1.0.5

// ConfigNode holds three string fields — a selector, a label, and an ID —
// each serialized under the lowercase JSON key named in its tag.
// It is embedded in DownloadConfig.
type ConfigNode struct {
	// Selector is stored under "selector".
	// NOTE(review): presumably an element selector that may use the custom
	// :frame(...) / :shadow(...) pseudo classes — confirm.
	Selector string `json:"selector"`
	// Label is stored under "label".
	Label    string `json:"label"`
	// ID is stored under "id".
	ID       string `json:"id"`
}

type Crawler added in v1.0.1

// Crawler pairs a rod browser handle with a configuration-loading hook.
// Its methods (AttachXxxBrowser, Close, CrawlPage, CrawlUrl) use a pointer receiver.
type Crawler struct {
	// Browser is the rod browser handle.
	// NOTE(review): presumably populated by the Attach*Browser methods — confirm.
	Browser    *rod.Browser
	// CfgFetcher turns a path into a *CrawlerConfig or an error.
	// NOTE(review): behavior when CfgFetcher is nil is not visible here — confirm.
	CfgFetcher func(path string) (*CrawlerConfig, error)
}

func (*Crawler) AttachChromeBrowser added in v1.1.0

func (c *Crawler) AttachChromeBrowser() error

func (*Crawler) AttachDefaultBrowser added in v1.0.1

func (c *Crawler) AttachDefaultBrowser() error

func (*Crawler) AttachEdgeBrowser added in v1.1.6

func (c *Crawler) AttachEdgeBrowser(ieMode bool) error

func (*Crawler) AttachEmbedBrowser added in v1.1.7

func (c *Crawler) AttachEmbedBrowser() error

func (*Crawler) Close added in v1.2.12

func (c *Crawler) Close()

func (*Crawler) CrawlPage added in v1.0.1

func (c *Crawler) CrawlPage(page *rod.Page, cfgOrFile interface{}, autoDownload bool, closeTab bool) (*Result, error)

func (*Crawler) CrawlUrl added in v1.0.1

func (c *Crawler) CrawlUrl(url string, cfgOrFile interface{}, autoDownload bool, closeTab bool) (*Result, *rod.Page, error)

type CrawlerConfig added in v1.0.5

// CrawlerConfig is the configuration type returned by Crawler.CfgFetcher.
// Its JSON tags name the keys used when the configuration is (de)serialized.
type CrawlerConfig struct {
	// PageLoad is stored under "pageLoad". Because PageLoad is a struct type,
	// encoding/json's omitempty never omits it when marshaling.
	PageLoad        PageLoad         `json:"pageLoad,omitempty"`
	// DataSection is stored under "dataSection" and is always emitted (no omitempty).
	DataSection     []DictData       `json:"dataSection"`
	// SwitchSection is stored under "switchSection"; omitted when the map is empty.
	SwitchSection   DictData         `json:"switchSection,omitempty"`
	// DownloadRoot is stored under "downloadRoot"; omitted when empty.
	// NOTE(review): presumably the base directory for downloaded files — confirm.
	DownloadRoot    string           `json:"downloadRoot,omitempty"`
	// DownloadSection is stored under "downloadSection"; omitted when empty.
	DownloadSection []DownloadConfig `json:"downloadSection,omitempty"`
}

type DictData added in v1.2.0

// DictData is a string-keyed map holding values of any type. It is used for
// CrawlerConfig.DataSection entries, CrawlerConfig.SwitchSection, and Result.Data.
type DictData map[string]interface{}

type DownloadConfig added in v1.0.5

// DownloadConfig is one entry of CrawlerConfig.DownloadSection.
// It embeds ConfigNode, so with encoding/json the "selector", "label", and
// "id" keys appear at the same level as the keys declared below.
type DownloadConfig struct {
	ConfigNode
	// SavePath is stored under "savePath"; omitted when empty.
	SavePath     string             `json:"savePath,omitempty"`
	// NameProper and NameRender are stored under "nameProper" / "nameRender".
	// NOTE(review): presumably control how the file name is derived — confirm.
	NameProper   string             `json:"nameProper,omitempty"`
	NameRender   string             `json:"nameRender,omitempty"`
	// LinkProper and LinkRender are stored under "linkProper" / "linkRender".
	// NOTE(review): presumably control how the download link is derived — confirm.
	LinkProper   string             `json:"linkProper,omitempty"`
	LinkRender   string             `json:"linkRender,omitempty"`
	// InsertTo is stored under "insertTo"; omitted when empty.
	InsertTo     string             `json:"insertTo,omitempty"`
	// DownloadType is stored under "downloadType" and is always emitted (no omitempty).
	// The declared values are DownloadUrl, DownloadElement, and PrintToPDF.
	DownloadType DownloadTypeString `json:"downloadType"`
}

type DownloadFileInfo added in v1.2.0

// DownloadFileInfo is the element type of DownloadResult.Files.
// All three fields are plain strings and are always emitted in JSON.
type DownloadFileInfo struct {
	// Name is stored under "name".
	Name  string `json:"name"`
	// Url is stored under "url".
	Url   string `json:"url"`
	// Error is stored under "error"; it is a string, not an error value.
	Error string `json:"error"`
}

type DownloadResult added in v1.0.5

// DownloadResult is the value type of the Result.Downloads map: a label plus
// a list of per-file entries.
type DownloadResult struct {
	// Label is stored under "label".
	Label string             `json:"label"`
	// Files is stored under "files"; a nil slice marshals as JSON null.
	Files []DownloadFileInfo `json:"files"`
}

DownloadResult is a part of result section

type DownloadTypeString added in v1.2.1

// DownloadTypeString is the string type of DownloadConfig.DownloadType.
type DownloadTypeString string
// Declared DownloadTypeString values; the string on the right is the value
// that appears in JSON under "downloadType".
const (
	DownloadUrl     DownloadTypeString = "url"
	DownloadElement DownloadTypeString = "element"
	PrintToPDF      DownloadTypeString = "toPDF"
)

type ExecuteResult added in v1.2.0

// ExecuteResult pairs a textual output with an error. It carries no JSON tags.
// NOTE(review): presumably related to ExecShell, which itself returns
// (string, error) — where this type is used is not visible here; confirm.
type ExecuteResult struct {
	// Output is the textual output.
	Output string
	// Err is the associated error, if any.
	Err    error
}

type ExternalResult added in v1.0.5

// ExternalResult is the value type of the Result.ExternalSection map.
// All fields are strings and are always emitted in JSON.
type ExternalResult struct {
	// Config is stored under "config".
	// NOTE(review): presumably names another configuration to apply — confirm.
	Config  string `json:"config"`
	// Connect is stored under "connect".
	Connect string `json:"connect"`
	// ID is stored under "id".
	ID      string `json:"id"`
}

type PageLoad added in v1.0.5

// PageLoad is the type of CrawlerConfig.PageLoad. Its fields have the same
// types as the sign, selector, and sleep parameters of OpenPage and WaitPage.
type PageLoad struct {
	// Wait is stored under "wait" and is always emitted; the declared values
	// are WaitShow, WaitHide, and WaitDelay.
	Wait     WaitSign `json:"wait"`
	// Selector is stored under "selector"; omitted when empty.
	Selector string   `json:"selector,omitempty"`
	// Sleep is stored under "sleep"; omitted when zero.
	// NOTE(review): the time unit is not visible here — confirm.
	Sleep    int64    `json:"sleep,omitempty"`
}

type Result added in v1.0.5

// Result is returned (as *Result) by Crawler.CrawlPage and Crawler.CrawlUrl.
// Every field is always emitted in JSON; nil maps marshal as null.
type Result struct {
	// Data is stored under "data".
	Data            DictData                  `json:"data"`
	// DownloadRoot is stored under "downloadRoot".
	DownloadRoot    string                    `json:"downloadRoot"`
	// Downloads is stored under "downloads".
	// NOTE(review): the meaning of the map keys is not visible here — confirm.
	Downloads       map[string]DownloadResult `json:"downloads"`
	// ExternalSection is stored under "externalSection".
	// NOTE(review): the meaning of the map keys is not visible here — confirm.
	ExternalSection map[string]ExternalResult `json:"externalSection"`
}

type WaitSign added in v1.0.2

// WaitSign is the string type of PageLoad.Wait and of the sign parameter of
// OpenPage and WaitPage.
type WaitSign string
// Declared WaitSign values. Note that WaitDelay's string value is "wait",
// not "delay".
const (
	WaitShow  WaitSign = "show"
	WaitHide  WaitSign = "hide"
	WaitDelay WaitSign = "wait"
)

Directories

Path Synopsis
Generated by https://quicktype.io
Generated by https://quicktype.io

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL