Documentation ¶
Index ¶
- func AskForString(prompt string) string
- func ExtractTable(pageSource *html.Node, tableRowsExpression string) ([]*html.Node, error)
- func ExtractText(node *html.Node, nodeExpression string, Dirt string) (string, error)
- func FindNodes(node *html.Node, nodeExpression string) ([]*html.Node, error)
- func PrintHtml(pageSource *html.Node) (string, error)
- type Navigator
- func (nav *Navigator) CaptureScreenshot(nameFile string) error
- func (nav *Navigator) CheckRadioButton(selector string) error
- func (nav *Navigator) ClickButton(selector string) error
- func (nav *Navigator) ClickElement(selector string) error
- func (nav *Navigator) Close()
- func (nav *Navigator) Datepicker(...) error
- func (nav *Navigator) EvaluateScript(script string) (interface{}, error)
- func (nav *Navigator) ExecuteScript(script string) error
- func (nav *Navigator) ExtractLinks() ([]string, error)
- func (nav *Navigator) FillField(selector string, value string) error
- func (nav *Navigator) FillForm(selector string, data map[string]string) error
- func (nav *Navigator) GetCurrentURL() (string, error)
- func (nav *Navigator) GetElement(selector string) (string, error)
- func (nav *Navigator) GetElementAttribute(selector, attribute string) (string, error)
- func (nav *Navigator) GetPageSource() (*html.Node, error)
- func (nav *Navigator) HandleAlert() error
- func (nav *Navigator) Login(url, username, password, usernameSelector, passwordSelector, ... string, ...) error
- func (nav *Navigator) LoginAccountsGoogle(email, password string) error
- func (nav *Navigator) LoginWithGoogle(url string) error
- func (nav *Navigator) MakeElementVisible(selector string) error
- func (nav *Navigator) OpenURL(url string) error
- func (nav *Navigator) ReloadPage(retryCount int) error
- func (nav *Navigator) SaveImageBase64(selector, outputPath, prefixClean string) (string, error)
- func (nav *Navigator) SelectDropdown(selector, value string) error
- func (nav *Navigator) SetTimeOut(timeOut time.Duration)
- func (nav *Navigator) SwitchToDefaultContent() error
- func (nav *Navigator) SwitchToFrame(selector string) error
- func (nav *Navigator) UncheckRadioButton(selector string) error
- func (nav *Navigator) UnsafeClickButton(selector string) error
- func (nav *Navigator) UnsafeFillField(selector string, value string) error
- func (nav *Navigator) WaitForElement(selector string, timeout time.Duration) error
- func (nav *Navigator) WaitPageLoad() (string, error)
- type PageSource
- func EvaluateParallelRequests(previousResults []PageSource, crawlerFunc func(string) (*html.Node, error), ...) ([]PageSource, error)
- func ParallelRequests(requests []Request, numberOfWorkers int, delay time.Duration, ...) ([]PageSource, error)
- func RemovePageSource(slice []PageSource, s int) []PageSource
- type Request
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AskForString ¶ added in v1.4.0
AskForString prompts the user to enter a string and returns the trimmed input.
func ExtractTable ¶ added in v1.2.0
ExtractTable extracts data from a table specified by the selector. Example:
tableData, err := goSpider.ExtractTable(pageSource, "#tableID")
func ExtractText ¶ added in v1.2.0
ExtractText extracts text content from nodes specified by the parent selectors. Example:
textData, err := goSpider.ExtractText(pageSource,"#parent1", "\n")
Types ¶
type Navigator ¶
type Navigator struct {}
Navigator is a struct that holds the context for the ChromeDP session and a logger.
func NewNavigator ¶
NewNavigator creates a new Navigator instance.
Parameters:
- profilePath: the path to chrome profile defined by the user; can be passed as an empty string
- headless: if false will show chrome UI
Example:
nav := goSpider.NewNavigator("/Users/USER_NAME/Library/Application Support/Google/Chrome/Profile 2", true, initialCookies)
NewNavigator creates a new Navigator instance with enhanced logging for troubleshooting authentication issues.
func (*Navigator) CaptureScreenshot ¶
CaptureScreenshot captures a screenshot of the current browser window. Example:
err := nav.CaptureScreenshot("img")
func (*Navigator) CheckRadioButton ¶
CheckRadioButton selects a radio button specified by the selector. Example:
err := nav.CheckRadioButton("#radioButtonID")
func (*Navigator) ClickButton ¶
ClickButton clicks a button specified by the selector. Example:
err := nav.ClickButton("#buttonID")
func (*Navigator) ClickElement ¶
ClickElement clicks an element specified by the selector. Example:
err := nav.ClickElement("#elementID")
func (*Navigator) Close ¶
func (nav *Navigator) Close()
Close closes the Navigator instance and releases resources. Example:
nav.Close()
func (*Navigator) Datepicker ¶ added in v1.7.5
func (nav *Navigator) Datepicker(date, calendarButtonSelector, calendarButtonGoBack, calendarButtonsTableXpath, calendarButtonTR string) error
Datepicker deals with date-picker elements on websites: it receives a date, calculates how far it needs to go back in the picker, and finally selects the day.
Parameters: date: string in the format "dd/mm/yyyy"; calendarButtonSelector: the CSS selector of the date-picker; calendarButtonGoBack: the CSS selector of the go-back button; calendarButtonsTableXpath: the XPath of the days table, example: "//*[@id="ui-datepicker-div"]/table/tbody/tr"; calendarButtonTR: the css selector of the days table row, example: "//*[@id="ui-datepicker-div"]/table/tbody/tr"
func (*Navigator) EvaluateScript ¶ added in v1.3.0
EvaluateScript executes a JavaScript script and returns the result
func (*Navigator) ExecuteScript ¶ added in v1.3.0
ExecuteScript runs the specified JavaScript on the current page. Parameters: script: the JavaScript code to execute. Returns an error, if any.
func (*Navigator) ExtractLinks ¶
ExtractLinks extracts all links from the current page. Example:
links, err := nav.ExtractLinks()
func (*Navigator) FillField ¶
FillField fills a field specified by the selector with the provided value. Example:
err := nav.FillField("#fieldID", "value")
func (*Navigator) FillForm ¶
FillForm fills out a form specified by the selector with the provided data and submits it. Example:
formData := map[string]string{ "username": "myUsername", "password": "myPassword", } err := nav.FillForm("#loginForm", formData)
func (*Navigator) GetCurrentURL ¶
GetCurrentURL returns the current URL of the browser. Example:
currentURL, err := nav.GetCurrentURL()
func (*Navigator) GetElement ¶
GetElement retrieves the text content of an element specified by the selector. Example:
text, err := nav.GetElement("#elementID")
func (*Navigator) GetElementAttribute ¶ added in v1.7.0
GetElementAttribute retrieves the value of a specified attribute from an element identified by a CSS selector. Parameters: - selector: The CSS selector of the element. - attribute: The name of the attribute to retrieve the value of. Returns: - The value of the specified attribute. - An error if the attribute value could not be retrieved.
func (*Navigator) GetPageSource ¶ added in v1.2.0
GetPageSource captures all page HTML from the current page. Returns the page HTML as a parsed *html.Node and an error, if any. Example:
pageSource, err := nav.GetPageSource()
func (*Navigator) HandleAlert ¶
HandleAlert handles JavaScript alerts by accepting them. Example:
err := nav.HandleAlert()
func (*Navigator) Login ¶
func (nav *Navigator) Login(url, username, password, usernameSelector, passwordSelector, loginButtonSelector string, messageFailedSuccess string) error
Login logs into a website using the provided credentials and selectors. Example:
err := nav.Login("https://www.example.com/login", "username", "password", "#username", "#password", "#login-button", "#login-message-fail")
func (*Navigator) LoginAccountsGoogle ¶ added in v1.6.0
LoginAccountsGoogle performs the Google login using the given email and password.
func (*Navigator) LoginWithGoogle ¶ added in v1.4.0
LoginWithGoogle performs the Google login on the given URL
func (*Navigator) MakeElementVisible ¶ added in v1.7.3
MakeElementVisible changes the style display of an element to nil
func (*Navigator) OpenURL ¶
OpenURL opens the specified URL in the current browser context. Example:
err := nav.OpenURL("https://www.example.com")
func (*Navigator) ReloadPage ¶ added in v1.3.0
ReloadPage reloads the current page with retry logic. Parameters: retryCount: number of times to retry reloading the page in case of failure. Returns an error, if any.
func (*Navigator) SaveImageBase64 ¶ added in v1.6.0
SaveImageBase64 extracts the base64 image data from the given selector and saves it to a file.
Parameters:
- selector: the CSS selector of the CAPTCHA image element
- outputPath: the file path to save the image
- prefixClean: the prefix to clear from the source, if any
Example:
_, err := nav.SaveImageBase64("#imagemCaptcha", "captcha.png", "data:image/png;base64,")
func (*Navigator) SelectDropdown ¶
SelectDropdown selects an option in a dropdown specified by the selector and value. Example:
err := nav.SelectDropdown("#dropdownID", "optionValue")
func (*Navigator) SetTimeOut ¶ added in v1.6.4
SetTimeOut sets a timeout for all the waiting functions on the package. The standard timeout of the Navigator is 300 ms.
func (*Navigator) SwitchToDefaultContent ¶ added in v1.7.0
SwitchToDefaultContent switches the context back to the main content from an iframe context.
func (*Navigator) SwitchToFrame ¶ added in v1.7.0
SwitchToFrame switches the context to the specified iframe.
func (*Navigator) UncheckRadioButton ¶
UncheckRadioButton unchecks a checkbox specified by the selector. Example:
err := nav.UncheckRadioButton("#checkboxID")
func (*Navigator) UnsafeClickButton ¶ added in v1.7.4
UnsafeClickButton clicks a button specified by the selector. Unsafe because this method does not use the wait element feature. Example:
err := nav.UnsafeClickButton("#buttonID")
func (*Navigator) UnsafeFillField ¶ added in v1.7.4
UnsafeFillField fills a field specified by the selector with the provided value. Unsafe because this method does not use the wait element feature. Example:
err := nav.UnsafeFillField("#fieldID", "value")
func (*Navigator) WaitForElement ¶
WaitForElement waits for an element specified by the selector to be visible within the given timeout. Example:
err := nav.WaitForElement("#elementID", 5*time.Second)
func (*Navigator) WaitPageLoad ¶ added in v1.3.0
WaitPageLoad waits for the current page to fully load by checking the document.readyState property. It will retry until the page is fully loaded or the timeout of one minute is reached. Returns the page readyState as a string and an error, if any.
type PageSource ¶ added in v1.2.0
PageSource structure to hold the HTML data
func EvaluateParallelRequests ¶ added in v1.3.0
func EvaluateParallelRequests(previousResults []PageSource, crawlerFunc func(string) (*html.Node, error), evaluate func([]PageSource) ([]Request, []PageSource)) ([]PageSource, error)
EvaluateParallelRequests iterates over a set of previous results, evaluates them using the provided evaluation function, and handles re-crawling of problematic sources until all sources are valid or no further progress can be made.
Parameters: - previousResults: A slice of PageSource objects containing the initial crawl results. - crawlerFunc: A function that takes a string (URL or identifier) and returns a parsed HTML node and an error. - evaluate: A function that takes a slice of PageSource objects and returns two slices:
- A slice of Request objects for sources that need to be re-crawled.
- A slice of valid PageSource objects.
Returns: - A slice of valid PageSource objects after all problematic sources have been re-crawled and evaluated. - An error if there is a failure in the crawling process.
Example usage:
results, err := EvaluateParallelRequests(resultsFirst, Crawler, Eval)
func Eval(previousResults []PageSource) ([]Request, []PageSource) { var newRequests []Request var validResults []PageSource for _, result := range previousResults { _, err := extractDataCover(result.Page, "") if err != nil { newRequests = append(newRequests, Request{SearchString: result.Request}) } else { validResults = append(validResults, result) } } return newRequests, validResults }
func ParallelRequests ¶ added in v1.1.0
func ParallelRequests(requests []Request, numberOfWorkers int, delay time.Duration, crawlerFunc func(string) (*html.Node, error)) ([]PageSource, error)
ParallelRequests performs web scraping tasks concurrently with a specified number of workers and a delay between requests. The crawlerFunc parameter allows for flexibility in defining the web scraping logic.
Parameters: - requests: A slice of Request structures containing the data needed for each request. - numberOfWorkers: The number of concurrent workers to process the requests. - delay: The delay duration between each request to avoid overwhelming the target server. - crawlerFunc: A user-defined function that takes a process number as input and returns the html as *html.Node, and an error.
Returns: - A slice of PageSource structures containing the results of the web scraping tasks. - An error if any occurred during the requests.
Example Usage:
results, err := ParallelRequests(requests, numberOfWorkers, delay, crawlerFunc)
func RemovePageSource ¶ added in v1.3.0
func RemovePageSource(slice []PageSource, s int) []PageSource
RemovePageSource removes the element at index `s` from a slice of `PageSource` objects. It returns the modified slice without the element at index `s`.
type Request ¶ added in v1.3.0
type Request struct {
SearchString string
}
Request structure to hold user data
func RemoveRequest ¶ added in v1.3.0
RemoveRequest removes the element at index `s` from a slice of `Request` objects. It returns the modified slice without the element at index `s`.