webextractor

package
v0.3.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 12, 2024 License: MIT Imports: 11 Imported by: 0

README

Colibri ~ WebExtractor

WebExtractor provides the default interfaces for Colibri, ready for crawling or extracting data from the web.

Quick Start

Do
package main

import (
	"encoding/json"
	"fmt"

	"github.com/gonzxlez/colibri"
	"github.com/gonzxlez/colibri/webextractor"
)

// rawRules holds the JSON-encoded rules that main decodes into a
// colibri.Rules value: a GET request to https://example.com.
var rawRules = `{
	"Method": "GET",
	"URL": "https://example.com"
}`

// main builds a default Colibri instance, decodes rawRules, performs the
// request they describe and prints the response URL, status code and
// Content-Type header. Any failure along the way aborts with a panic.
func main() {
	extractor, err := webextractor.New()
	if err != nil {
		panic(err)
	}

	// Decode the JSON rules straight into a freshly allocated Rules value.
	rules := new(colibri.Rules)
	if err := json.Unmarshal([]byte(rawRules), rules); err != nil {
		panic(err)
	}

	response, err := extractor.Do(rules)
	if err != nil {
		panic(err)
	}

	fmt.Println("URL:", response.URL())
	fmt.Println("Status code:", response.StatusCode())
	fmt.Println("Content-Type", response.Header().Get("Content-Type"))
}
URL: https://example.com
Status code: 200
Content-Type text/html; charset=UTF-8
Extract
package main

import (
	"encoding/json"
	"fmt"

	"github.com/gonzxlez/colibri"
	"github.com/gonzxlez/colibri/webextractor"
)

// rawRules holds the JSON-encoded rules that main decodes into a
// colibri.Rules value: a GET request to https://example.com plus a "title"
// selector using the expression //head/title.
var rawRules = `{
	"Method": "GET",
	"URL":    "https://example.com",
	"Selectors": {
		"title": "//head/title"
	}
}`

// main builds a default Colibri instance, decodes rawRules, runs the
// extraction they describe and prints the response URL, status code and
// Content-Type header followed by the extracted data. Any failure along the
// way aborts with a panic.
func main() {
	extractor, err := webextractor.New()
	if err != nil {
		panic(err)
	}

	// Decode the JSON rules straight into a freshly allocated Rules value.
	rules := new(colibri.Rules)
	if err := json.Unmarshal([]byte(rawRules), rules); err != nil {
		panic(err)
	}

	result, err := extractor.Extract(rules)
	if err != nil {
		panic(err)
	}

	response := result.Response
	fmt.Println("URL:", response.URL())
	fmt.Println("Status code:", response.StatusCode())
	fmt.Println("Content-Type", response.Header().Get("Content-Type"))
	fmt.Println("Data:", result.Data)
}

URL: https://example.com
Status code: 200
Content-Type text/html; charset=UTF-8
Data: map[title:Example Domain]

Documentation

Overview

Package webextractor provides the default interfaces for Colibri, ready for crawling or extracting data from the web.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func New

func New(cookieJar ...http.CookieJar) (*colibri.Colibri, error)

New returns a new Colibri structure with default values. It returns an error if initializing those values fails.

Types

type Client

type Client struct {
	// Jar specifies the cookie jar.
	Jar http.CookieJar
	// contains filtered or unexported fields
}

Client represents an HTTP client. See the colibri.HTTPClient interface.

func NewClient

func NewClient(cookieJar ...http.CookieJar) (*Client, error)

NewClient returns a new Client structure. The first cookieJar passed is used; if none is passed, a new cookiejar.Jar is initialized.

func (*Client) Clear

func (client *Client) Clear()

Clear assigns nil to Jar.

func (*Client) Do

func (client *Client) Do(c *colibri.Colibri, rules *colibri.Rules) (colibri.Response, error)

Do makes an HTTP request based on the rules.

type ReqDelay

type ReqDelay struct {
	// contains filtered or unexported fields
}

ReqDelay manages the delay between each HTTP request. See the colibri.Delay interface.

func NewReqDelay

func NewReqDelay() *ReqDelay

NewReqDelay returns a new ReqDelay structure.

func (*ReqDelay) Clear

func (rd *ReqDelay) Clear()

func (*ReqDelay) Done

func (rd *ReqDelay) Done(u *url.URL)

func (*ReqDelay) Stamp

func (rd *ReqDelay) Stamp(u *url.URL)

func (*ReqDelay) Wait

func (rd *ReqDelay) Wait(u *url.URL, duration time.Duration)

type Response

type Response struct {
	HTTP *http.Response
	// contains filtered or unexported fields
}

Response represents an HTTP response. See the colibri.Response interface.

func (*Response) Body

func (resp *Response) Body() io.ReadCloser

func (*Response) Do

func (resp *Response) Do(rules *colibri.Rules) (colibri.Response, error)

func (*Response) Extract

func (resp *Response) Extract(rules *colibri.Rules) (*colibri.Output, error)

func (*Response) Header

func (resp *Response) Header() http.Header

func (*Response) Redirects

func (resp *Response) Redirects() []*url.URL

func (*Response) Serializable

func (resp *Response) Serializable() map[string]any

func (*Response) StatusCode

func (resp *Response) StatusCode() int

func (*Response) URL

func (resp *Response) URL() *url.URL

type RobotsData

type RobotsData struct {
	// contains filtered or unexported fields
}

RobotsData gets, stores and parses robots.txt restrictions.

func NewRobotsData

func NewRobotsData() *RobotsData

NewRobotsData returns a new RobotsData structure.

func (*RobotsData) Clear

func (robots *RobotsData) Clear()

Clear removes stored robots.txt restrictions.

func (*RobotsData) IsAllowed

func (robots *RobotsData) IsAllowed(c *colibri.Colibri, rules *colibri.Rules) error

IsAllowed verifies that the User-Agent can access the URL. It gets and stores the robots.txt restrictions of the URL's host so they can be reused for other URLs with the same host.

Directories

Path Synopsis
parsers is an interface that Colibri can use to parse the content of responses.
parsers is an interface that Colibri can use to parse the content of responses.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL