webextractor

package

v0.3.0 Latest Latest Go to latest Published: Jun 12, 2024 License: MIT Imports: 11 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/gonzxlez/colibri

README ¶

Colibri ~ WebExtractor

WebExtractor provides the default interfaces for Colibri, ready to start crawling or extracting data on the web.

Quick Start

Do

package main

import (
	"encoding/json"
	"fmt"

	"github.com/gonzxlez/colibri"
	"github.com/gonzxlez/colibri/webextractor"
)

// rawRules holds the rules as a JSON document describing a GET request to
// https://example.com. main decodes it into a colibri.Rules value.
var rawRules = `{
	"Method": "GET",
	"URL": "https://example.com"
}`

// main creates a Colibri instance with default values, decodes rawRules into
// a colibri.Rules value, performs the request described by those rules and
// prints the URL, status code and Content-Type header of the response.
// Any error aborts the program with a panic.
func main() {
	extractor, err := webextractor.New()
	if err != nil {
		panic(err)
	}

	// Decode the JSON rules straight into a freshly allocated Rules value.
	rules := new(colibri.Rules)
	if err := json.Unmarshal([]byte(rawRules), rules); err != nil {
		panic(err)
	}

	response, err := extractor.Do(rules)
	if err != nil {
		panic(err)
	}

	fmt.Println("URL:", response.URL())
	fmt.Println("Status code:", response.StatusCode())

	contentType := response.Header().Get("Content-Type")
	fmt.Println("Content-Type", contentType)
}

URL: https://example.com
Status code: 200
Content-Type text/html; charset=UTF-8

Extract

package main

import (
	"encoding/json"
	"fmt"

	"github.com/gonzxlez/colibri"
	"github.com/gonzxlez/colibri/webextractor"
)

// rawRules holds the rules as a JSON document describing a GET request to
// https://example.com with one selector named "title" whose expression is
// //head/title (presumably XPath; confirm against the colibri parsers).
// main decodes it into a colibri.Rules value.
var rawRules = `{
	"Method": "GET",
	"URL":    "https://example.com",
	"Selectors": {
		"title": "//head/title"
	}
}`

// main creates a Colibri instance with default values, decodes rawRules into
// a colibri.Rules value, runs the extraction described by those rules and
// prints the URL, status code and Content-Type header of the response
// followed by the extracted data. Any error aborts the program with a panic.
func main() {
	extractor, err := webextractor.New()
	if err != nil {
		panic(err)
	}

	// Decode the JSON rules straight into a freshly allocated Rules value.
	rules := new(colibri.Rules)
	if err := json.Unmarshal([]byte(rawRules), rules); err != nil {
		panic(err)
	}

	result, err := extractor.Extract(rules)
	if err != nil {
		panic(err)
	}

	// The output carries both the response and the extracted data.
	response := result.Response
	fmt.Println("URL:", response.URL())
	fmt.Println("Status code:", response.StatusCode())

	contentType := response.Header().Get("Content-Type")
	fmt.Println("Content-Type", contentType)
	fmt.Println("Data:", result.Data)
}

URL: https://example.com
Status code: 200
Content-Type text/html; charset=UTF-8
Data: map[title:Example Domain]

Documentation ¶

Overview ¶

Package webextractor provides the default interfaces for Colibri, ready to start crawling or extracting data on the web.

Index ¶

func New(cookieJar ...http.CookieJar) (*colibri.Colibri, error)
type Client
- func NewClient(cookieJar ...http.CookieJar) (*Client, error)
- func (client *Client) Clear()
- func (client *Client) Do(c *colibri.Colibri, rules *colibri.Rules) (colibri.Response, error)
type ReqDelay
- func NewReqDelay() *ReqDelay
type Response
type RobotsData
- func NewRobotsData() *RobotsData
- func (robots *RobotsData) Clear()
- func (robots *RobotsData) IsAllowed(c *colibri.Colibri, rules *colibri.Rules) error

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func New ¶

func New(cookieJar ...http.CookieJar) (*colibri.Colibri, error)

New returns a new Colibri structure with default values. Returns an error if an error occurs when initializing the values.

Types ¶

type Client ¶

type Client struct {
	// Jar specifies the cookie jar.
	Jar http.CookieJar
	// contains filtered or unexported fields
}

Client represents an HTTP client. See the colibri.HTTPClient interface.

func NewClient ¶

func NewClient(cookieJar ...http.CookieJar) (*Client, error)

NewClient returns a new Client structure. The first cookieJar passed is used; if none is passed, a new cookiejar.Jar is initialized.

func (*Client) Clear ¶

func (client *Client) Clear()

Clear assigns nil to Jar.

func (*Client) Do ¶

func (client *Client) Do(c *colibri.Colibri, rules *colibri.Rules) (colibri.Response, error)

Do makes an HTTP request based on the rules.

type ReqDelay ¶

type ReqDelay struct {
	// contains filtered or unexported fields
}

ReqDelay manages the delay between each HTTP request. See the colibri.Delay interface.

func NewReqDelay ¶

func NewReqDelay() *ReqDelay

NewReqDelay returns a new ReqDelay structure.

func (*ReqDelay) Clear ¶

func (rd *ReqDelay) Clear()

func (*ReqDelay) Done ¶

func (rd *ReqDelay) Done(u *url.URL)

func (*ReqDelay) Stamp ¶

func (rd *ReqDelay) Stamp(u *url.URL)

func (*ReqDelay) Wait ¶

func (rd *ReqDelay) Wait(u *url.URL, duration time.Duration)

type Response ¶

type Response struct {
	HTTP *http.Response
	// contains filtered or unexported fields
}

Response represents an HTTP response. See the colibri.Response interface.

func (*Response) Body ¶

func (resp *Response) Body() io.ReadCloser

func (*Response) Do ¶

func (resp *Response) Do(rules *colibri.Rules) (colibri.Response, error)

func (*Response) Extract ¶

func (resp *Response) Extract(rules *colibri.Rules) (*colibri.Output, error)

func (*Response) Header ¶

func (resp *Response) Header() http.Header

func (*Response) Redirects ¶

func (resp *Response) Redirects() []*url.URL

func (*Response) Serializable ¶

func (resp *Response) Serializable() map[string]any

func (*Response) StatusCode ¶

func (resp *Response) StatusCode() int

func (*Response) URL ¶

func (resp *Response) URL() *url.URL

type RobotsData ¶

type RobotsData struct {
	// contains filtered or unexported fields
}

RobotsData gets, stores and parses robots.txt restrictions.

func NewRobotsData ¶

func NewRobotsData() *RobotsData

NewRobotsData returns a new RobotsData structure.

func (*RobotsData) Clear ¶

func (robots *RobotsData) Clear()

Clear removes stored robots.txt restrictions.

func (*RobotsData) IsAllowed ¶

func (robots *RobotsData) IsAllowed(c *colibri.Colibri, rules *colibri.Rules) error

IsAllowed verifies that the User-Agent can access the URL. It gets and stores the robots.txt restrictions of the URL's host so they can be reused for other URLs with the same host.

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
parsers	parsers is an interface that Colibri can use to parse the content of responses.

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL