scraper

package
v0.5.42 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 25, 2024 License: GPL-3.0 Imports: 24 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Config

// Config defines the overall structure of the scraper configuration.
// Values are taken from a YAML config file, environment variables, or both.
//
// Fields and their YAML keys:
//   - Writer ("writer"): an output.WriterConfig; presumably configures how
//     results are written — confirm against the output package.
//   - Scrapers ("scrapers"): the list of Scraper definitions.
//   - Global ("global"): parameters needed across all scrapers.
type Config struct {
	Writer   output.WriterConfig `yaml:"writer,omitempty"`
	Scrapers []Scraper           `yaml:"scrapers,omitempty"`
	Global   GlobalConfig        `yaml:"global,omitempty"`
}

Config defines the overall structure of the scraper configuration. Values are taken from a YAML config file, from environment variables, or from both.

func NewConfig added in v0.2.1

func NewConfig(configPath string) (*Config, error)

type DateComponent

// A DateComponent is used to find a specific part of a date within an HTML
// document.
//
// Fields and their YAML keys:
//   - Covers ("covers"): a date.CoveredDateParts value describing which parts
//     of the date this component covers.
//   - ElementLocation ("location"): where the component's string is found.
//   - Layout ("layout"): a list of layout strings; presumably the date layouts
//     tried when parsing — TODO confirm against the date parsing code.
//   - Transform ("transform"): TransformConfig entries applied to the string;
//     per TransformConfig, this processing needs to happen before dates are
//     extracted.
type DateComponent struct {
	Covers          date.CoveredDateParts `yaml:"covers"`
	ElementLocation ElementLocation       `yaml:"location"`
	Layout          []string              `yaml:"layout"`
	Transform       []TransformConfig     `yaml:"transform,omitempty"`
}

A DateComponent is used to find a specific part of a date within an HTML document.

type ElementLocation

// ElementLocation is used to find a specific string in an HTML document.
//
// Every field is optional in YAML (all tags carry omitempty). RegexExtract is
// a RegexConfig, which extracts a substring based on a pattern and an index.
//
// NOTE(review): the exact semantics of the remaining fields (Selector,
// JsonSelector, ChildIndex, Attr, MaxLength, EntireSubtree, AllNodes,
// Separator, Default) are not visible from this declaration; the names suggest
// a selector-based lookup with optional attribute extraction, truncation,
// joining of multiple nodes and a fallback value — confirm against the
// extraction code before relying on this.
type ElementLocation struct {
	Selector      string      `yaml:"selector,omitempty"`
	JsonSelector  string      `yaml:"json_selector,omitempty"`
	ChildIndex    int         `yaml:"child_index,omitempty"`
	RegexExtract  RegexConfig `yaml:"regex_extract,omitempty"`
	Attr          string      `yaml:"attr,omitempty"`
	MaxLength     int         `yaml:"max_length,omitempty"`
	EntireSubtree bool        `yaml:"entire_subtree,omitempty"`
	AllNodes      bool        `yaml:"all_nodes,omitempty"`
	Separator     string      `yaml:"separator,omitempty"`
	Default       string      `yaml:"default,omitempty"`
}

ElementLocation is used to find a specific string in an HTML document.

type ElementLocations added in v0.4.3

// ElementLocations is a list of ElementLocation values. The type defines its
// own UnmarshalYAML method, so its YAML decoding is customized.
// NOTE(review): presumably this allows both a single location and a sequence
// of locations under the same key — confirm in UnmarshalYAML.
type ElementLocations []ElementLocation

func (*ElementLocations) UnmarshalYAML added in v0.4.3

func (e *ElementLocations) UnmarshalYAML(value *yaml.Node) error

type Field added in v0.2.10

// A Field contains all the information necessary to scrape a dynamic field
// from a website, i.e. a field whose value changes for each item.
//
// The per-field comments below state which field types (text, url, date) each
// option applies to.
// NOTE(review): Value presumably holds a predefined (static) value for fields
// that are not dynamic — confirm against the scraping code.
type Field struct {
	Name             string           `yaml:"name"`
	Value            string           `yaml:"value,omitempty"`
	Type             string           `yaml:"type,omitempty"`     // can currently be text, url or date
	ElementLocations ElementLocations `yaml:"location,omitempty"` // elements are extracted strings joined using the given Separator
	Separator        string           `yaml:"separator,omitempty"`
	// If a field can be found on a subpage the following variable has to contain a field name of
	// a field of type 'url' that is located on the main page.
	OnSubpage    string            `yaml:"on_subpage,omitempty"`    // applies to text, url, date
	CanBeEmpty   bool              `yaml:"can_be_empty,omitempty"`  // applies to text, url
	Components   []DateComponent   `yaml:"components,omitempty"`    // applies to date
	DateLocation string            `yaml:"date_location,omitempty"` // applies to date
	DateLanguage string            `yaml:"date_language,omitempty"` // applies to date
	Hide         bool              `yaml:"hide,omitempty"`          // applies to text, url, date
	GuessYear    bool              `yaml:"guess_year,omitempty"`    // applies to date
	Transform    []TransformConfig `yaml:"transform,omitempty"`     // applies to text
}

A Field contains all the information necessary to scrape a dynamic field from a website, i.e. a field whose value changes for each item.

type Filter

// A Filter is used to filter certain items from the result list.
//
// Field ("field"), Expression ("exp") and Match ("match") carry explicit YAML
// tags. Type, RegexComp, DateComp and DateOp have no tag.
//
// NOTE(review): the untagged fields look like derived state filled in by
// Initialize(fieldType) rather than user configuration — confirm. If so,
// consider tagging them `yaml:"-"` so they can never be read from or written
// to a config file under their default keys.
type Filter struct {
	Field      string `yaml:"field"`
	Type       string
	Expression string `yaml:"exp"` // changed from 'regex' to 'exp' in version 0.5.7
	RegexComp  *regexp.Regexp
	DateComp   time.Time
	DateOp     string
	Match      bool `yaml:"match"`
}

A Filter is used to filter certain items from the result list

func (*Filter) FilterMatch added in v0.5.7

func (f *Filter) FilterMatch(value interface{}) bool

func (*Filter) Initialize added in v0.5.7

func (f *Filter) Initialize(fieldType string) error

type GlobalConfig added in v0.2.1

// GlobalConfig is used for storing global configuration parameters that are
// needed across all scrapers.
//
// UserAgent is read from the "user-agent" YAML key (note the hyphen, unlike
// the underscore-separated keys used by the other config types).
type GlobalConfig struct {
	UserAgent string `yaml:"user-agent"`
}

GlobalConfig is used for storing global configuration parameters that are needed across all scrapers

type Paginator added in v0.5.0

// A Paginator is used to paginate through a website.
//
// Location ("location") is an ElementLocation; presumably it locates the link
// to the next page — TODO confirm. MaxPages ("max_pages") presumably caps the
// number of pages visited; the meaning of the zero value is not visible here.
type Paginator struct {
	Location ElementLocation `yaml:"location,omitempty"`
	MaxPages int             `yaml:"max_pages,omitempty"`
}

A Paginator is used to paginate through a website

type RegexConfig

// RegexConfig is used for extracting a substring from a string based on the
// given RegexPattern and Index.
//
// Fields and their YAML keys: RegexPattern ("exp"), Index ("index") and
// IgnoreErrors ("ignore_errors").
// NOTE(review): Index presumably selects which match to use, and IgnoreErrors
// presumably suppresses extraction failures — confirm against the extraction
// code.
type RegexConfig struct {
	RegexPattern string `yaml:"exp"`
	Index        int    `yaml:"index"`
	IgnoreErrors bool   `yaml:"ignore_errors"`
}

RegexConfig is used for extracting a substring from a string based on the given RegexPattern and Index

type Scraper

// A Scraper contains all the necessary config parameters and structs needed to
// extract the desired information from a website.
//
// Name, URL and Item are read from the "name", "url" and "item" YAML keys.
// Fields lists the fields to extract, Filters the filters applied to the
// result list, and Paginator how to paginate through the website.
// NOTE(review): Item presumably holds the selector that identifies one item on
// the page — confirm against the scraping code.
type Scraper struct {
	Name         string               `yaml:"name"`
	URL          string               `yaml:"url"`
	Item         string               `yaml:"item"`
	Fields       []Field              `yaml:"fields,omitempty"`
	Filters      []*Filter            `yaml:"filters,omitempty"`
	Paginator    Paginator            `yaml:"paginator,omitempty"`
	RenderJs     bool                 `yaml:"render_js,omitempty"`
	PageLoadWait int                  `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
	Interaction  []*types.Interaction `yaml:"interaction,omitempty"`
	// contains filtered or unexported fields
}

A Scraper contains all the necessary config parameters and structs needed to extract the desired information from a website

func (Scraper) Scrape added in v0.5.38

func (c Scraper) Scrape(globalConfig *GlobalConfig, rawDyn bool) (*ScrapingResult, error)

Scrape fetches and returns all items from a website according to the Scraper's parameters. When rawDyn is set to true, the returned items are not processed according to their type; instead, the raw values based only on the location are returned (ignore regex_extract??). In that mode only dynamic fields are returned, i.e. fields that don't have a predefined value and that are present on the main page (not on subpages). This is used by the ML feature generation.

type ScrapingResult added in v0.5.38

// ScrapingResult is the value returned by Scraper.Scrape. It bundles the
// scraped items with a pointer to the ScrapingStats for the run.
//
// Each entry of Items is a map with string keys.
// NOTE(review): presumably the keys are field names and the values the
// extracted field values — confirm against Scrape.
type ScrapingResult struct {
	Items []map[string]interface{}
	Stats *ScrapingStats
}

type ScrapingStats added in v0.5.38

// ScrapingStats holds a name and two counters; it is referenced from
// ScrapingResult.Stats.
// NOTE(review): presumably Name is the scraper's name, NrItems the number of
// items scraped and NrErrors the number of errors encountered — confirm
// against Scrape.
type ScrapingStats struct {
	Name     string
	NrItems  int
	NrErrors int
}

type TransformConfig added in v0.3.5

// TransformConfig is used to replace an existing substring with some other
// kind of string. Processing needs to happen before extracting dates.
//
// YAML keys: "type", "regex" and "replace". Note that the pattern key here is
// "regex", whereas RegexConfig and Filter use "exp" for their expressions.
type TransformConfig struct {
	TransformType string `yaml:"type,omitempty"`    // only regex-replace for now
	RegexPattern  string `yaml:"regex,omitempty"`   // a container for the pattern
	Replacement   string `yaml:"replace,omitempty"` // a plain string for replacement
}

TransformConfig is used to replace an existing substring with some other kind of string. Processing needs to happen before extracting dates.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL