scrape

package
v0.5.38 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 12, 2024 License: GPL-3.0 Imports: 24 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var DoPruning = true
View Source
var SkipSubURLExt = map[string]bool{
	".gif":  true,
	".jfif": true,
	".jpeg": true,
	".jpg":  true,
	".png":  true,
}
View Source
var SkipTag = map[string]bool{
	"noscript": true,
	"script":   true,
	"style":    true,
}

Functions

func GQDocument

func GQDocument(s *Scraper, gqdoc *goquery.Document, rawDyn bool) (output.ItemMaps, error)

GQDocument fetches and returns all items from a website according to the Scraper's parameters. When rawDyn is set to true, the items returned are not processed according to their type; instead, the raw values based only on the location are returned (ignore regex_extract??). In that case only dynamic fields are returned, i.e. fields that don't have a predefined value and that are present on the main page (not subpages). This is used by the ML feature generation.

func GQSelection

func GQSelection(s *Scraper, sel *goquery.Selection, baseUrl string, rawDyn bool) (output.ItemMap, error)

GQSelection fetches and returns an item from a website according to the Scraper's parameters. When rawDyn is set to true, the item returned is not processed according to its type; instead, the raw value based only on the location is returned (ignore regex_extract??). In that case only dynamic fields are returned, i.e. fields that don't have a predefined value and that are present on the main page (not subpages). This is used by the ML feature generation.

func GetURL

func GetURL(e *ElementLocation, sel *goquery.Selection, baseURL string) (*url.URL, error)

func Page

func Page(s *Scraper, globalConfig *GlobalConfig, rawDyn bool, path string) (output.ItemMaps, error)

Page fetches and returns all items from a webpage according to the Scraper's parameters. When rawDyn is set to true, the items returned are not processed according to their type; instead, the raw values based only on the location are returned (ignore regex_extract??). In that case only dynamic fields are returned, i.e. fields that don't have a predefined value and that are present on the main page (not subpages). This is used by the ML feature generation.

func SubGQDocument

func SubGQDocument(c *Config, s *Scraper, im output.ItemMap, fname string, gqdoc *goquery.Document) error

func Subpages

func Subpages(c *Config, s *Scraper, ims output.ItemMaps, fetchFn func(string) (*goquery.Document, error)) error

Types

type Config

type Config struct {
	ID       ConfigID
	Writer   output.WriterConfig `yaml:"writer,omitempty"`
	Scrapers []Scraper           `yaml:"scrapers,omitempty"`
	Global   GlobalConfig        `yaml:"global,omitempty"`
	ItemMaps output.ItemMaps
}

Config defines the overall structure of the scraper configuration. Values will be taken from a config yml file or environment variables or both.

func ReadConfig

func ReadConfig(configPath string) (*Config, error)

func (Config) Copy

func (c Config) Copy() *Config

func (Config) String

func (c Config) String() string

func (Config) WriteToFile

func (c Config) WriteToFile(dir string) error

type ConfigID

type ConfigID struct {
	Slug  string
	ID    string
	Field string
	SubID string
}

func (ConfigID) String

func (cid ConfigID) String() string

type DateComponent

type DateComponent struct {
	Covers          date.CoveredDateParts `yaml:"covers"`
	ElementLocation ElementLocation       `yaml:"location"`
	Layout          []string              `yaml:"layout"`
	Transform       []TransformConfig     `yaml:"transform,omitempty"`
}

A DateComponent is used to find a specific part of a date within an HTML document.

type ElementLocation

type ElementLocation struct {
	Selector      string      `yaml:"selector,omitempty"`
	JsonSelector  string      `yaml:"json_selector,omitempty"`
	ChildIndex    int         `yaml:"child_index,omitempty"`
	RegexExtract  RegexConfig `yaml:"regex_extract,omitempty"`
	Attr          string      `yaml:"attr,omitempty"`
	MaxLength     int         `yaml:"max_length,omitempty"`
	EntireSubtree bool        `yaml:"entire_subtree,omitempty"`
	AllNodes      bool        `yaml:"all_nodes,omitempty"`
	Separator     string      `yaml:"separator,omitempty"`
}

ElementLocation is used to find a specific string in an HTML document.

type ElementLocations

type ElementLocations []ElementLocation

func (*ElementLocations) UnmarshalYAML

func (e *ElementLocations) UnmarshalYAML(value *yaml.Node) error

type Field

type Field struct {
	Name             string           `yaml:"name"`
	Value            string           `yaml:"value,omitempty"`
	Type             string           `yaml:"type,omitempty"`     // can currently be text, url or date
	ElementLocations ElementLocations `yaml:"location,omitempty"` // elements are extracted strings joined using the given Separator
	Default          string           `yaml:"default,omitempty"`  // the default for a dynamic field (text or url) if no value is found
	Separator        string           `yaml:"separator,omitempty"`
	// If a field can be found on a subpage the following variable has to contain a field name of
	// a field of type 'url' that is located on the main page.
	OnSubpage    string            `yaml:"on_subpage,omitempty"`    // applies to text, url, date
	CanBeEmpty   bool              `yaml:"can_be_empty,omitempty"`  // applies to text, url
	Components   []DateComponent   `yaml:"components,omitempty"`    // applies to date
	DateLocation string            `yaml:"date_location,omitempty"` // applies to date
	DateLanguage string            `yaml:"date_language,omitempty"` // applies to date
	Hide         bool              `yaml:"hide,omitempty"`          // applies to text, url, date
	GuessYear    bool              `yaml:"guess_year,omitempty"`    // applies to date
	Transform    []TransformConfig `yaml:"transform,omitempty"`     // applies to text
}

A Field contains all the information necessary to scrape a dynamic field from a website, i.e. a field whose value changes for each item.

type Filter

type Filter struct {
	Field      string `yaml:"field"`
	Type       string
	Expression string `yaml:"exp"` // changed from 'regex' to 'exp' in version 0.5.7
	RegexComp  *regexp.Regexp
	DateComp   time.Time
	DateOp     string
	Match      bool `yaml:"match"`
}

A Filter is used to filter certain items from the result list

func (*Filter) FilterMatch

func (f *Filter) FilterMatch(value interface{}) bool

func (*Filter) Initialize

func (f *Filter) Initialize(fieldType string) error

type GlobalConfig

type GlobalConfig struct {
	UserAgent string `yaml:"user-agent"`
}

GlobalConfig is used for storing global configuration parameters that are needed across all scrapers

type Paginator

type Paginator struct {
	Location ElementLocation `yaml:"location,omitempty"`
	MaxPages int             `yaml:"max_pages,omitempty"`
}

A Paginator is used to paginate through a website

type RegexConfig

type RegexConfig struct {
	RegexPattern string `yaml:"exp"`
	Index        int    `yaml:"index"`
}

RegexConfig is used for extracting a substring from a string based on the given RegexPattern and Index

type Scraper

type Scraper struct {
	Name         string               `yaml:"name"`
	URL          string               `yaml:"url"`
	Item         string               `yaml:"item"`
	Fields       []Field              `yaml:"fields,omitempty"`
	Filters      []*Filter            `yaml:"filters,omitempty"`
	Paginators   []Paginator          `yaml:"paginators,omitempty"`
	RenderJs     bool                 `yaml:"render_js,omitempty"`
	PageLoadWait int                  `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
	Interaction  []*types.Interaction `yaml:"interaction,omitempty"`
	// contains filtered or unexported fields
}

A Scraper contains all the necessary config parameters and structs needed to extract the desired information from a website

func (*Scraper) GetSubpageURLFields

func (c *Scraper) GetSubpageURLFields() []Field

type TransformConfig

type TransformConfig struct {
	TransformType string `yaml:"type,omitempty"`    // only regex-replace for now
	RegexPattern  string `yaml:"regex,omitempty"`   // a container for the pattern
	Replacement   string `yaml:"replace,omitempty"` // a plain string for replacement
}

TransformConfig is used to replace an existing substring with some other kind of string. Processing needs to happen before extracting dates.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL