scraper

package

Version: v0.16.5 (latest) · Published: Aug 29, 2024 · License: Unlicense · Imports: 16 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/luevano/mangoprovider

Links

Open Source Insights

Documentation ¶

Index ¶

type ChapterExtractor
type Configuration
- func (c *Configuration) GetActions() map[rod.ActionType]rod.Action
type MangaByIDExtractor
type MangaExtractor
type PageExtractor
type Scraper
- func NewScraper(config *Configuration, options mango.Options) (scraper *Scraper, err error)
type VolumeExtractor

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type ChapterExtractor ¶

// ChapterExtractor is responsible for finding chapter elements by selector
// and extracting the data.
type ChapterExtractor struct {
	// Selector: CSS selector used to find the chapter elements.
	Selector string
	// Title: Get the title from the element found by the selector.
	Title func(*goquery.Selection) string
	// URL: Get the URL from the element found by the selector.
	URL func(*goquery.Selection) string
	// ID: Get the id from the parsed URL string.
	ID func(string) string
	// Date: Get the published date of the chapter, if available.
	Date func(*goquery.Selection) metadata.Date
	// ScanlationGroup: Get the scanlation group, if available.
	ScanlationGroup func(*goquery.Selection) string
	// Action: Something to execute on a headless browser after the page is loaded.
	Action rod.Action
}

ChapterExtractor: responsible for finding chapter elements by selector and extracting the data.

type Configuration ¶ added in v0.5.3

// Configuration defines the behavior of the scraper.
type Configuration struct {
	// Name: Name of the scraper. E.g. "mangapill"
	Name string
	// Delay: Delay between requests.
	Delay time.Duration
	// LoadWait: Wait time for the page to load
	// (e.g. for headless requests where the HTML is loaded dynamically).
	LoadWait time.Duration
	// Parallelism: Parallelism of the scraper.
	// NOTE(review): presumably the number of concurrent requests — confirm.
	Parallelism uint8

	// ReverseChapters: Whether chapters should be shown in reverse order.
	ReverseChapters bool

	// NeedsHeadlessBrowser: Whether a headless browser should be used to proxy any request.
	NeedsHeadlessBrowser bool
	// LocalStorage: Local storage values to set before the requests.
	LocalStorage map[string]string
	// Headers: Custom headers to pass to the request.
	Headers map[string]string

	// BaseURL: Base URL of the source.
	BaseURL string
	// TODO: remove unnecessary baseUrl in these generate methods
	//
	// GenerateSearchURL: Create the search URL from the query.
	// E.g. "one piece" -> "https://manganelo.com/search/story/one_piece"
	GenerateSearchURL func(baseUrl, query string) (string, error)
	// GenerateSearchByIDURL: Create the search URL from the id.
	// E.g. (one piece) "manga-aa88620" -> "https://chapmanganelo.com/manga-aa88620"
	GenerateSearchByIDURL func(baseUrl, id string) (string, error)

	// MangaByIDExtractor: Responsible for finding manga elements and extracting the data.
	//
	// Used when the id of the manga is provided and the elements need to be fetched from the
	// manga page instead of the mangas list.
	MangaByIDExtractor *MangaByIDExtractor
	// MangaExtractor: Responsible for finding manga elements and extracting the data.
	MangaExtractor *MangaExtractor
	// VolumeExtractor: Responsible for finding volume elements and extracting the data.
	VolumeExtractor *VolumeExtractor
	// ChapterExtractor: Responsible for finding chapter elements and extracting the data.
	ChapterExtractor *ChapterExtractor
	// PageExtractor: Responsible for finding page elements and extracting the required data.
	PageExtractor *PageExtractor
}

Configuration: Defines behavior of the scraper.

func (*Configuration) GetActions ¶ added in v0.7.0

func (c *Configuration) GetActions() map[rod.ActionType]rod.Action

GetActions: Returns the extractors' Actions as a map keyed by rod.ActionType.

type MangaByIDExtractor ¶ added in v0.10.0

// MangaByIDExtractor is responsible for finding manga elements by selector
// and extracting the data.
//
// Used when the id of the manga is provided and the elements need to be
// fetched from the manga page instead of the mangas list.
type MangaByIDExtractor struct {
	// Selector: CSS selector used to find the manga elements.
	Selector string
	// Title: Get the title from the element found by the selector.
	Title func(*goquery.Selection) string
	// Cover: Get the cover from the element found by the selector.
	Cover func(*goquery.Selection) string
}

MangaByIDExtractor: responsible for finding manga elements by selector and extracting the data.

Used when the id of the manga is provided and the elements need to be fetched from the manga page instead of the mangas list.

type MangaExtractor ¶

// MangaExtractor is responsible for finding manga elements by selector and
// extracting the data.
type MangaExtractor struct {
	// Selector: CSS selector used to find the manga elements.
	Selector string
	// Title: Get the title from the element found by the selector.
	Title func(*goquery.Selection) string
	// URL: Get the URL from the element found by the selector.
	URL func(*goquery.Selection) string
	// ID: Get the id from the parsed URL string.
	ID func(string) string
	// Cover: Get the cover from the element found by the selector.
	Cover func(*goquery.Selection) string
	// Action: Something to execute on a headless browser after the page is loaded.
	Action rod.Action
}

MangaExtractor: responsible for finding manga elements by selector and extracting the data.

type PageExtractor ¶

// PageExtractor is responsible for finding page elements by selector and
// extracting the data.
//
// NOTE(review): how URL and URLs interact (e.g. whether only one of them is
// expected to be set) is not visible here — confirm against the scraper code.
type PageExtractor struct {
	// Selector: CSS selector used to find the page elements.
	Selector string
	// URL: Get the URL from the element found by the selector.
	URL func(*goquery.Selection) string
	// URLs: Get all URLs from the element found by the selector.
	URLs func(*goquery.Selection) []string
	// Action: Something to execute on a headless browser after the page is loaded.
	Action rod.Action
}

PageExtractor: responsible for finding page elements by selector and extracting the data.

type Scraper ¶

// Scraper is a generic scraper that downloads HTML pages and parses them.
type Scraper struct {
	// contains filtered or unexported fields
}

Scraper: Generic scraper that downloads HTML pages and parses them.

func NewScraper ¶

func NewScraper(config *Configuration, options mango.Options) (scraper *Scraper, err error)

NewScraper: generates a new scraper with given configuration and options.

func (*Scraper) ChapterPages ¶

func (s *Scraper) ChapterPages(_ctx context.Context, store mango.Store, chapter mango.Chapter) ([]mangadata.Page, error)

func (*Scraper) MangaVolumes ¶

func (s *Scraper) MangaVolumes(_ctx context.Context, store mango.Store, manga mango.Manga) ([]mangadata.Volume, error)

func (*Scraper) SearchMangas ¶

func (s *Scraper) SearchMangas(_ctx context.Context, store mango.Store, query string) ([]mangadata.Manga, error)

func (*Scraper) VolumeChapters ¶

func (s *Scraper) VolumeChapters(_ctx context.Context, store mango.Store, volume mango.Volume) ([]mangadata.Chapter, error)

type VolumeExtractor ¶

// VolumeExtractor is responsible for finding volume elements by selector and
// extracting the data.
type VolumeExtractor struct {
	// Selector: CSS selector used to find the volume elements.
	Selector string
	// Number: Get the volume number from the element found by the selector.
	Number func(*goquery.Selection) float32
	// Action: Something to execute on a headless browser after the page is loaded.
	Action rod.Action
}

VolumeExtractor: responsible for finding volume elements by selector and extracting the data.

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
headless
flaresolverr
rod

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL