scraper

package
v0.20.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 30, 2023 License: AGPL-3.0 Imports: 46 Imported by: 0

Documentation

Index

Constants

View Source
// FreeonesScraperID is the scraper ID for the built-in Freeones scraper.
const FreeonesScraperID = "builtin_freeones"

FreeonesScraperID is the scraper ID for the built-in Freeones scraper

Variables

View Source
var (
	// ErrMaxRedirects is returned if the maximum number of HTTP redirects is reached.
	ErrMaxRedirects = errors.New("maximum number of HTTP redirects reached")

	// ErrNotFound is returned when an entity isn't found.
	ErrNotFound = errors.New("scraper not found")

	// ErrNotSupported is returned when a given invocation isn't supported, and there
	// is a guard function which should be able to guard against it.
	ErrNotSupported = errors.New("scraper operation not supported")
)
View Source
// ErrScraperScript is a sentinel error with the message "scraper script error".
// NOTE(review): presumably returned or wrapped when running a scraper script
// fails — confirm against the call sites, which are not visible here.
var ErrScraperScript = errors.New("scraper script error")

Functions

This section is empty.

Types

type Cache added in v0.3.0

// Cache stores the database of scrapers.
type Cache struct {
	// contains filtered or unexported fields
}

Cache stores the database of scrapers

func NewCache added in v0.3.0

func NewCache(globalConfig GlobalConfig, txnManager txn.Manager, repo Repository) (*Cache, error)

NewCache returns a new Cache, loading scraper configurations from the scraper path provided in the global config object. It returns the new instance, or an error if the scraper directory could not be loaded.

Scraper configurations are loaded from yml files in the provided scrapers directory and any subdirectories.

func (Cache) GetScraper added in v0.11.0

func (c Cache) GetScraper(scraperID string) *Scraper

GetScraper returns the scraper matching the provided id.

func (Cache) ListScrapers added in v0.12.0

func (c Cache) ListScrapers(tys []ScrapeContentType) []*Scraper

ListScrapers lists scrapers matching one of the given types. Returns a list of scrapers, sorted by their ID.

func (*Cache) ReloadScrapers added in v0.3.0

func (c *Cache) ReloadScrapers() error

ReloadScrapers clears the scraper cache and reloads from the scraper path. In the event of an error during loading, the cache will be left empty.

func (Cache) ScrapeFragment added in v0.12.0

func (c Cache) ScrapeFragment(ctx context.Context, id string, input Input) (ScrapedContent, error)

ScrapeFragment uses the given fragment input to scrape

func (Cache) ScrapeID added in v0.12.0

func (c Cache) ScrapeID(ctx context.Context, scraperID string, id int, ty ScrapeContentType) (ScrapedContent, error)

func (Cache) ScrapeName added in v0.12.0

func (c Cache) ScrapeName(ctx context.Context, id, query string, ty ScrapeContentType) ([]ScrapedContent, error)

func (Cache) ScrapeURL added in v0.12.0

func (c Cache) ScrapeURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error)

ScrapeURL scrapes a given url for the given content. Searches the scraper cache and picks the first scraper capable of scraping the given url into the desired content. Returns the scraped content or an error if the scrape fails.

type GalleryFinder added in v0.17.0

// GalleryFinder looks up a gallery by its integer ID and also embeds
// models.FileLoader. It is the type of Repository.GalleryFinder.
type GalleryFinder interface {
	Find(ctx context.Context, id int) (*models.Gallery, error)
	models.FileLoader
}

type GlobalConfig added in v0.3.0

// GlobalConfig contains the global scraper options.
// It is the configuration argument accepted by NewCache.
type GlobalConfig interface {
	GetScraperUserAgent() string
	GetScrapersPath() string
	GetScraperCDPPath() string
	GetScraperCertCheck() bool
	GetPythonPath() string
	GetProxy() string
}

GlobalConfig contains the global scraper options.

type Input added in v0.12.0

// Input coalesces inputs of different types into a single structure.
// The system expects one of these to be set, and the remaining ones to be nil.
type Input struct {
	Performer *ScrapedPerformerInput
	Scene     *ScrapedSceneInput
	Gallery   *ScrapedGalleryInput
}

Input coalesces inputs of different types into a single structure. The system expects one of these to be set, and the remaining to be set to nil.

type PerformerFinder added in v0.17.0

// PerformerFinder combines match.PerformerAutoTagQueryer and
// match.PerformerFinder. It is the type of Repository.PerformerFinder.
type PerformerFinder interface {
	match.PerformerAutoTagQueryer
	match.PerformerFinder
}

type QueryType added in v0.12.0

// QueryType is a simple integer type that can help customize actions per query.
type QueryType int

QueryType is a simple type definition that can help customize actions per query.

const (
	// for now only SearchQuery is needed
	// (iota + 1 makes SearchQuery equal to 1, so the zero value of QueryType
	// matches no declared constant)
	SearchQuery QueryType = iota + 1
)

type Repository added in v0.17.0

// Repository bundles the finders for scenes, galleries, tags, performers,
// movies and studios. A Repository value is passed to NewCache.
type Repository struct {
	SceneFinder     scene.IDFinder
	GalleryFinder   GalleryFinder
	TagFinder       TagFinder
	PerformerFinder PerformerFinder
	MovieFinder     match.MovieNamesFinder
	StudioFinder    StudioFinder
}

type ScrapeContentType added in v0.17.0

// ScrapeContentType is the type of the content a scraper generates.
type ScrapeContentType string

ScrapeContentType is the type of the content a scraper generates.

// Declared ScrapeContentType values: one each for galleries, movies,
// performers and scenes.
const (
	ScrapeContentTypeGallery   ScrapeContentType = "GALLERY"
	ScrapeContentTypeMovie     ScrapeContentType = "MOVIE"
	ScrapeContentTypePerformer ScrapeContentType = "PERFORMER"
	ScrapeContentTypeScene     ScrapeContentType = "SCENE"
)

func (ScrapeContentType) IsValid added in v0.17.0

func (e ScrapeContentType) IsValid() bool

func (ScrapeContentType) MarshalGQL added in v0.17.0

func (e ScrapeContentType) MarshalGQL(w io.Writer)

func (ScrapeContentType) String added in v0.17.0

func (e ScrapeContentType) String() string

func (*ScrapeContentType) UnmarshalGQL added in v0.17.0

func (e *ScrapeContentType) UnmarshalGQL(v interface{}) error

type ScrapeType added in v0.17.0

// ScrapeType identifies what a scrape starts from: a text query, an existing
// object, or a URL. It is the element type of ScraperSpec.SupportedScrapes.
type ScrapeType string
const (
	// From text query
	ScrapeTypeName ScrapeType = "NAME"
	// From existing object
	ScrapeTypeFragment ScrapeType = "FRAGMENT"
	// From URL
	ScrapeTypeURL ScrapeType = "URL"
)

func (ScrapeType) IsValid added in v0.17.0

func (e ScrapeType) IsValid() bool

func (ScrapeType) MarshalGQL added in v0.17.0

func (e ScrapeType) MarshalGQL(w io.Writer)

func (ScrapeType) String added in v0.17.0

func (e ScrapeType) String() string

func (*ScrapeType) UnmarshalGQL added in v0.17.0

func (e *ScrapeType) UnmarshalGQL(v interface{}) error

type ScrapedContent added in v0.17.0

// ScrapedContent is the union over the different kinds of scraped results.
// Its only method, IsScrapedContent, takes no arguments and returns nothing;
// ScrapedGallery and ScrapedScene both declare it.
type ScrapedContent interface {
	IsScrapedContent()
}

ScrapedContent is the union type over the content produced by the different scrapers.

type ScrapedGallery added in v0.17.0

// ScrapedGallery holds the gallery fields produced by a scrape: title, details,
// URL and date as string pointers, plus the scraped studio, tags and performers.
// It declares IsScrapedContent and so satisfies ScrapedContent.
type ScrapedGallery struct {
	Title      *string                    `json:"title"`
	Details    *string                    `json:"details"`
	URL        *string                    `json:"url"`
	Date       *string                    `json:"date"`
	Studio     *models.ScrapedStudio      `json:"studio"`
	Tags       []*models.ScrapedTag       `json:"tags"`
	Performers []*models.ScrapedPerformer `json:"performers"`
}

func (ScrapedGallery) IsScrapedContent added in v0.17.0

func (ScrapedGallery) IsScrapedContent()

type ScrapedGalleryInput added in v0.17.0

// ScrapedGalleryInput holds the gallery fields (title, details, URL, date)
// supplied as scrape input; it is the type of Input.Gallery.
// Every field is a string pointer.
type ScrapedGalleryInput struct {
	Title   *string `json:"title"`
	Details *string `json:"details"`
	URL     *string `json:"url"`
	Date    *string `json:"date"`
}

type ScrapedMovieInput added in v0.17.0

// ScrapedMovieInput holds movie fields as string pointers, serialized under
// the JSON names given in the struct tags.
// NOTE(review): presumably the movie counterpart of the other *Input types,
// but Input has no movie field in this view — confirm where it is consumed.
type ScrapedMovieInput struct {
	Name     *string `json:"name"`
	Aliases  *string `json:"aliases"`
	Duration *string `json:"duration"`
	Date     *string `json:"date"`
	Rating   *string `json:"rating"`
	Director *string `json:"director"`
	URL      *string `json:"url"`
	Synopsis *string `json:"synopsis"`
}

type ScrapedPerformerInput added in v0.17.0

// ScrapedPerformerInput holds the performer fields supplied as scrape input;
// it is the type of Input.Performer. Every field is a string pointer.
type ScrapedPerformerInput struct {
	// Set if performer matched
	StoredID       *string `json:"stored_id"`
	Name           *string `json:"name"`
	Disambiguation *string `json:"disambiguation"`
	Gender         *string `json:"gender"`
	URL            *string `json:"url"`
	Twitter        *string `json:"twitter"`
	Instagram      *string `json:"instagram"`
	Birthdate      *string `json:"birthdate"`
	Ethnicity      *string `json:"ethnicity"`
	Country        *string `json:"country"`
	EyeColor       *string `json:"eye_color"`
	Height         *string `json:"height"`
	Measurements   *string `json:"measurements"`
	FakeTits       *string `json:"fake_tits"`
	CareerLength   *string `json:"career_length"`
	Tattoos        *string `json:"tattoos"`
	Piercings      *string `json:"piercings"`
	Aliases        *string `json:"aliases"`
	Details        *string `json:"details"`
	DeathDate      *string `json:"death_date"`
	HairColor      *string `json:"hair_color"`
	Weight         *string `json:"weight"`
	RemoteSiteID   *string `json:"remote_site_id"`
}

type ScrapedScene added in v0.17.0

// ScrapedScene holds the scene fields produced by a scrape, including the
// scraped studio, tags, performers, movies and fingerprints.
// It declares IsScrapedContent and so satisfies ScrapedContent.
type ScrapedScene struct {
	Title    *string `json:"title"`
	Code     *string `json:"code"`
	Details  *string `json:"details"`
	Director *string `json:"director"`
	URL      *string `json:"url"`
	Date     *string `json:"date"`
	// This should be a base64 encoded data URL
	Image        *string                       `json:"image"`
	File         *models.SceneFileType         `json:"file"`
	Studio       *models.ScrapedStudio         `json:"studio"`
	Tags         []*models.ScrapedTag          `json:"tags"`
	Performers   []*models.ScrapedPerformer    `json:"performers"`
	Movies       []*models.ScrapedMovie        `json:"movies"`
	RemoteSiteID *string                       `json:"remote_site_id"`
	Duration     *int                          `json:"duration"`
	Fingerprints []*models.StashBoxFingerprint `json:"fingerprints"`
}

func (ScrapedScene) IsScrapedContent added in v0.17.0

func (ScrapedScene) IsScrapedContent()

type ScrapedSceneInput added in v0.17.0

// ScrapedSceneInput holds the scene fields supplied as scrape input; it is the
// type of Input.Scene. Every field is a string pointer.
type ScrapedSceneInput struct {
	Title        *string `json:"title"`
	Code         *string `json:"code"`
	Details      *string `json:"details"`
	Director     *string `json:"director"`
	URL          *string `json:"url"`
	Date         *string `json:"date"`
	RemoteSiteID *string `json:"remote_site_id"`
}

type Scraper added in v0.17.0

// Scraper describes a single scraper: its ID and name, plus one ScraperSpec
// pointer per content type (performer, scene, gallery, movie).
// NOTE(review): a nil spec presumably means that content type is not
// supported by the scraper — confirm against the code that populates it.
type Scraper struct {
	ID   string `json:"id"`
	Name string `json:"name"`
	// Details for performer scraper
	Performer *ScraperSpec `json:"performer"`
	// Details for scene scraper
	Scene *ScraperSpec `json:"scene"`
	// Details for gallery scraper
	Gallery *ScraperSpec `json:"gallery"`
	// Details for movie scraper
	Movie *ScraperSpec `json:"movie"`
}

type ScraperSpec added in v0.17.0

// ScraperSpec holds the scraping details for one content type of a Scraper:
// the URLs it can handle and the scrape types (see ScrapeType) it supports.
type ScraperSpec struct {
	// URLs matching these entries can be scraped with this scraper spec
	Urls             []string     `json:"urls"`
	SupportedScrapes []ScrapeType `json:"supported_scrapes"`
}

type Source added in v0.17.0

// Source selects what to scrape with: either a stash-box instance (by index
// or endpoint) or a scraper (by ID). Per the field comments, stash_box_index
// and scraper_id should not both be set.
type Source struct {
	// Index of the configured stash-box instance to use. Should be unset if scraper_id is set
	StashBoxIndex *int `json:"stash_box_index"`
	// Stash-box endpoint
	StashBoxEndpoint *string `json:"stash_box_endpoint"`
	// Scraper ID to scrape with. Should be unset if stash_box_index is set
	ScraperID *string `json:"scraper_id"`
}

type StudioFinder added in v0.17.0

// StudioFinder combines match.StudioAutoTagQueryer and match.StudioFinder.
// It is the type of Repository.StudioFinder.
type StudioFinder interface {
	match.StudioAutoTagQueryer
	match.StudioFinder
}

type TagFinder added in v0.17.0

// TagFinder combines match.TagAutoTagQueryer and tag.Queryer.
// It is the type of Repository.TagFinder.
type TagFinder interface {
	match.TagAutoTagQueryer
	tag.Queryer
}

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL