Documentation
¶
Index ¶
- func IsFinal(url string) bool
- func IsInSlice(search string, array []string) bool
- func PrintUsage()
- func RemoveLastSlash(url string) string
- type Config
- type Console
- type HttpGet
- type Links
- type Page
- type Scraper
- func (s *Scraper) Close()
- func (s *Scraper) DoesLinkExist(newLink Links, existingLinks []Links) (exists bool)
- func (s *Scraper) DownloadAttachments()
- func (s *Scraper) GetInsideAttachments(link string) (attachments []string, err error)
- func (s *Scraper) GetLastFolder(path string) string
- func (s *Scraper) GetPath(url string) (path string)
- func (s *Scraper) HasRenderedExtension(link string) bool
- func (s *Scraper) IsInternLink(link string) bool
- func (s *Scraper) IsLinkScanned(link string, scanned []string) (exists bool)
- func (s *Scraper) IsStart(link string) bool
- func (s *Scraper) IsURLInSlice(search string, array []string) bool
- func (s *Scraper) IsValidAttachment(link string) bool
- func (s *Scraper) IsValidExtension(link string) bool
- func (s *Scraper) IsValidLink(link string) (ok bool)
- func (s *Scraper) IsValidSite(link string) bool
- func (s *Scraper) PreparePathsFile(url string) (folder, filename string)
- func (s *Scraper) PreparePathsPage(url string) (folder, filename string)
- func (s *Scraper) RemoveDomain(link string) string
- func (s *Scraper) RemoveTrailingSlash(link string) string
- func (s *Scraper) Run()
- func (s *Scraper) SanitizeURL(link string) string
- func (s *Scraper) SaveAttachment(url string) (err error)
- func (s *Scraper) SaveHTML(url string, html string) (err error)
- func (s *Scraper) Scrape()
- func (s *Scraper) TakeLinks(link string)
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func RemoveLastSlash ¶
RemoveLastSlash removes the last slash
Types ¶
type Config ¶
type Config struct { // Original domain OldDomain string `long:"u" short:"u"` // New domain to rewrite the download HTML sites NewDomain string `long:"new" short:"new"` // URL prefixes/roots that should be included in the scraper IncludedURLs string `long:"r" short:"r"` // Roots contains a range of URLs that can be considered the root // This is useful for scraping sites where content is hosted on a CDN // Not a flag. This will be filled by the scraper upon setup Roots []string // Path where to save the downloads DownloadPath string `long:"path" short:"path"` // Use args on URLs UseQueries bool `long:"q" short:"q"` // Number of concurrent queries Simultaneous int `long:"s" short:"s"` }
Config holds the scraper configuration
func ParseFlags ¶
ParseFlags parses command line arguments and validates them
type Console ¶
type Console interface { AddDomain(string) AddStatus(string) AddStarted() AddFinished() AddAttachments() AddDownloaded() AddDownloading() AddErrors(string) }
Console interface
type HttpGet ¶
type HttpGet interface { ParseURL(baseURLString, relativeURLString string) (final string, err error) Get(link string) (final string, status int, buff *bytes.Buffer, err error) }
HttpGet interface
type Scraper ¶
type Scraper struct { // Original domain OldDomain string // New domain to rewrite the download HTML sites NewDomain string // Roots contains a range of URLs that can be considered the root // This is useful for scraping sites where content is hosted on a CDN Roots []string // Path where to save the downloads DownloadPath string // Use args on URLs UseQueries bool // Number of concurrent queries Simultaneous int // Scanning now Scanning chan int // New links found NewLinks chan []Links // Pages to save Pages chan Page // Attachments found Attachments chan []string // Started Started chan int // Finished Finished chan int // Indexed pages Indexed []string // Pages for sitemap ForSitemap []string // Files to download Files []string // Seen links Seen map[string]bool // Start time StartTime time.Time // GetInterface Get HttpGet // Console Con Console }
func (*Scraper) DoesLinkExist ¶
DoesLinkExist checks if a link exists in a given slice
func (*Scraper) DownloadAttachments ¶
func (s *Scraper) DownloadAttachments()
DownloadAttachments downloads the attachments
func (*Scraper) GetInsideAttachments ¶
GetInsideAttachments gets the attachments referenced inside CSS and JS files
func (*Scraper) GetLastFolder ¶
GetLastFolder returns the last folder of a path
func (*Scraper) HasRenderedExtension ¶
HasRenderedExtension checks if the link has a rendered extension
func (*Scraper) IsInternLink ¶
IsInternLink checks if a link is internal
func (*Scraper) IsLinkScanned ¶
IsLinkScanned checks if a link has already been scanned
func (*Scraper) IsURLInSlice ¶
IsURLInSlice checks if a URL is in a slice
func (*Scraper) IsValidAttachment ¶
IsValidAttachment checks if the link is a valid extension, not a site
func (*Scraper) IsValidExtension ¶
IsValidExtension checks if an extension is valid
func (*Scraper) IsValidLink ¶
IsValidLink checks if the link is a valid url and from the domain
func (*Scraper) IsValidSite ¶
IsValidSite checks if the link is a site and not an attachment
func (*Scraper) PreparePathsFile ¶
PreparePathsFile prepares the folder and filename for a given URL, assuming it's a file
func (*Scraper) PreparePathsPage ¶
PreparePathsPage prepares the folder and filename for a given URL, assuming it's a page
func (*Scraper) RemoveDomain ¶
RemoveDomain returns only the path, without domain, from the given link
func (*Scraper) RemoveTrailingSlash ¶
RemoveTrailingSlash removes the trailing slash from a link
func (*Scraper) SanitizeURL ¶
SanitizeURL sanitizes a URL
func (*Scraper) SaveAttachment ¶
SaveAttachment downloads a single link as an attachment