Documentation
¶
Index ¶
- func IsFinal(url string) bool
- func IsInSlice(search string, array []string) bool
- func PrintUsage()
- func RemoveLastSlash(url string) string
- type Config
- type Console
- type HttpGet
- type Links
- type Page
- type Scraper
- func (s *Scraper) Close()
- func (s *Scraper) DoesLinkExist(newLink Links, existingLinks []Links) (exists bool)
- func (s *Scraper) DownloadAttachments()
- func (s *Scraper) GetInsideAttachments(link string) (attachments []string, err error)
- func (s *Scraper) GetLastFolder(path string) string
- func (s *Scraper) GetPath(url string) (path string)
- func (s *Scraper) HasRenderedExtension(link string) bool
- func (s *Scraper) IsInternLink(link string) bool
- func (s *Scraper) IsLinkScanned(link string, scanned []string) (exists bool)
- func (s *Scraper) IsStart(link string) bool
- func (s *Scraper) IsURLInSlice(search string, array []string) bool
- func (s *Scraper) IsValidAttachment(link string) bool
- func (s *Scraper) IsValidExtension(link string) bool
- func (s *Scraper) IsValidLink(link string) (ok bool)
- func (s *Scraper) IsValidSite(link string) bool
- func (s *Scraper) PreparePathsFile(url string) (folder, filename string)
- func (s *Scraper) PreparePathsPage(url string) (folder, filename string)
- func (s *Scraper) RemoveDomain(link string) string
- func (s *Scraper) RemoveTrailingSlash(link string) string
- func (s *Scraper) Run()
- func (s *Scraper) SanitizeURL(link string) string
- func (s *Scraper) SaveAttachment(url string) (err error)
- func (s *Scraper) SaveHTML(url string, html string) (err error)
- func (s *Scraper) Scrape()
- func (s *Scraper) TakeLinks(link string)
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func RemoveLastSlash ¶
RemoveLastSlash removes the last slash
Types ¶
type Config ¶
type Config struct { // Original domain OldDomain string `long:"u" short:"u"` // New domain to rewrite the download HTML sites NewDomain string `long:"new" short:"new"` // URL prefixes/roots that should be included in the scraper IncludedURLs string `long:"r" short:"r"` // Roots contains a range of URLs that can be considered the root // This is useful for scraping sites where content is hosted on a CDN // Not a flag. This will be filled by the scraper upon setup Roots []string // Path where to save the downloads DownloadPath string `long:"path" short:"path"` // Use args on URLs UseQueries bool `long:"q" short:"q"` // Number of concurrent queries Simultaneous int `long:"s" short:"s"` }
Config holds the scraper configuration
func ParseFlags ¶
ParseFlags parses command line arguments and validates them
type Console ¶
type Console interface { AddDomain(string) AddStatus(string) AddStarted() AddFinished() AddAttachments() AddDownloaded() AddDownloading() AddErrors(string) }
Console interface
type HttpGet ¶
type HttpGet interface { ParseURL(baseURLString, relativeURLString string) (final string, err error) Get(link string) (final string, status int, buff *bytes.Buffer, err error) }
HttpGet interface
type Scraper ¶
type Scraper struct { // Original domain OldDomain string // New domain to rewrite the download HTML sites NewDomain string // Roots contains a range of URLs that can be considered the root // This is useful for scraping sites where content is hosted on a CDN Roots []string // Path where to save the downloads DownloadPath string // Use args on URLs UseQueries bool // Number of concurrent queries Simultaneous int // Scanning now Scanning chan int // New links found NewLinks chan []Links // Pages to save Pages chan Page // Attachments found Attachments chan []string // Started Started chan int // Finished Finished chan int // Indexed pages Indexed []string // Pages for sitemap ForSitemap []string // Files to download Files []string // Seen links Seen map[string]bool // Start time StartTime time.Time // GetInterface Get HttpGet // Console Con Console }
func (*Scraper) DoesLinkExist ¶
DoesLinkExist checks if a link exists in a given slice
func (*Scraper) DownloadAttachments ¶
func (s *Scraper) DownloadAttachments()
DownloadAttachments downloads the attachments
func (*Scraper) GetInsideAttachments ¶
GetInsideAttachments gets the attachments referenced inside CSS and JS files
func (*Scraper) GetLastFolder ¶
GetLastFolder returns the last folder of a path
func (*Scraper) HasRenderedExtension ¶
HasRenderedExtension checks if the link has a rendered extension
func (*Scraper) IsInternLink ¶
IsInternLink checks if a link is internal
func (*Scraper) IsLinkScanned ¶
IsLinkScanned checks if a link has already been scanned
func (*Scraper) IsURLInSlice ¶
IsURLInSlice checks if a URL is in a slice
func (*Scraper) IsValidAttachment ¶
IsValidAttachment checks if the link is a valid extension, not a site
func (*Scraper) IsValidExtension ¶
IsValidExtension checks if an extension is valid
func (*Scraper) IsValidLink ¶
IsValidLink checks if the link is a valid url and from the domain
func (*Scraper) IsValidSite ¶
IsValidSite checks if the link is a site and not an attachment
func (*Scraper) PreparePathsFile ¶
PreparePathsFile prepares the folder and filename for a given URL, assuming it's a file
func (*Scraper) PreparePathsPage ¶
PreparePathsPage prepares the folder and filename for a given URL, assuming it's a page
func (*Scraper) RemoveDomain ¶
RemoveDomain returns only the path, without domain, from the given link
func (*Scraper) RemoveTrailingSlash ¶
RemoveTrailingSlash removes the trailing slash from a link
func (*Scraper) SanitizeURL ¶
SanitizeURL sanitizes a URL
func (*Scraper) SaveAttachment ¶
SaveAttachment downloads a single link as an attachment