Documentation ¶
Index ¶
- func NormaliseCharset(characterSet string) string
- func OpenGraphResolver(doc *goquery.Document) string
- func ReadLinesOfFile(filename string) []string
- func UTF8encode(raw string, sourceCharset string) string
- func WebPageImageResolver(doc *goquery.Document) ([]candidate, int)
- func WebPageResolver(article *Article) string
- type Article
- type Cleaner
- type Configuration
- type ContentExtractor
- func (extr *ContentExtractor) CalculateBestNode(document *goquery.Document) *goquery.Selection
- func (extr *ContentExtractor) GetCanonicalLink(document *goquery.Document) string
- func (extr *ContentExtractor) GetCleanTextAndLinks(topNode *goquery.Selection, lang string) (string, []string)
- func (extr *ContentExtractor) GetDomain(canonicalLink string) string
- func (extr *ContentExtractor) GetFavicon(document *goquery.Document, url string) string
- func (extr *ContentExtractor) GetMetaAuthor(document *goquery.Document) string
- func (extr *ContentExtractor) GetMetaContent(document *goquery.Document, metaName string) string
- func (extr *ContentExtractor) GetMetaContentLocation(document *goquery.Document) string
- func (extr *ContentExtractor) GetMetaContentWithSelector(document *goquery.Document, selector string) string
- func (extr *ContentExtractor) GetMetaContents(document *goquery.Document, metaNames *set.Set) map[string]string
- func (extr *ContentExtractor) GetMetaDescription(document *goquery.Document) string
- func (extr *ContentExtractor) GetMetaKeywords(document *goquery.Document) string
- func (extr *ContentExtractor) GetMetaLanguage(document *goquery.Document) string
- func (extr *ContentExtractor) GetMetaOgDescription(document *goquery.Document) string
- func (extr *ContentExtractor) GetMetaOgImage(document *goquery.Document) string
- func (extr *ContentExtractor) GetMetaOgType(document *goquery.Document) string
- func (extr *ContentExtractor) GetPublishDate(document *goquery.Document) *time.Time
- func (extr *ContentExtractor) GetTags(document *goquery.Document) *set.Set
- func (extr *ContentExtractor) GetTitle(document *goquery.Document) string
- func (extr *ContentExtractor) PostCleanup(targetNode *goquery.Selection) *goquery.Selection
- type Crawler
- type Goose
- type Parser
- type StopWords
- type VideoExtractor
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func NormaliseCharset ¶
NormaliseCharset overrides/fixes charset names to something we can parse. Fixes common misspellings and uses a canonical name for equivalent encodings. @see https://encoding.spec.whatwg.org#names-and-labels
func OpenGraphResolver ¶
OpenGraphResolver returns OpenGraph properties
func ReadLinesOfFile ¶
ReadLinesOfFile returns the lines from a file as a slice of strings
func UTF8encode ¶
UTF8encode converts a string from the source character set to UTF-8, skipping invalid byte sequences @see http://stackoverflow.com/questions/32512500/ignore-illegal-bytes-when-decoding-text-with-go
func WebPageImageResolver ¶
WebPageImageResolver fetches all candidate images from the HTML page
func WebPageResolver ¶
WebPageResolver fetches the main image from the HTML page
Types ¶
type Article ¶
type Article struct { Title string `json:"title,omitempty"` CleanedText string `json:"content,omitempty"` MetaDescription string `json:"description,omitempty"` MetaOgDescription string `json:"ogdescription,omitempty"` MetaOgType string `json:"ogtype,omitempty"` MetaOgImage string `json:"ogimage,omitempty"` MetaLang string `json:"lang,omitempty"` MetaFavicon string `json:"favicon,omitempty"` MetaKeywords string `json:"keywords,omitempty"` CanonicalLink string `json:"canonicalurl,omitempty"` Domain string `json:"domain,omitempty"` TopNode *goquery.Selection `json:"-"` TopImage string `json:"image,omitempty"` Tags *set.Set `json:"tags,omitempty"` Movies *set.Set `json:"movies,omitempty"` FinalURL string `json:"url,omitempty"` LinkHash string `json:"linkhash,omitempty"` RawHTML string `json:"rawhtml,omitempty"` Doc *goquery.Document `json:"-"` Links []string `json:"links,omitempty"` PublishDate *time.Time `json:"publishdate,omitempty"` AdditionalData map[string]string `json:"additionaldata,omitempty"` Delta int64 `json:"delta,omitempty"` }
Article is a collection of properties extracted from the HTML body
type Cleaner ¶
type Cleaner struct {
// contains filtered or unexported fields
}
Cleaner removes menus, ads, sidebars, etc. and leaves the main content
func NewCleaner ¶
func NewCleaner(config Configuration) Cleaner
NewCleaner returns a new instance of a Cleaner
type Configuration ¶
type Configuration struct {
// contains filtered or unexported fields
}
Configuration is a wrapper for various config options
func GetDefaultConfiguration ¶
func GetDefaultConfiguration(args ...string) Configuration
GetDefaultConfiguration returns safe default configuration options
type ContentExtractor ¶
type ContentExtractor struct {
// contains filtered or unexported fields
}
ContentExtractor can parse the HTML and fetch various properties
func NewExtractor ¶
func NewExtractor(config Configuration) ContentExtractor
NewExtractor returns a configured HTML parser
func (*ContentExtractor) CalculateBestNode ¶
func (extr *ContentExtractor) CalculateBestNode(document *goquery.Document) *goquery.Selection
CalculateBestNode checks for the HTML node most likely to contain the main content. We start by looking for where the clusters of paragraphs are. We score a cluster based on the number of stopwords and the number of consecutive paragraphs together, which should form the cluster of text that this node is around. We also take into account how high up the paragraphs are: comments are usually at the bottom and should get a lower score.
func (*ContentExtractor) GetCanonicalLink ¶
func (extr *ContentExtractor) GetCanonicalLink(document *goquery.Document) string
GetCanonicalLink returns the meta canonical link set in the source
func (*ContentExtractor) GetCleanTextAndLinks ¶
func (extr *ContentExtractor) GetCleanTextAndLinks(topNode *goquery.Selection, lang string) (string, []string)
GetCleanTextAndLinks parses the main HTML node for text and links
func (*ContentExtractor) GetDomain ¶
func (extr *ContentExtractor) GetDomain(canonicalLink string) string
GetDomain extracts the domain from a link
func (*ContentExtractor) GetFavicon ¶
func (extr *ContentExtractor) GetFavicon(document *goquery.Document, url string) string
GetFavicon returns the favicon set in the source, if the article has one
func (*ContentExtractor) GetMetaAuthor ¶
func (extr *ContentExtractor) GetMetaAuthor(document *goquery.Document) string
GetMetaAuthor returns the meta author set in the source, if the article has one
func (*ContentExtractor) GetMetaContent ¶
func (extr *ContentExtractor) GetMetaContent(document *goquery.Document, metaName string) string
GetMetaContent returns the content attribute of meta tag with the given property name
func (*ContentExtractor) GetMetaContentLocation ¶
func (extr *ContentExtractor) GetMetaContentLocation(document *goquery.Document) string
GetMetaContentLocation returns the meta content location set in the source, if the article has one
func (*ContentExtractor) GetMetaContentWithSelector ¶
func (extr *ContentExtractor) GetMetaContentWithSelector(document *goquery.Document, selector string) string
GetMetaContentWithSelector returns the content attribute of meta tag matching the selector
func (*ContentExtractor) GetMetaContents ¶
func (extr *ContentExtractor) GetMetaContents(document *goquery.Document, metaNames *set.Set) map[string]string
GetMetaContents returns all the meta tags as name->content pairs
func (*ContentExtractor) GetMetaDescription ¶
func (extr *ContentExtractor) GetMetaDescription(document *goquery.Document) string
GetMetaDescription returns the meta description set in the source, if the article has one
func (*ContentExtractor) GetMetaKeywords ¶
func (extr *ContentExtractor) GetMetaKeywords(document *goquery.Document) string
GetMetaKeywords returns the meta keywords set in the source, if the article has them
func (*ContentExtractor) GetMetaLanguage ¶
func (extr *ContentExtractor) GetMetaLanguage(document *goquery.Document) string
GetMetaLanguage returns the meta language set in the source, if the article has one
func (*ContentExtractor) GetMetaOgDescription ¶
func (extr *ContentExtractor) GetMetaOgDescription(document *goquery.Document) string
func (*ContentExtractor) GetMetaOgImage ¶
func (extr *ContentExtractor) GetMetaOgImage(document *goquery.Document) string
func (*ContentExtractor) GetMetaOgType ¶
func (extr *ContentExtractor) GetMetaOgType(document *goquery.Document) string
func (*ContentExtractor) GetPublishDate ¶
func (extr *ContentExtractor) GetPublishDate(document *goquery.Document) *time.Time
GetPublishDate returns the publication date, if one can be located.
func (*ContentExtractor) GetTags ¶
func (extr *ContentExtractor) GetTags(document *goquery.Document) *set.Set
GetTags returns the tags set in the source, if the article has them
func (*ContentExtractor) GetTitle ¶
func (extr *ContentExtractor) GetTitle(document *goquery.Document) string
GetTitle returns the title set in the source, if the article has one
func (*ContentExtractor) PostCleanup ¶
func (extr *ContentExtractor) PostCleanup(targetNode *goquery.Selection) *goquery.Selection
PostCleanup removes any divs that look like non-content, clusters of links, or paras with no gusto
type Crawler ¶
Crawler can fetch the target HTML page
func NewCrawler ¶
func NewCrawler(config Configuration, url string, RawHTML string) Crawler
NewCrawler returns a crawler object initialised with the URL and the [optional] raw HTML body
func (Crawler) GetCharset ¶
GetCharset returns a normalised charset string extracted from the meta tags
func (Crawler) GetContentType ¶
GetContentType returns the Content-Type string extracted from the meta tags
func (*Crawler) Preprocess ¶
Preprocess fetches the HTML page if needed, converts it to UTF-8 and applies some text normalisation to guarantee better results when extracting the content
func (*Crawler) SetCharset ¶
SetCharset can be used to force a charset (e.g. when read from the HTTP headers) rather than relying on the detection from the HTML meta tags
type Goose ¶
type Goose struct {
// contains filtered or unexported fields
}
Goose is the main entry point of the program
func (Goose) ExtractFromRawHTML ¶
ExtractFromRawHTML returns an article object from the raw HTML content
type Parser ¶
type Parser struct{}
Parser is an HTML parser specialised in extraction of main content and other properties
type StopWords ¶
type StopWords struct {
// contains filtered or unexported fields
}
StopWords implements a simple language detector
func NewStopwords ¶
func NewStopwords() StopWords
NewStopwords returns an instance of a stop words detector
func (StopWords) SimpleLanguageDetector ¶
SimpleLanguageDetector returns the language code for the text, based on its stop words
type VideoExtractor ¶
type VideoExtractor struct {
// contains filtered or unexported fields
}
VideoExtractor can extract the main video from an HTML page
func NewVideoExtractor ¶
func NewVideoExtractor() VideoExtractor
NewVideoExtractor returns a new instance of an HTML video extractor