Documentation ¶
Index ¶
- Constants
- Variables
- func GetDefualtConfiguration(args ...string) configuration
- func NewCleaner(config configuration) cleaner
- func NewExtractor(config configuration) contentExtractor
- func NewParser() *parser
- func OpenGraphResolver(article *Article) string
- func ReadLinesOfFile(filename string) []string
- func RegSplit(text string, reg *regexp.Regexp) []string
- func TimeInMilliseconds() int64
- func TimeInNanoseconds() int64
- func WebPageResolver(article *Article) string
- type Article
- type Crawler
- type Goose
- type Helper
- type StopWords
- type VideoExtractor
Constants ¶
View Source
const DEFAULT_LANGUAGE = "en"
Variables ¶
View Source
var ARROWS_SPLITTER = regexp.MustCompile("»")
View Source
var A_HREF_TAG_SELECTOR = [...]string{"/tag/", "/tags/", "/topic/", "?keyword"}
View Source
var A_REL_TAG_SELECTOR = "a[rel=tag]"
View Source
var CAPTIONS_RE = regexp.MustCompile("^caption$")
View Source
var COLON_SPLITTER = regexp.MustCompile(":")
View Source
var DASH_SPLITTER = regexp.MustCompile(" - ")
View Source
var ESCAPED_FRAGMENT_REPLACEMENT = regexp.MustCompile("#!")
View Source
var FACEBOOK_BROADCASTING_RE = regexp.MustCompile("facebook-broadcasting")
View Source
var FACEBOOK_RE = regexp.MustCompile("[^-]facebook")
View Source
var GOOGLE_RE = regexp.MustCompile(" google ")
View Source
var MORE_RE = regexp.MustCompile("^[^entry-]more.*$")
View Source
var MOTLEY_REPLACEMENT = "&#65533;"
View Source
var PIPE_SPLITTER = regexp.MustCompile("\\|")
View Source
var PUNCTUATION = regexp.MustCompile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]")
View Source
var REMOVENODES_RE = regexp.MustCompile("" +
"PopularQuestions|" +
"[Cc]omentario|" +
"[Ff]ooter|" +
"^fn$|" +
"^inset$|" +
"^print$|" +
"^scroll$|" +
"^side$|" +
"^side_|" +
"^widget$|" +
"ajoutVideo|" +
"articleheadings|" +
"author-dropdown|" +
"blog-pager|" +
"breadcrumbs|" +
"byline|" +
"cabecalho|" +
"cnnStryHghLght|" +
"cnn_html_slideshow|" +
"cnn_strycaptiontxt|" +
"cnn_strylftcntnt|" +
"cnn_stryspcvbx|" +
"combx|" +
"comment|" +
"communitypromo|" +
"contact|" +
"contentTools2|" +
"controls|" +
"^date$|" +
"detail_new_|" +
"detail_related_|" +
"figcaption|" +
"footnote|" +
"foot|" +
"header|" +
"img_popup_single|" +
"js_replies|" +
"[Kk]ona[Ff]ilter|" +
"leading|" +
"legende|" +
"links|" +
"mediaarticlerelated|" +
"menucontainer|" +
"meta$|" +
"navbar|" +
"pagetools|" +
"popup|" +
"post-attributes|" +
"post-title|" +
"relacionado|" +
"retweet|" +
"runaroundLeft|" +
"shoutbox|" +
"site_nav|" +
"socialNetworking|" +
"social_|" +
"socialnetworking|" +
"socialtools|" +
"sponsor|" +
"sub_nav|" +
"subscribe|" +
"tag_|" +
"tags|" +
"the_answers|" +
"timestamp|" +
"tools|" +
"vcard|" +
"welcome_form|" +
"wp-caption-text")
View Source
var RE_LANG = "^[A-Za-z]{2}$"
View Source
var SPACE_SPLITTER = regexp.MustCompile(" ")
View Source
var TITLE_REPLACEMENTS = regexp.MustCompile("»")
View Source
var TWITTER_RE = regexp.MustCompile("[^-]twitter")
Functions ¶
func GetDefualtConfiguration ¶
func GetDefualtConfiguration(args ...string) configuration
func NewCleaner ¶
func NewCleaner(config configuration) cleaner
func NewExtractor ¶
func NewExtractor(config configuration) contentExtractor
func OpenGraphResolver ¶
func OpenGraphResolver(article *Article) string
func ReadLinesOfFile ¶
func ReadLinesOfFile(filename string) []string
func TimeInMilliseconds ¶
func TimeInMilliseconds() int64
func TimeInNanoseconds ¶
func TimeInNanoseconds() int64
func WebPageResolver ¶
func WebPageResolver(article *Article) string
Types ¶
type Article ¶
type Article struct {
	Title           string
	CleanedText     string
	MetaDescription string
	MetaLang        string
	MetaFavicon     string
	MetaKeywords    string
	CanonicalLink   string
	Domain          string
	TopNode         *goquery.Selection
	TopImage        string
	Tags            *set.Set
	Movies          *set.Set
	FinalUrl        string
	LinkHash        string
	RawHtml         string
	Doc             *goquery.Document //raw_doc
	PublishDate     string
	AdditionalData  map[string]string
	Delta           int64
}
type Goose ¶
type Goose struct {
// contains filtered or unexported fields
}
func (Goose) ExtractFromRawHtml ¶
func (Goose) ExtractFromUrl ¶
type Helper ¶
type Helper struct {
// contains filtered or unexported fields
}
func NewRawHelper ¶
func NewUrlHelper ¶
type StopWords ¶
type StopWords struct {
// contains filtered or unexported fields
}
func NewStopwords ¶
func NewStopwords() StopWords
func (StopWords) SimpleLanguageDetector ¶
type VideoExtractor ¶
type VideoExtractor struct {
// contains filtered or unexported fields
}
func NewVideoExtractor ¶
func NewVideoExtractor() VideoExtractor
func (*VideoExtractor) GetVideos ¶
func (ve *VideoExtractor) GetVideos(article *Article) *set.Set
Click to show internal directories.
Click to hide internal directories.