Documentation ¶
Index ¶
- Constants
- Variables
- func GetDefualtConfiguration(args ...string) configuration
- func NewCleaner(config configuration) cleaner
- func NewExtractor(config configuration) contentExtractor
- func NewParser() *parser
- func OpenGraphResolver(article *Article) string
- func ReadLinesOfFile(filename string) []string
- func RegSplit(text string, reg *regexp.Regexp) []string
- func TimeInMilliseconds() int64
- func TimeInNanoseconds() int64
- func WebPageResolver(article *Article) string
- type Article
- type Crawler
- type Goose
- type Helper
- type StopWords
- type VideoExtractor
Constants ¶
View Source
const DEFAULT_LANGUAGE = "en"
Variables ¶
View Source
var ARROWS_SPLITTER = regexp.MustCompile("»")
View Source
var A_HREF_TAG_SELECTOR = [...]string{"/tag/", "/tags/", "/topic/", "?keyword"}
View Source
var A_REL_TAG_SELECTOR = "a[rel=tag]"
View Source
var CAPTIONS_RE = regexp.MustCompile("^caption$")
View Source
var COLON_SPLITTER = regexp.MustCompile(":")
View Source
var DASH_SPLITTER = regexp.MustCompile(" - ")
View Source
var ESCAPED_FRAGMENT_REPLACEMENT = regexp.MustCompile("#!")
View Source
var FACEBOOK_BROADCASTING_RE = regexp.MustCompile("facebook-broadcasting")
View Source
var FACEBOOK_RE = regexp.MustCompile("[^-]facebook")
View Source
var GOOGLE_RE = regexp.MustCompile(" google ")
View Source
var MORE_RE = regexp.MustCompile("^[^entry-]more.*$")
View Source
var MOTLEY_REPLACEMENT = "&#65533;"
View Source
var PIPE_SPLITTER = regexp.MustCompile("\\|")
View Source
var PUNCTUATION = regexp.MustCompile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]")
View Source
var REMOVENODES_RE = regexp.MustCompile("^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt|links|meta$|scroll|shoutbox|sponsor|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text|legende|ajoutVideo|timestamp|js_replies")
View Source
var RE_LANG = "^[A-Za-z]{2}$"
View Source
var SPACE_SPLITTER = regexp.MustCompile(" ")
View Source
var TITLE_REPLACEMENTS = regexp.MustCompile("»")
View Source
var TWITTER_RE = regexp.MustCompile("[^-]twitter")
Functions ¶
func GetDefualtConfiguration ¶
func GetDefualtConfiguration(args ...string) configuration
func NewCleaner ¶
func NewCleaner(config configuration) cleaner
func NewExtractor ¶
func NewExtractor(config configuration) contentExtractor
func OpenGraphResolver ¶
func OpenGraphResolver(article *Article) string
func ReadLinesOfFile ¶
func ReadLinesOfFile(filename string) []string
func TimeInMilliseconds ¶
func TimeInMilliseconds() int64
func TimeInNanoseconds ¶
func TimeInNanoseconds() int64
func WebPageResolver ¶
func WebPageResolver(article *Article) string
Types ¶
type Article ¶
type Article struct {
	Title           string
	CleanedText     string
	MetaDescription string
	MetaLang        string
	MetaFavicon     string
	MetaKeywords    string
	CanonicalLink   string
	Domain          string
	TopNode         *goquery.Selection
	TopImage        string
	Tags            *set.Set
	Movies          *set.Set
	FinalUrl        string
	LinkHash        string
	RawHtml         string
	Doc             *goquery.Document //raw_doc
	PublishDate     string
	AdditionalData  map[string]string
	Delta           int64
}
type Goose ¶
type Goose struct {
// contains filtered or unexported fields
}
func (Goose) ExtractFromRawHtml ¶
func (Goose) ExtractFromUrl ¶
type Helper ¶
type Helper struct {
// contains filtered or unexported fields
}
func NewRawHelper ¶
func NewUrlHelper ¶
type StopWords ¶
type StopWords struct {
// contains filtered or unexported fields
}
func NewStopwords ¶
func NewStopwords() StopWords
func (StopWords) SimpleLanguageDetector ¶
type VideoExtractor ¶
type VideoExtractor struct {
// contains filtered or unexported fields
}
func NewVideoExtractor ¶
func NewVideoExtractor() VideoExtractor
func (*VideoExtractor) GetVideos ¶
func (ve *VideoExtractor) GetVideos(article *Article) *set.Set
Click to show internal directories.
Click to hide internal directories.