goose

package module

v0.0.0-...-d8d32e0 Latest Latest Go to latest Published: Jan 19, 2015 License: Apache-2.0 Imports: 22 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/bkaradzic/goose

Links

Open Source Insights

README ¶

GoOse

Html Content / Article Extractor in Golang

This is a Golang port of "Goose", originally licensed to Gravity.com under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.

Golang port was written by Antonio Linari

Gravity.com licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

INSTALL

go get github.com/advancedlogic/GoOse

HOW TO USE IT

package main

import (
	"github.com/advancedlogic/GoOse"
)

// main extracts an article from a sample CNN URL and prints a selection of
// the fields found on the returned Article.
func main() {
	// New takes optional string arguments; none are supplied here.
	g := goose.New()
	// ExtractFromUrl returns a *Article for the given URL.
	article := g.ExtractFromUrl("http://edition.cnn.com/2012/07/08/opinion/banzi-ted-open-source/index.html")
	// Print each field of interest as a label/value pair.
	println("title", article.Title)
	println("description", article.MetaDescription)
	println("keywords", article.MetaKeywords)
	println("content", article.CleanedText)
	println("url", article.FinalUrl)
	println("top image", article.TopImage)
}

TODO

better organize code
add comments and godoc
improve "xpath" like queries
add other image extraction techniques (imagemagick)

THANKS TO

@Martin Angers for goquery
@Fatih Arslan for set
GoLang team for the amazing language and net/html

Documentation ¶

Index ¶

Constants
Variables
func GetDefualtConfiguration(args ...string) configuration
func NewCleaner(config configuration) cleaner
func NewExtractor(config configuration) contentExtractor
func NewParser() *parser
func OpenGraphResolver(article *Article) string
func ReadLinesOfFile(filename string) []string
func RegSplit(text string, reg *regexp.Regexp) []string
func TimeInMilliseconds() int64
func TimeInNanoseconds() int64
func WebPageResolver(article *Article) string
type Article
- func (article *Article) ToString() string
type Crawler
- func NewCrawler(config configuration, url string, rawHtml string) Crawler
- func (this Crawler) Crawl() *Article
type Goose
- func New(args ...string) Goose
- func (this Goose) ExtractFromRawHtml(url string, rawHtml string) *Article
- func (this Goose) ExtractFromUrl(url string) *Article
type Helper
- func NewRawHelper(url string, rawHtml string) Helper
- func NewUrlHelper(url string) Helper
type StopWords
- func NewStopwords() StopWords
- func (this StopWords) SimpleLanguageDetector(text string) string
type VideoExtractor
- func NewVideoExtractor() VideoExtractor
- func (ve *VideoExtractor) GetVideos(article *Article) *set.Set

Constants ¶

View Source

const DEFAULT_LANGUAGE = "en"

Variables ¶

View Source

var ARROWS_SPLITTER = regexp.MustCompile("»")

View Source

var A_HREF_TAG_SELECTOR = [...]string{"/tag/", "/tags/", "/topic/", "?keyword"}

View Source

var A_REL_TAG_SELECTOR = "a[rel=tag]"

View Source

var CAPTIONS_RE = regexp.MustCompile("^caption$")

View Source

var COLON_SPLITTER = regexp.MustCompile(":")

View Source

var DASH_SPLITTER = regexp.MustCompile(" - ")

View Source

var ESCAPED_FRAGMENT_REPLACEMENT = regexp.MustCompile("#!")

View Source

var FACEBOOK_BROADCASTING_RE = regexp.MustCompile("facebook-broadcasting")

View Source

var FACEBOOK_RE = regexp.MustCompile("[^-]facebook")

View Source

var GOOGLE_RE = regexp.MustCompile(" google ")

View Source

var MORE_RE = regexp.MustCompile("^[^entry-]more.*$")

View Source

var MOTLEY_REPLACEMENT = "&#65533;"

View Source

var PIPE_SPLITTER = regexp.MustCompile("\\|")

View Source

var PUNCTUATION = regexp.MustCompile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]")

View Source

var REMOVENODES_RE = regexp.MustCompile("" +
	"PopularQuestions|" +
	"[Cc]omentario|" +
	"[Ff]ooter|" +
	"^fn$|" +
	"^inset$|" +
	"^print$|" +
	"^scroll$|" +
	"^side$|" +
	"^side_|" +
	"^widget$|" +
	"ajoutVideo|" +
	"articleheadings|" +
	"author-dropdown|" +
	"blog-pager|" +
	"breadcrumbs|" +
	"byline|" +
	"cabecalho|" +
	"cnnStryHghLght|" +
	"cnn_html_slideshow|" +
	"cnn_strycaptiontxt|" +
	"cnn_strylftcntnt|" +
	"cnn_stryspcvbx|" +
	"combx|" +
	"comment|" +
	"communitypromo|" +
	"contact|" +
	"contentTools2|" +
	"controls|" +
	"^date$|" +
	"detail_new_|" +
	"detail_related_|" +
	"figcaption|" +
	"footnote|" +
	"foot|" +
	"header|" +
	"img_popup_single|" +
	"js_replies|" +
	"[Kk]ona[Ff]ilter|" +
	"leading|" +
	"legende|" +
	"links|" +
	"mediaarticlerelated|" +
	"menucontainer|" +
	"meta$|" +
	"navbar|" +
	"pagetools|" +
	"popup|" +
	"post-attributes|" +
	"post-title|" +
	"relacionado|" +
	"retweet|" +
	"runaroundLeft|" +
	"shoutbox|" +
	"site_nav|" +
	"socialNetworking|" +
	"social_|" +
	"socialnetworking|" +
	"socialtools|" +
	"sponsor|" +
	"sub_nav|" +
	"subscribe|" +
	"tag_|" +
	"tags|" +
	"the_answers|" +
	"timestamp|" +
	"tools|" +
	"vcard|" +
	"welcome_form|" +
	"wp-caption-text")

View Source

var RE_LANG = "^[A-Za-z]{2}$"

View Source

var SPACE_SPLITTER = regexp.MustCompile(" ")

View Source

var TITLE_REPLACEMENTS = regexp.MustCompile("&raquo;")

View Source

var TWITTER_RE = regexp.MustCompile("[^-]twitter")

Functions ¶

func GetDefualtConfiguration ¶

func GetDefualtConfiguration(args ...string) configuration

func NewCleaner ¶

func NewCleaner(config configuration) cleaner

func NewExtractor ¶

func NewExtractor(config configuration) contentExtractor

func NewParser ¶

func NewParser() *parser

func OpenGraphResolver ¶

func OpenGraphResolver(article *Article) string

func ReadLinesOfFile ¶

func ReadLinesOfFile(filename string) []string

func RegSplit ¶

func RegSplit(text string, reg *regexp.Regexp) []string

func TimeInMilliseconds ¶

func TimeInMilliseconds() int64

func TimeInNanoseconds ¶

func TimeInNanoseconds() int64

func WebPageResolver ¶

func WebPageResolver(article *Article) string

Types ¶

type Article ¶

// Article is the result type returned (as *Article) by Crawler.Crawl,
// Goose.ExtractFromUrl and Goose.ExtractFromRawHtml. The README example reads
// Title, MetaDescription, MetaKeywords, CleanedText, FinalUrl and TopImage
// from it.
type Article struct {
	Title           string
	CleanedText     string
	MetaDescription string
	MetaLang        string
	MetaFavicon     string
	MetaKeywords    string
	CanonicalLink   string
	Domain          string
	TopNode         *goquery.Selection
	TopImage        string
	Tags            *set.Set
	Movies          *set.Set
	FinalUrl        string
	LinkHash        string
	RawHtml         string
	Doc             *goquery.Document
	// raw_doc: placeholder left by the author; no corresponding field is declared.
	PublishDate    string
	AdditionalData map[string]string
	Delta          int64
}

func (*Article) ToString ¶

func (article *Article) ToString() string

ToString is a simple string conversion: it shows just the title. TODO: add more fields and pretty-print.

type Crawler ¶

type Crawler struct {
	// contains filtered or unexported fields
}

func NewCrawler ¶

func NewCrawler(config configuration, url string, rawHtml string) Crawler

func (Crawler) Crawl ¶

func (this Crawler) Crawl() *Article

type Goose ¶

type Goose struct {
	// contains filtered or unexported fields
}

func New ¶

func New(args ...string) Goose

func (Goose) ExtractFromRawHtml ¶

func (this Goose) ExtractFromRawHtml(url string, rawHtml string) *Article

func (Goose) ExtractFromUrl ¶

func (this Goose) ExtractFromUrl(url string) *Article

type Helper ¶

type Helper struct {
	// contains filtered or unexported fields
}

func NewRawHelper ¶

func NewRawHelper(url string, rawHtml string) Helper

func NewUrlHelper ¶

func NewUrlHelper(url string) Helper

type StopWords ¶

type StopWords struct {
	// contains filtered or unexported fields
}

func NewStopwords ¶

func NewStopwords() StopWords

func (StopWords) SimpleLanguageDetector ¶

func (this StopWords) SimpleLanguageDetector(text string) string

type VideoExtractor ¶

type VideoExtractor struct {
	// contains filtered or unexported fields
}

func NewVideoExtractor ¶

func NewVideoExtractor() VideoExtractor

func (*VideoExtractor) GetVideos ¶

func (ve *VideoExtractor) GetVideos(article *Article) *set.Set

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL