goose

package module
v0.0.0-...-d8d32e0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 19, 2015 License: Apache-2.0 Imports: 22 Imported by: 0

README

GoOse

Html Content / Article Extractor in Golang

This is a Golang port of "Goose", originally licensed to Gravity.com under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.

Golang port was written by Antonio Linari

Gravity.com licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

INSTALL

go get github.com/advancedlogic/GoOse

HOW TO USE IT

package main

import (
	"github.com/advancedlogic/GoOse"
)

// main extracts a sample CNN article with GoOse and prints a selection of
// the fields found on the returned Article, one labelled line per field.
func main() {
	const sampleURL = "http://edition.cnn.com/2012/07/08/opinion/banzi-ted-open-source/index.html"

	extractor := goose.New()
	page := extractor.ExtractFromUrl(sampleURL)

	// Page metadata.
	println("title", page.Title)
	println("description", page.MetaDescription)
	println("keywords", page.MetaKeywords)

	// Extracted body text, the resolved URL and the lead image.
	println("content", page.CleanedText)
	println("url", page.FinalUrl)
	println("top image", page.TopImage)
}

TODO

  • better organize code
  • add comments and godoc
  • improve "xpath" like queries
  • add other image extraction techniques (imagemagick)

THANKS TO

@Martin Angers for goquery
@Fatih Arslan for set
GoLang team for the amazing language and net/html

Documentation

Index

Constants

View Source
// DEFAULT_LANGUAGE is the two-letter language code "en".
// NOTE(review): presumably the fallback when no language is configured or
// detected — confirm against callers.
const DEFAULT_LANGUAGE = "en"

Variables

View Source
// ARROWS_SPLITTER matches the "»" character.
// NOTE(review): the name suggests it is used to split text on that
// separator — confirm against callers.
var ARROWS_SPLITTER = regexp.MustCompile("»")
View Source
// A_HREF_TAG_SELECTOR is a fixed-size array of four URL fragments:
// "/tag/", "/tags/", "/topic/" and "?keyword".
// NOTE(review): presumably matched against link hrefs to recognise tag
// links — confirm against callers.
var A_HREF_TAG_SELECTOR = [...]string{"/tag/", "/tags/", "/topic/", "?keyword"}
View Source
// A_REL_TAG_SELECTOR is a selector string for anchor elements whose rel
// attribute equals "tag".
var A_REL_TAG_SELECTOR = "a[rel=tag]"
View Source
// CAPTIONS_RE matches only when the whole input is exactly "caption".
var CAPTIONS_RE = regexp.MustCompile("^caption$")
View Source
// COLON_SPLITTER matches a single ":" character.
var COLON_SPLITTER = regexp.MustCompile(":")
View Source
// DASH_SPLITTER matches a hyphen with exactly one space on each side (" - ").
var DASH_SPLITTER = regexp.MustCompile(" - ")
View Source
// ESCAPED_FRAGMENT_REPLACEMENT matches the two-character sequence "#!".
// NOTE(review): presumably used to rewrite hash-bang URLs — confirm
// against callers.
var ESCAPED_FRAGMENT_REPLACEMENT = regexp.MustCompile("#!")
View Source
// FACEBOOK_BROADCASTING_RE matches the literal text "facebook-broadcasting"
// anywhere in the input.
var FACEBOOK_BROADCASTING_RE = regexp.MustCompile("facebook-broadcasting")
View Source
// FACEBOOK_RE matches "facebook" when it is preceded by one character that
// is not "-". Because that preceding character is required, an input that
// begins with "facebook" does not match at position 0.
var FACEBOOK_RE = regexp.MustCompile("[^-]facebook")
View Source
// GOOGLE_RE matches "google" with a single space immediately before and
// after it.
var GOOGLE_RE = regexp.MustCompile(" google ")
View Source
// MORE_RE matches an input made of exactly one leading character that is
// none of 'e', 'n', 't', 'r', 'y' or '-', followed by "more" and then any
// remaining characters up to the end of the input.
// NOTE(review): "[^entry-]" is a negated character class, not a test for
// the prefix "entry-"; an input of just "more" therefore does not match.
// Confirm whether that is the intended behavior.
var MORE_RE = regexp.MustCompile("^[^entry-]more.*$")
View Source
// MOTLEY_REPLACEMENT is a plain string, not a compiled regexp.
// NOTE(review): the literal renders here as U+FFFD (the Unicode replacement
// character); confirm whether that is the intended value or an encoding
// artifact of this listing.
var MOTLEY_REPLACEMENT = "�"
View Source
// PIPE_SPLITTER matches a literal "|". The backslash escape is needed
// because an unescaped "|" is the alternation operator in regexp syntax.
var PIPE_SPLITTER = regexp.MustCompile("\\|")
View Source
// PUNCTUATION matches any single character that is NOT one of: a lowercase,
// uppercase, titlecase or other letter (Unicode classes Ll, Lu, Lt, Lo), a
// decimal digit (Nd), connector punctuation (Pc), or whitespace (\s).
var PUNCTUATION = regexp.MustCompile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]")
View Source
// REMOVENODES_RE is one large alternation of name fragments. Alternatives
// written without anchors match anywhere in the input; those written with
// "^" and/or "$" (for example "^side$", "^side_" and "meta$") only match at
// the start and/or end of the input. Matching is case-sensitive except where
// a character class such as "[Ff]" spells out both cases.
// NOTE(review): presumably tested against id/class attribute values to pick
// nodes to strip from the document — confirm against the cleaner.
// NOTE(review): several alternatives are already covered by shorter ones
// ("foot" matches anything containing "footnote", "popup" covers
// "img_popup_single", "tools" covers "pagetools" and "socialtools"); they do
// not change what matches.
var REMOVENODES_RE = regexp.MustCompile("" +
	"PopularQuestions|" +
	"[Cc]omentario|" +
	"[Ff]ooter|" +
	"^fn$|" +
	"^inset$|" +
	"^print$|" +
	"^scroll$|" +
	"^side$|" +
	"^side_|" +
	"^widget$|" +
	"ajoutVideo|" +
	"articleheadings|" +
	"author-dropdown|" +
	"blog-pager|" +
	"breadcrumbs|" +
	"byline|" +
	"cabecalho|" +
	"cnnStryHghLght|" +
	"cnn_html_slideshow|" +
	"cnn_strycaptiontxt|" +
	"cnn_strylftcntnt|" +
	"cnn_stryspcvbx|" +
	"combx|" +
	"comment|" +
	"communitypromo|" +
	"contact|" +
	"contentTools2|" +
	"controls|" +
	"^date$|" +
	"detail_new_|" +
	"detail_related_|" +
	"figcaption|" +
	"footnote|" +
	"foot|" +
	"header|" +
	"img_popup_single|" +
	"js_replies|" +
	"[Kk]ona[Ff]ilter|" +
	"leading|" +
	"legende|" +
	"links|" +
	"mediaarticlerelated|" +
	"menucontainer|" +
	"meta$|" +
	"navbar|" +
	"pagetools|" +
	"popup|" +
	"post-attributes|" +
	"post-title|" +
	"relacionado|" +
	"retweet|" +
	"runaroundLeft|" +
	"shoutbox|" +
	"site_nav|" +
	"socialNetworking|" +
	"social_|" +
	"socialnetworking|" +
	"socialtools|" +
	"sponsor|" +
	"sub_nav|" +
	"subscribe|" +
	"tag_|" +
	"tags|" +
	"the_answers|" +
	"timestamp|" +
	"tools|" +
	"vcard|" +
	"welcome_form|" +
	"wp-caption-text")
View Source
// RE_LANG is a pattern string (not compiled here) that matches an input
// consisting of exactly two ASCII letters.
var RE_LANG = "^[A-Za-z]{2}$"
View Source
// SPACE_SPLITTER matches a single space character.
var SPACE_SPLITTER = regexp.MustCompile(" ")
View Source
// TITLE_REPLACEMENTS matches the "»" character.
// NOTE(review): the name suggests matches are substituted in titles —
// confirm against callers.
var TITLE_REPLACEMENTS = regexp.MustCompile("»")
View Source
// TWITTER_RE matches "twitter" when it is preceded by one character that is
// not "-". Because that preceding character is required, an input that
// begins with "twitter" does not match at position 0.
var TWITTER_RE = regexp.MustCompile("[^-]twitter")

Functions

func GetDefualtConfiguration

func GetDefualtConfiguration(args ...string) configuration

func NewCleaner

func NewCleaner(config configuration) cleaner

func NewExtractor

func NewExtractor(config configuration) contentExtractor

func NewParser

func NewParser() *parser

func OpenGraphResolver

func OpenGraphResolver(article *Article) string

func ReadLinesOfFile

func ReadLinesOfFile(filename string) []string

func RegSplit

func RegSplit(text string, reg *regexp.Regexp) []string

func TimeInMilliseconds

func TimeInMilliseconds() int64

func TimeInNanoseconds

func TimeInNanoseconds() int64

func WebPageResolver

func WebPageResolver(article *Article) string

Types

type Article

// Article is the value handed back (as *Article) by Goose.ExtractFromUrl,
// Goose.ExtractFromRawHtml and Crawler.Crawl.
//
// The README example reads Title, MetaDescription, MetaKeywords,
// CleanedText, FinalUrl and TopImage from it. TopNode and Doc are goquery
// handles, and Tags and Movies are *set.Set collections.
//
// NOTE(review): the meaning of the remaining fields (LinkHash, PublishDate,
// AdditionalData, Delta, ...) is not visible in this listing; Delta is
// presumably an elapsed-time measurement — confirm against the crawler.
type Article struct {
	Title           string
	CleanedText     string
	MetaDescription string
	MetaLang        string
	MetaFavicon     string
	MetaKeywords    string
	CanonicalLink   string
	Domain          string
	TopNode         *goquery.Selection
	TopImage        string
	Tags            *set.Set
	Movies          *set.Set
	FinalUrl        string
	LinkHash        string
	RawHtml         string
	Doc             *goquery.Document
	//raw_doc
	PublishDate    string
	AdditionalData map[string]string
	Delta          int64
}

func (*Article) ToString

func (article *Article) ToString() string

Simple ToString: it shows just the title. TODO: add more fields and pretty-print.

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(config configuration, url string, rawHtml string) Crawler

func (Crawler) Crawl

func (this Crawler) Crawl() *Article

type Goose

type Goose struct {
	// contains filtered or unexported fields
}

func New

func New(args ...string) Goose

func (Goose) ExtractFromRawHtml

func (this Goose) ExtractFromRawHtml(url string, rawHtml string) *Article

func (Goose) ExtractFromUrl

func (this Goose) ExtractFromUrl(url string) *Article

type Helper

type Helper struct {
	// contains filtered or unexported fields
}

func NewRawHelper

func NewRawHelper(url string, rawHtml string) Helper

func NewUrlHelper

func NewUrlHelper(url string) Helper

type StopWords

type StopWords struct {
	// contains filtered or unexported fields
}

func NewStopwords

func NewStopwords() StopWords

func (StopWords) SimpleLanguageDetector

func (this StopWords) SimpleLanguageDetector(text string) string

type VideoExtractor

type VideoExtractor struct {
	// contains filtered or unexported fields
}

func NewVideoExtractor

func NewVideoExtractor() VideoExtractor

func (*VideoExtractor) GetVideos

func (ve *VideoExtractor) GetVideos(article *Article) *set.Set

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL