articler

package module
v0.0.0-...-b292b7d Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 18, 2016 License: MIT Imports: 21 Imported by: 2

README

articler

GoDoc

Documentation

Index

Constants

This section is empty.

Variables

View Source
// Sentinel errors exported by the package. Callers should compare against
// these values by identity (or with errors.Is) rather than by message text.
// NOTE(review): presumably produced while reading rule lines (see LoadRules);
// the code that returns them is not visible here — confirm.
var (
	// ErrCommentLine reports that a line is a comment.
	ErrCommentLine error = fmt.Errorf("line is comment")
	// ErrEmptyLine reports that a line is empty.
	ErrEmptyLine         = fmt.Errorf("empty line")
	// ErrEmptyRule reports that a rule is empty.
	ErrEmptyRule         = fmt.Errorf("empty rule")
)
View Source
// MaxBodySize is a package-level size limit, initialised to 1 MiB
// (1048576). It is a variable, so importers can change it.
// NOTE(review): presumably the cap on bytes read from a fetched body (see
// LimitedReadCloser) — confirm against the code that reads it.
var MaxBodySize int64 = 1 << 20

Functions

func RegisterArticleParser

func RegisterArticleParser(name string, adapter ArticleParserAdapter)

Types

type Adapter

// Adapter is the method set a per-site source must provide: it identifies
// itself, lists article URLs, recognises article links, and parses raw bytes
// into an Article. DefaultParser declares methods with these same names and
// signatures.
type Adapter interface {
	// Domain returns a string identifying the adapter's site.
	// NOTE(review): presumably a host name — confirm with an implementation.
	Domain() string
	// LastArticles returns a list of URLs, or an error.
	// NOTE(review): presumably links to the most recent articles — confirm.
	LastArticles() ([]*url.URL, error)
	// Disabled method, kept for reference:
	//ExtractFeedLinks([]byte) ([]*url.URL, error)
	// Parse builds an Article from a byte slice, or returns an error.
	Parse([]byte) (*Article, error)
	// IsArticle reports a yes/no verdict for the given string.
	// NOTE(review): presumably the string is a URL — confirm.
	IsArticle(string) bool
	// Name returns the adapter's name.
	Name() string
}

type AdapterConfig

// AdapterConfig groups the settings for one adapter: site identity, feed
// discovery, article-link matching, and title/body/date extraction.
// ParseAdapterConfig returns a pointer to this type. Only Host carries a
// yaml tag in this declaration.
// NOTE(review): how a selector and its matching *Func field interact (which
// wins when both are set) is not visible here — confirm in the implementation.
type AdapterConfig struct {
	// Host is tagged yaml:"host,omitempty".
	Host   string `yaml:"host,omitempty"`
	Scheme string
	Name   string

	// Feed settings.
	FeedUri         string
	FeedType        string // html or rss
	FeedUriGenerate FeedUriGenerateFunc
	FeedExtract     FeedExtractFunc

	// FeedSelector is the selector for links, e.g. "main a".
	// It requires FeedType to be html.
	FeedSelector string

	// ArticleUriRegex is a regular expression, stored as a string.
	// NOTE(review): presumably matched against candidate article URIs — confirm.
	ArticleUriRegex string
	// ParseFunc converts raw bytes into an Article (see ExtractArticleFunc).
	ParseFunc       ExtractArticleFunc

	// Title extraction.
	TitleSelector    string
	TitleExtractFunc ExtractFunc

	// Body extraction.
	BodySelector    string
	BodyExtractFunc ExtractFunc

	// Date extraction.
	// NOTE(review): DateFormat is presumably a time layout string — confirm.
	DateSelector    string
	DateFormat      string
	DateRegex       string
	DateExtractFunc ExtractTimeFunc
}

func ParseAdapterConfig

func ParseAdapterConfig(in []byte) (*AdapterConfig, error)

type Article

// Article is the result type of the package's parsing functions and methods
// (ParseArticle, Adapter.Parse, ArticleParserAdapter.Parse).
type Article struct {
	// Title and Text hold the article's title and text.
	Title     string
	Text      string
	// Published is a timestamp.
	// NOTE(review): presumably the publication time — confirm.
	Published time.Time
	// NOTE(review): Source is presumably the URL the article came from — confirm.
	Source    string

	// NOTE(review): the meaning of Parsed is not evident from this
	// declaration — confirm against the code that sets it.
	Parsed string
}

func ParseArticle

func ParseArticle(URL string, in []byte) (art *Article, e error)

type ArticleParserAdapter

// ArticleParserAdapter is the parser interface accepted by
// RegisterArticleParser. DefaultArticleParser declares Parse and IsArticle
// methods with these signatures.
type ArticleParserAdapter interface {
	// Parse builds an Article from a string and a byte slice, or returns an
	// error. DefaultArticleParser.Parse names these parameters rawurl and in.
	Parse(string, []byte) (*Article, error)
	// IsArticle reports a yes/no verdict for the given string.
	// NOTE(review): presumably the string is a URL — confirm.
	IsArticle(string) bool
}

type Articler

// Articler is the type returned by New. It offers ParseArticle, which takes
// a URL context string plus raw bytes, and ParseArticleFromUrl, which takes
// only a URL string.
type Articler struct {
	// contains filtered or unexported fields
}

func New

func New(configs ...*Config) (art *Articler, e error)

func (*Articler) ParseArticle

func (a *Articler) ParseArticle(urlContext string, data []byte) (*Article, error)

func (*Articler) ParseArticleFromUrl

func (a *Articler) ParseArticleFromUrl(URL string) (*Article, error)

type Articles

// Articles is a slice of Article pointers. Its Len, Less and Swap methods
// have the signatures required by sort.Interface.
// NOTE(review): the ordering Less defines is not visible here — confirm.
type Articles []*Article

func (Articles) Len

func (a Articles) Len() int

func (Articles) Less

func (a Articles) Less(i, j int) bool

func (Articles) Swap

func (a Articles) Swap(i, j int)

type Channel

// Channel maps the XML elements title, link and description, plus every
// item element, onto Go fields. It is the type of Rss.Channel.
type Channel struct {
	Title       string `xml:"title"`
	Link        string `xml:"link"`
	Description string `xml:"description"`
	// Items collects each <item> child element.
	Items       []Item `xml:"item"`
}

type Config

// Config carries options for New, which accepts any number of *Config values.
type Config struct {
	// NOTE(review): presumably the path of the rules file for the default
	// article parser (compare NewFromFile / LoadRules) — confirm.
	DefaultArticleParserConf string
}

type Context

// Context wraps a single parsed URL.
// NOTE(review): where Context is consumed is not visible in this listing.
type Context struct {
	// U is the URL held by the context.
	U *url.URL
}

type Crawler

// Crawler is a one-method interface for obtaining an HTTP response.
type Crawler interface {
	// Fetch returns an HTTP response for the given string, or an error.
	// NOTE(review): presumably the string is a URL — confirm.
	Fetch(string) (*http.Response, error)
}

type DB

// DB is the storage interface. It has two groups of methods: visit tracking
// for crawlers and persistence of articles.
// NOTE(review): this listing shows MemoryDb declaring only Visit and
// Visited — confirm whether it satisfies the whole interface.
type DB interface {
	// For crawlers: query and record visits, keyed by a string.
	Visited(string) bool
	Visit(string) error

	// For articles: store one, look one up by url, or list all.
	Save(*Article) error
	Get(url string) (*Article, error)
	GetAll() []*Article
}

type DefaultArticleParser

// DefaultArticleParser is created with NewDefaultArticleParser or
// NewFromFile. It can load rules from a file path via LoadRules. Its Parse
// and IsArticle methods match the ArticleParserAdapter signatures; IsArticle
// declares its parameter as blank, so it does not use the argument.
type DefaultArticleParser struct {
	// contains filtered or unexported fields
}

func NewDefaultArticleParser

func NewDefaultArticleParser() *DefaultArticleParser

func NewFromFile

func NewFromFile(filepath string) (*DefaultArticleParser, error)

func (*DefaultArticleParser) IsArticle

func (p *DefaultArticleParser) IsArticle(_ string) bool

func (*DefaultArticleParser) LoadRules

func (p *DefaultArticleParser) LoadRules(filepath string) error

func (*DefaultArticleParser) Parse

func (p *DefaultArticleParser) Parse(rawurl string, in []byte) (*Article, error)

type DefaultFetcher

// DefaultFetcher is a field-less struct. Its Get and Head methods have the
// signatures declared by the Fetcher interface.
type DefaultFetcher struct {
}

func (*DefaultFetcher) Get

func (d *DefaultFetcher) Get(u string) (*http.Response, error)

func (*DefaultFetcher) Head

func (d *DefaultFetcher) Head(u string) (*http.Response, error)

type DefaultParser

// DefaultParser is created with NewDefaultParser from a base URL string. It
// declares Domain, LastArticles, Parse, IsArticle and Name with the same
// signatures as the Adapter interface.
type DefaultParser struct {
	// contains filtered or unexported fields
}

func NewDefaultParser

func NewDefaultParser(baseUrl string) *DefaultParser

func (*DefaultParser) Domain

func (s *DefaultParser) Domain() string

func (*DefaultParser) IsArticle

func (s *DefaultParser) IsArticle(u string) bool

func (*DefaultParser) LastArticles

func (s *DefaultParser) LastArticles() ([]*url.URL, error)

func (*DefaultParser) Name

func (s *DefaultParser) Name() string

func (*DefaultParser) Parse

func (s *DefaultParser) Parse(bts []byte) (*Article, error)

type ExtractArticleFunc

// ExtractArticleFunc converts raw bytes into an Article, or returns an error.
// It is the type of AdapterConfig.ParseFunc.
type ExtractArticleFunc func([]byte) (*Article, error)

type ExtractFunc

// ExtractFunc maps one byte slice to another, or returns an error. It is the
// type of AdapterConfig.TitleExtractFunc and AdapterConfig.BodyExtractFunc.
type ExtractFunc func([]byte) ([]byte, error)

type ExtractTimeFunc

// ExtractTimeFunc derives a time.Time from raw bytes, or returns an error.
// It is the type of AdapterConfig.DateExtractFunc.
type ExtractTimeFunc func([]byte) (time.Time, error)

type FeedExtractFunc

// FeedExtractFunc derives a list of URLs from raw bytes, or returns an error.
// It is the type of AdapterConfig.FeedExtract.
type FeedExtractFunc func([]byte) ([]*url.URL, error)

type FeedUriGenerateFunc

// FeedUriGenerateFunc takes no arguments and returns a URL. It is the type
// of AdapterConfig.FeedUriGenerate.
type FeedUriGenerateFunc func() *url.URL

type Fetcher

// Fetcher is the interface for obtaining HTTP responses. DefaultFetcher
// declares both methods with these signatures.
// NOTE(review): the method names suggest HTTP GET and HEAD requests, and the
// string argument is presumably a URL — confirm with an implementation.
type Fetcher interface {
	// Get returns a response for the given string, or an error.
	Get(string) (*http.Response, error)
	// Head returns a response for the given string, or an error.
	Head(string) (*http.Response, error)
}

type Item

// Item maps the XML elements title, link and description onto Go fields.
// It is the element type of Channel.Items.
type Item struct {
	Title       string `xml:"title"`
	Link        string `xml:"link"`
	Description string `xml:"description"`
}

type LimitedReadCloser

// LimitedReadCloser embeds an io.ReadCloser alongside an int64 N and
// declares its own Read method. Close is not redeclared in this listing, so
// it is promoted from the embedded value. NewLimitedReadCloser builds one
// from a ReadCloser and an int64.
// NOTE(review): presumably N is the number of bytes still allowed to be
// read, by analogy with io.LimitedReader — confirm in Read.
type LimitedReadCloser struct {
	io.ReadCloser
	N int64
}

func NewLimitedReadCloser

func NewLimitedReadCloser(rc io.ReadCloser, l int64) *LimitedReadCloser

func (*LimitedReadCloser) Read

func (l *LimitedReadCloser) Read(p []byte) (n int, err error)

type MemoryDb

// MemoryDb declares Visit and Visited with the signatures of the
// crawler-facing half of the DB interface.
// NOTE(review): the name suggests in-memory storage, but the fields are not
// visible here — confirm.
type MemoryDb struct {
	// contains filtered or unexported fields
}

func (*MemoryDb) Visit

func (db *MemoryDb) Visit(u string) error

func (*MemoryDb) Visited

func (db *MemoryDb) Visited(u string) bool

type Parserer

// Parserer is an interface with no methods, so every type satisfies it.
// NOTE(review): no use of it is visible in this listing.
type Parserer interface {
}

type Rss

// Rss is the value returned by DecodeRss. It exposes the document's channel
// element as a Channel.
type Rss struct {
	Channel Channel `xml:"channel"`
}

func DecodeRss

func DecodeRss(in []byte) (Rss, error)

type Rule

// Rule pairs a host with three selectors, one each for title, text and
// published date.
// NOTE(review): presumably these are the rules DefaultArticleParser loads
// via LoadRules, and the selector syntax is presumably CSS-like — neither is
// visible here; confirm.
type Rule struct {
	Host string

	TitleSelector     string
	TextSelector      string
	PublishedSelector string
}

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL