Documentation ¶
Index ¶
- Constants
- func CommandExtensions(message string, conn net.Conn, server *TCPServer)
- func CommandHelp(message string, conn net.Conn, server *TCPServer)
- func CommandItems(message string, conn net.Conn, server *TCPServer)
- func CommandList(message string, conn net.Conn, server *TCPServer)
- func CommandMiddleware(message string, conn net.Conn, server *TCPServer)
- func CommandStats(message string, conn net.Conn, server *TCPServer)
- func CommandStop(message string, conn net.Conn, server *TCPServer)
- func ContainsOneOf(s string, targets []string) bool
- func DelAcceptEncodingMiddleware(request *http.Request) *http.Request
- func DescribeFunc(f interface{}) string
- func DescribeStruct(v interface{}) string
- func DisplayBytes(bytes []byte)
- func DisplayResponseBody(r io.Reader)
- func GetMapKeys(m map[string]interface{}) []string
- func Logger() *logging.Logger
- func NewHTTPClient() (client *http.Client)
- func NewHTTPServer(address string, engine *Engine) (server *fury.Fury)
- func ProcessFile(config interface{}, file string) error
- func RandomUserAgentMiddleware(request *http.Request) *http.Request
- func SaveItem(item SaveableItem, dao DAO)
- func SilentRecover(name string)
- func StripString(s string) string
- type DAO
- type DisplayExtension
- type Engine
- func (engine *Engine) AddScrapers(scrapers ...*Scraper) *Engine
- func (engine *Engine) Cleanup()
- func (engine Engine) Done() bool
- func (engine *Engine) FromConfig(config *ScraperConfig) *Engine
- func (engine *Engine) GetScraper(name string) *Scraper
- func (engine *Engine) HasScraper(name string) bool
- func (engine *Engine) IncrFinishedCounter()
- func (engine *Engine) PrepareRequest(request *http.Request) *http.Request
- func (engine *Engine) SetHandler(handler ScrapingHandlerFunc) *Engine
- func (engine *Engine) Start()
- func (engine *Engine) Stop()
- func (engine *Engine) UseExtension(extensions ...Extension) *Engine
- func (engine *Engine) UseMiddleware(middleware ...RequestMiddlewareFunc) *Engine
- type EngineMeta
- type Extension
- type Extractable
- type HealthCheckResource
- type LinkExtractor
- type ListByScraperResource
- type RedisDAO
- func (r RedisDAO) CountItems(name string) int64
- func (r RedisDAO) GetItems(name string) []string
- func (r RedisDAO) GetLatestItem(name string) error
- func (r RedisDAO) KeyPrefixed(key string) string
- func (r RedisDAO) ProcessItem(item string) genericStruct
- func (r RedisDAO) ProcessItems(items []string) []genericStruct
- func (r RedisDAO) SaveItem(name string, data []byte) error
- func (r RedisDAO) String() string
- type RequestMiddlewareFunc
- type SaveInRedisExtension
- type SaveableItem
- type ScrapedItem
- func (proxy ScrapedItem) CheckIfRedirected() bool
- func (proxy ScrapedItem) CheckURLPatterns() (result bool)
- func (proxy ScrapedItem) FinalResponseBody() (io.ReadCloser, error)
- func (proxy ScrapedItem) HTMLDocument() (document *goquery.Document, err error)
- func (proxy ScrapedItem) ScheduleScraperStop()
- func (proxy ScrapedItem) String() (result string)
- type Scraper
- func (scraper *Scraper) AddPatterns(urlPatterns ...URLPattern) *Scraper
- func (scraper *Scraper) CheckIfFetched(url string) (ok bool)
- func (scraper *Scraper) CheckIfShouldStop() (ok bool)
- func (scraper *Scraper) CheckUrl(sourceUrl string) (ok bool, url string)
- func (scraper *Scraper) Fetch(url string) (resp *http.Response, err error)
- func (scraper *Scraper) MarkAsFetched(url string)
- func (scraper *Scraper) Notify(url string, resp *http.Response)
- func (scraper *Scraper) RunExtractor(resp *http.Response)
- func (scraper *Scraper) SetHandler(handler ScrapingHandlerFunc) *Scraper
- func (scraper *Scraper) Start()
- func (scraper *Scraper) Stop()
- func (scraper *Scraper) String() (result string)
- type ScraperConfig
- type ScraperMeta
- type ScraperMixin
- type ScraperParams
- type ScrapingHandlerFunc
- type StatsResource
- type TCPCommand
- type TCPMessage
- type TCPServer
- type URLPattern
Constants ¶
const (
    STATE_INITIAL  = "INITIAL"
    STATE_RUNNING  = "RUNNING"
    STATE_STOPPING = "STOPPING"
)
const (
    EVENT_SCRAPER_OPENED     = "SCRAPER_OPENED"
    EVENT_SCRAPER_CLOSED     = "SCRAPER_CLOSED"
    EVENT_SAVEABLE_EXTRACTED = "SAVEABLE_EXTRACTED"

    STATUS_CODE_INITIAL = 999

    TIMEOUT_DIALER  = time.Duration(time.Second * 30)
    TIMEOUT_REQUEST = time.Duration(time.Second * 30)
    TIMEOUT_TLS     = time.Duration(time.Second * 10)
)
const (
TCP_CONNECTION_READLINE_DEADLINE = 30
)
const TYPE_CONTAINS = "contains"
const TYPE_REGEXP = "regexp"
Variables ¶
This section is empty.
Functions ¶
func CommandExtensions ¶
func CommandExtensions(message string, conn net.Conn, server *TCPServer)
func CommandHelp ¶
func CommandHelp(message string, conn net.Conn, server *TCPServer)
func CommandItems ¶
func CommandItems(message string, conn net.Conn, server *TCPServer)
func CommandList ¶
func CommandList(message string, conn net.Conn, server *TCPServer)
func CommandMiddleware ¶
func CommandMiddleware(message string, conn net.Conn, server *TCPServer)
func CommandStats ¶
func CommandStats(message string, conn net.Conn, server *TCPServer)
func CommandStop ¶
func CommandStop(message string, conn net.Conn, server *TCPServer)
func ContainsOneOf ¶
func ContainsOneOf(s string, targets []string) bool
func DelAcceptEncodingMiddleware ¶
func DelAcceptEncodingMiddleware(request *http.Request) *http.Request
func DescribeFunc ¶
func DescribeFunc(f interface{}) string
func DescribeStruct ¶
func DescribeStruct(v interface{}) string
func DisplayBytes ¶
func DisplayBytes(bytes []byte)
func DisplayResponseBody ¶
func DisplayResponseBody(r io.Reader)
func GetMapKeys ¶
func GetMapKeys(m map[string]interface{}) []string
func Logger ¶
func Logger() *logging.Logger
func NewHTTPClient ¶
func NewHTTPClient() (client *http.Client)
func NewHTTPServer ¶
func NewHTTPServer(address string, engine *Engine) (server *fury.Fury)
func ProcessFile ¶
func ProcessFile(config interface{}, file string) error
func RandomUserAgentMiddleware ¶
func RandomUserAgentMiddleware(request *http.Request) *http.Request
func SaveItem ¶
func SaveItem(item SaveableItem, dao DAO)
func SilentRecover ¶
func SilentRecover(name string)
func StripString ¶
func StripString(s string) string
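Example ¶
A quick sketch of the helpers above. The exact semantics are inferred from the signatures (ContainsOneOf presumably reports whether s contains any of the targets); the values are illustrative:

    found := ContainsOneOf("scraper finished", []string{"finished", "failed"})
    keys := GetMapKeys(map[string]interface{}{"name": "demo", "url": "https://example.com"})
    fmt.Println(found, keys)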
Types ¶
type DAO ¶
type DisplayExtension ¶
type DisplayExtension struct{}
func (*DisplayExtension) ItemScraped ¶
func (d *DisplayExtension) ItemScraped(scraper *Scraper, item SaveableItem)
func (*DisplayExtension) ScraperStarted ¶
func (d *DisplayExtension) ScraperStarted(scraper *Scraper)
func (*DisplayExtension) ScraperStopped ¶
func (d *DisplayExtension) ScraperStopped(scraper *Scraper)
type Engine ¶
type Engine struct {
    Meta   *EngineMeta
    Config *ScraperConfig
    // contains filtered or unexported fields
}
func (*Engine) AddScrapers ¶
func (engine *Engine) AddScrapers(scrapers ...*Scraper) *Engine
func (*Engine) Cleanup ¶
func (engine *Engine) Cleanup()
func (Engine) Done ¶
func (engine Engine) Done() bool
func (*Engine) FromConfig ¶
func (engine *Engine) FromConfig(config *ScraperConfig) *Engine
func (*Engine) GetScraper ¶
func (engine *Engine) GetScraper(name string) *Scraper
func (*Engine) HasScraper ¶
func (engine *Engine) HasScraper(name string) bool
func (*Engine) IncrFinishedCounter ¶
func (engine *Engine) IncrFinishedCounter()
func (*Engine) PrepareRequest ¶
func (engine *Engine) PrepareRequest(request *http.Request) *http.Request
func (*Engine) SetHandler ¶
func (engine *Engine) SetHandler(handler ScrapingHandlerFunc) *Engine
func (*Engine) Start ¶
func (engine *Engine) Start()
func (*Engine) Stop ¶
func (engine *Engine) Stop()
func (*Engine) UseExtension ¶
func (engine *Engine) UseExtension(extensions ...Extension) *Engine
func (*Engine) UseMiddleware ¶
func (engine *Engine) UseMiddleware(middleware ...RequestMiddlewareFunc) *Engine
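Example ¶
A sketch of assembling an Engine programmatically. It assumes a zero-value Engine is a usable starting point (no Engine constructor is listed here) and that the bundled middleware functions satisfy RequestMiddlewareFunc; the scraper parameters are illustrative. See also the FromConfig sketch under ScraperConfig below.

    scraper := NewScraper(ScraperParams{
        Name:         "demo",
        Url:          "https://example.com",
        RequestLimit: 50,
        Extractor:    &LinkExtractor{},
    })

    engine := new(Engine).
        AddScrapers(scraper).
        UseMiddleware(RandomUserAgentMiddleware, DelAcceptEncodingMiddleware).
        UseExtension(&DisplayExtension{})
    engine.Start()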
type EngineMeta ¶
type EngineMeta struct {
    ScraperStats  map[string]*ScraperMeta
    Started       time.Time
    RequestsTotal int
    LastRequest   *http.Request
    LastResponse  *http.Response
    // contains filtered or unexported fields
}
func NewEngineMeta ¶
func NewEngineMeta() (m *EngineMeta)
func (*EngineMeta) IncrSaved ¶
func (meta *EngineMeta) IncrSaved(scraper *Scraper)
func (*EngineMeta) IncrScraped ¶
func (meta *EngineMeta) IncrScraped(scraper *Scraper)
func (*EngineMeta) UpdateRequestStats ¶
type Extension ¶
type Extension interface {
    ScraperStarted(scraper *Scraper)
    ScraperStopped(scraper *Scraper)
    ItemScraped(scraper *Scraper, item SaveableItem)
}
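Example ¶
Any type with these three methods can be passed to Engine.UseExtension. A minimal sketch; the LogExtension type is hypothetical and uses fmt for output:

    type LogExtension struct{}

    func (e *LogExtension) ScraperStarted(scraper *Scraper) { fmt.Println("started:", scraper.Name) }
    func (e *LogExtension) ScraperStopped(scraper *Scraper) { fmt.Println("stopped:", scraper.Name) }
    func (e *LogExtension) ItemScraped(scraper *Scraper, item SaveableItem) {
        fmt.Println("item scraped by", scraper.Name)
    }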
type Extractable ¶
type Extractable interface {
Extract(io.ReadCloser, func(string))
}
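Example ¶
Extract receives the response body and a callback that accepts each extracted string (LinkExtractor below is the bundled implementation). A hypothetical MarkerExtractor that emits lines containing a marker, assuming bufio and strings are imported:

    type MarkerExtractor struct{ Marker string }

    func (e *MarkerExtractor) Extract(r io.ReadCloser, callback func(string)) {
        defer r.Close()
        scanner := bufio.NewScanner(r)
        for scanner.Scan() {
            if line := scanner.Text(); strings.Contains(line, e.Marker) {
                callback(line)
            }
        }
    }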
type HealthCheckResource ¶
type HealthCheckResource struct {
// contains filtered or unexported fields
}
func (HealthCheckResource) Get ¶
func (resource HealthCheckResource) Get(meta *fury.Meta)
type LinkExtractor ¶
type LinkExtractor struct {
Extractable
}
func (*LinkExtractor) Extract ¶
func (extractor *LinkExtractor) Extract(r io.ReadCloser, callback func(string))
type ListByScraperResource ¶
type ListByScraperResource struct {
// contains filtered or unexported fields
}
func (ListByScraperResource) Get ¶
func (resource ListByScraperResource) Get(meta *fury.Meta)
type RedisDAO ¶
type RedisDAO struct {
// contains filtered or unexported fields
}
func NewRedisDao ¶
func (RedisDAO) CountItems ¶
func (r RedisDAO) CountItems(name string) int64
func (RedisDAO) GetItems ¶
func (r RedisDAO) GetItems(name string) []string
func (RedisDAO) GetLatestItem ¶
func (r RedisDAO) GetLatestItem(name string) error
func (RedisDAO) KeyPrefixed ¶
func (r RedisDAO) KeyPrefixed(key string) string
func (RedisDAO) ProcessItem ¶
func (r RedisDAO) ProcessItem(item string) genericStruct
func (RedisDAO) ProcessItems ¶
func (r RedisDAO) ProcessItems(items []string) []genericStruct
func (RedisDAO) SaveItem ¶
func (r RedisDAO) SaveItem(name string, data []byte) error
func (RedisDAO) String ¶
func (r RedisDAO) String() string
type SaveInRedisExtension ¶
type SaveInRedisExtension struct{}
func (*SaveInRedisExtension) ItemScraped ¶
func (d *SaveInRedisExtension) ItemScraped(scraper *Scraper, item SaveableItem)
func (*SaveInRedisExtension) ScraperStarted ¶
func (d *SaveInRedisExtension) ScraperStarted(scraper *Scraper)
func (*SaveInRedisExtension) ScraperStopped ¶
func (d *SaveInRedisExtension) ScraperStopped(scraper *Scraper)
type SaveableItem ¶
type ScrapedItem ¶
type ScrapedItem struct {
    Url       string
    FinalUrl  string `json:"-"`
    BodyBytes []byte `json:"-"`
    // contains filtered or unexported fields
}
func NewScrapedItem ¶
func NewScrapedItem(url string, scraper *Scraper, resp *http.Response) ScrapedItem
func (ScrapedItem) CheckIfRedirected ¶
func (proxy ScrapedItem) CheckIfRedirected() bool
func (ScrapedItem) CheckURLPatterns ¶
func (proxy ScrapedItem) CheckURLPatterns() (result bool)
func (ScrapedItem) FinalResponseBody ¶
func (proxy ScrapedItem) FinalResponseBody() (io.ReadCloser, error)
func (ScrapedItem) HTMLDocument ¶
func (proxy ScrapedItem) HTMLDocument() (document *goquery.Document, err error)
func (ScrapedItem) ScheduleScraperStop ¶
func (proxy ScrapedItem) ScheduleScraperStop()
func (ScrapedItem) String ¶
func (proxy ScrapedItem) String() (result string)
type Scraper ¶
type Scraper struct {
    Name       string
    Domain     string
    Scheme     string
    BaseUrl    string
    CurrentUrl string
    // contains filtered or unexported fields
}
func NewScraper ¶
func NewScraper(params ScraperParams) (s *Scraper)
func (*Scraper) AddPatterns ¶
func (scraper *Scraper) AddPatterns(urlPatterns ...URLPattern) *Scraper
func (*Scraper) CheckIfFetched ¶
func (scraper *Scraper) CheckIfFetched(url string) (ok bool)
func (*Scraper) CheckIfShouldStop ¶
func (scraper *Scraper) CheckIfShouldStop() (ok bool)
func (*Scraper) CheckUrl ¶
func (scraper *Scraper) CheckUrl(sourceUrl string) (ok bool, url string)
func (*Scraper) Fetch ¶
func (scraper *Scraper) Fetch(url string) (resp *http.Response, err error)
func (*Scraper) MarkAsFetched ¶
func (scraper *Scraper) MarkAsFetched(url string)
func (*Scraper) Notify ¶
func (scraper *Scraper) Notify(url string, resp *http.Response)
func (*Scraper) RunExtractor ¶
func (scraper *Scraper) RunExtractor(resp *http.Response)
func (*Scraper) SetHandler ¶
func (scraper *Scraper) SetHandler(handler ScrapingHandlerFunc) *Scraper
func (*Scraper) Start ¶
func (scraper *Scraper) Start()
func (*Scraper) Stop ¶
func (scraper *Scraper) Stop()
func (*Scraper) String ¶
func (scraper *Scraper) String() (result string)
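Example ¶
A sketch of a standalone scraper restricted to matching URLs; the parameters and patterns are illustrative (see ScrapingHandlerFunc below for a handler):

    scraper := NewScraper(ScraperParams{
        Name:         "news",
        Url:          "https://example.com",
        RequestLimit: 25,
        Extractor:    &LinkExtractor{},
    }).AddPatterns(
        NewURLPattern(TYPE_CONTAINS, "/news/"),
        NewURLPattern(TYPE_REGEXP, `/20[0-9]{2}/`),
    )
    scraper.Start()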
type ScraperConfig ¶
type ScraperConfig struct {
    Project      string `required:"true"`
    HttpAddress  string
    TcpAddress   string
    RedisAddress string
    Scrapers     []struct {
        RequestLimit int    `required:"true"`
        Extractor    string
        Name         string `required:"true"`
        Url          string `required:"true"`
        Patterns     []struct {
            Type    string `required:"true"`
            Pattern string `required:"true"`
        }
    }
}
func NewSpiderConfig ¶
func NewSpiderConfig(file string) (config *ScraperConfig)
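Example ¶
The config-driven path. This assumes a zero-value Engine is a valid receiver for FromConfig; the file path is hypothetical and its on-disk format is whatever ProcessFile expects:

    config := NewSpiderConfig("scrapers.yml") // hypothetical path
    engine := new(Engine).FromConfig(config)
    engine.Start()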
type ScraperMeta ¶
type ScraperMeta struct {
// contains filtered or unexported fields
}
func NewScraperMeta ¶
func NewScraperMeta() (m *ScraperMeta)
type ScraperMixin ¶
type ScraperMixin struct {
Proxy ScrapedItem
}
func (ScraperMixin) Scraper ¶
func (item ScraperMixin) Scraper() *Scraper
func (*ScraperMixin) SetProxy ¶
func (s *ScraperMixin) SetProxy(proxy ScrapedItem) *ScraperMixin
type ScraperParams ¶
type ScraperParams struct {
    Name         string
    Url          string
    RequestLimit int
    Extractor    Extractable
}
type ScrapingHandlerFunc ¶
type ScrapingHandlerFunc func(ScrapedItem, chan<- SaveableItem)
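Example ¶
A handler sees every ScrapedItem and may push saveable items onto the channel. This sketch only inspects the parsed document (constructing a SaveableItem is omitted because its definition is not shown here); engine is an Engine built as in the example above:

    handler := func(item ScrapedItem, out chan<- SaveableItem) {
        doc, err := item.HTMLDocument()
        if err != nil {
            return
        }
        fmt.Println(item.Url, "->", doc.Find("title").Text())
    }
    engine.SetHandler(handler)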
type StatsResource ¶
type StatsResource struct {
// contains filtered or unexported fields
}
func (StatsResource) Get ¶
func (resource StatsResource) Get(meta *fury.Meta)
type TCPMessage ¶
type TCPMessage struct {
// contains filtered or unexported fields
}
type TCPServer ¶
type TCPServer struct {
// contains filtered or unexported fields
}
func NewTCPServer ¶
func (*TCPServer) AddCommand ¶
func (server *TCPServer) AddCommand(name string, handler TCPCommand)
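Example ¶
Registering a custom command, assuming TCPCommand has the same shape as the built-in Command* handlers listed in the index (its definition is not shown here); the command name is illustrative and server comes from NewTCPServer:

    server.AddCommand("PING", func(message string, conn net.Conn, server *TCPServer) {
        conn.Write([]byte("PONG\n"))
    })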
type URLPattern ¶
func NewURLPattern ¶
func NewURLPattern(kind string, pattern string) (instance URLPattern)
func (URLPattern) String ¶
func (item URLPattern) String() (result string)
func (*URLPattern) Validate ¶
func (item *URLPattern) Validate(url string) (result bool)
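Example ¶
A quick sketch of validation with the two documented pattern kinds; whether the regexp must match the whole URL or any substring is not specified here:

    pattern := NewURLPattern(TYPE_REGEXP, `/articles/[0-9]+$`)
    ok := pattern.Validate("https://example.com/articles/42")
    fmt.Println(ok)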
Source Files ¶