Documentation ¶
Index ¶
- Constants
- Variables
- func CrawlProcess(taskChannel chan struct{}, e *Engine, task *Task)
- func ParseHTML(parser HTMLParser, ctx *Context) error
- func RandomIntRangeWithStringSeed(min int, max int, seedString string) int
- func ReadListFile(listFilePath string) ([]string, error)
- func RequestWithURL(task *Task, middlewares ...Middleware) (io.Reader, error)
- type ChannelPipeline
- type ChannelPipelineToken
- type Context
- type CookieMiddleware
- type CookieMiddlewareOption
- type CookieStore
- type DefaultCookieStore
- type DefaultItem
- func (i *DefaultItem) GetFloat64(key string) (float64, error)
- func (i *DefaultItem) GetInt(key string) (int, error)
- func (i *DefaultItem) GetString(key string) (string, error)
- func (i DefaultItem) GetToken() string
- func (i *DefaultItem) GetValue(key string) (interface{}, error)
- func (i *DefaultItem) SetValue(key string, value interface{})
- type DelayMiddleware
- type Engine
- func (e *Engine) AddHTMLParser(parsers ...HTMLParser)
- func (e *Engine) AddPipelines(pipelines ...Pipeline)
- func (e *Engine) AddPlugins(plugins ...Plugin)
- func (e *Engine) AddPostProcess(postprocessList ...PostProcess)
- func (e *Engine) AddTasks(tasks ...*Task)
- func (e *Engine) AddURLs(urls ...string)
- func (e *Engine) Run(wg *sync.WaitGroup)
- func (e *Engine) RunAndWait()
- func (e *Engine) UseMiddleware(middlewares ...Middleware)
- func (e *Engine) UseTaskPool(taskPool TaskPool)
- type EngineOption
- type GlobalStore
- type GlobalStorePipeline
- type HTMLParser
- type ImageDownloadItem
- type ImageDownloadPipeline
- type MemoryGlobalStore
- type Middleware
- type OutputCSVPostProcess
- type OutputCSVPostProcessOption
- type OutputJsonPostProcess
- type Pipeline
- type Plugin
- type PostProcess
- type ProxyMiddleware
- type ProxyMiddlewareOption
- type RequestPool
- func (p *RequestPool) AddTasks(tasks ...*Task)
- func (p *RequestPool) AddURLs(urls ...string)
- func (p *RequestPool) Close()
- func (p *RequestPool) GetCompleteCount() (int, error)
- func (p *RequestPool) GetDoneChan() chan struct{}
- func (p *RequestPool) GetOneTask(e *Engine) <-chan *Task
- func (p *RequestPool) GetTotal() (int, error)
- func (p *RequestPool) GetUnRequestCount() (int, error)
- func (p *RequestPool) GetUnRequestedTask() (target *Task)
- func (p *RequestPool) OnTaskDone(task *Task)
- func (p *RequestPool) SetPrevent(isPrevent bool)
- type RequestPoolOption
- type StatusOutputPlugin
- type Task
- type TaskPool
- type UserAgentMiddleware
- type UserAgentMiddlewareOption
Constants ¶
View Source
const (
	// total
	STATUS_KEY_TOTAL = "status.total"
	// unrequested count
	STATUS_KEY_UNREQUESTED = "status.unrequested"
	// complete count
	STATUS_KEY_COMPLETE = "status.complete"
	// speed
	STATUS_KEY_SPEED = "status.speed"
)
View Source
const (
ItemKeyChannelToken = "channelToken"
)
Variables ¶
View Source
var ( KeyNotContainError = errors.New("key not in item") TypeError = errors.New("error type of item value") )
View Source
var EngineLogger *logrus.Entry = logrus.WithField("scope", "engine")
Functions ¶
func CrawlProcess ¶
func ReadListFile ¶
func RequestWithURL ¶
func RequestWithURL(task *Task, middlewares ...Middleware) (io.Reader, error)
make request with url
Types ¶
type ChannelPipeline ¶
func (*ChannelPipeline) Process ¶
func (p *ChannelPipeline) Process(item interface{}, _ GlobalStore) error
type ChannelPipelineToken ¶
type ChannelPipelineToken interface {
GetToken() string
}
type Context ¶
type Context struct { Request *http.Request Response *http.Response Item interface{} GlobalStore GlobalStore Pool TaskPool Cookie *cookiejar.Jar Doc *goquery.Document }
Context holds the data shared across the crawl process.
type CookieMiddleware ¶
type CookieMiddleware struct { Store CookieStore GetKey func(c *http.Client, r *http.Request, ctx *Context) string }
func NewCookieMiddleware ¶
func NewCookieMiddleware(option CookieMiddlewareOption) *CookieMiddleware
func (*CookieMiddleware) RequestCallback ¶
type CookieMiddlewareOption ¶
type CookieStore ¶
type DefaultCookieStore ¶
func (*DefaultCookieStore) GetCookie ¶
func (s *DefaultCookieStore) GetCookie(key string) *cookiejar.Jar
func (*DefaultCookieStore) GetOrCreate ¶
func (s *DefaultCookieStore) GetOrCreate(key string) *cookiejar.Jar
type DefaultItem ¶
type DefaultItem struct {
Store map[string]interface{}
}
func (*DefaultItem) GetFloat64 ¶
func (i *DefaultItem) GetFloat64(key string) (float64, error)
func (DefaultItem) GetToken ¶
func (i DefaultItem) GetToken() string
func (*DefaultItem) GetValue ¶
func (i *DefaultItem) GetValue(key string) (interface{}, error)
func (*DefaultItem) SetValue ¶
func (i *DefaultItem) SetValue(key string, value interface{})
type Engine ¶
type Engine struct {
	sync.Mutex
	*EngineOption
	// dispatch task
	Pool TaskPool
	Parsers []HTMLParser
	Middlewares []Middleware
	Pipelines []Pipeline
	GlobalStore GlobalStore
	PostProcess []PostProcess
	Plugins []Plugin
	// receive signal: force stop pool
	InterruptChan chan struct{}
	// receive signal: stop pool when all tasks are done
	StopPoolChan chan struct{}
}
youcrawl engine
func (*Engine) AddPostProcess ¶
func (e *Engine) AddPostProcess(postprocessList ...PostProcess)
add postprocess
func (*Engine) AddTasks ¶
Add tasks to crawl. This is an unsafe operation: the engine must not be running.
While the engine is running, use the RequestPool.AddTasks method instead (or RequestPool.AddURLs for plain URLs).
func (*Engine) AddURLs ¶
Add URLs to crawl. This is an unsafe operation: the engine must not be running.
While the engine is running, use the RequestPool.AddURLs method instead.
func (*Engine) UseMiddleware ¶
func (e *Engine) UseMiddleware(middlewares ...Middleware)
add middleware
type EngineOption ¶
type EngineOption struct {
	// maximum number of requests running at the same time
	MaxRequest int
	// true for:
	// keep running until manually stopped
	Daemon bool
}
EngineOption is the configuration used to initialize the engine.
type GlobalStore ¶
type GlobalStore interface { Init() error SetValue(key string, value interface{}) GetValue(key string) interface{} GetOrCreate(key string, value interface{}) interface{} }
GlobalStore is the engine-wide (global) data store.
type GlobalStorePipeline ¶
type GlobalStorePipeline struct { }
GlobalStorePipeline is a pipeline that saves the current item to the global items.
func (*GlobalStorePipeline) Process ¶
func (g *GlobalStorePipeline) Process(item interface{}, store GlobalStore) error
type HTMLParser ¶
type ImageDownloadItem ¶
type ImageDownloadItem struct {
Urls []string
}
type ImageDownloadPipeline ¶
type ImageDownloadPipeline struct {
	// get the store folder
	//
	// ./download/image by default
	GetStoreFileFolder func(item interface{}, store GlobalStore) string
	// get the save filename
	//
	// same name as the image, by default
	GetSaveFileName func(item interface{}, store GlobalStore, rawURL string) string
	// get urls
	//
	// if the type of Item is ImageDownloadItem, no need to specify
	GetUrls func(item interface{}, store GlobalStore) []string
	// maximum number of concurrent downloads
	MaxDownload int
	// request middlewares to use
	Middlewares []Middleware
	// called each time an image download completes
	OnImageDownloadComplete func(item interface{}, store GlobalStore, url string, downloadFilePath string)
	// called when all image downloads are done, regardless of whether every download was successful
	OnDone func(item interface{}, store GlobalStore)
}
func (*ImageDownloadPipeline) Process ¶
func (i *ImageDownloadPipeline) Process(item interface{}, store GlobalStore) error
type MemoryGlobalStore ¶
func (*MemoryGlobalStore) GetOrCreate ¶
func (s *MemoryGlobalStore) GetOrCreate(key string, value interface{}) interface{}
func (*MemoryGlobalStore) GetValue ¶
func (s *MemoryGlobalStore) GetValue(key string) interface{}
func (*MemoryGlobalStore) Init ¶
func (s *MemoryGlobalStore) Init() error
func (*MemoryGlobalStore) SetValue ¶
func (s *MemoryGlobalStore) SetValue(key string, value interface{})
type Middleware ¶
type OutputCSVPostProcess ¶
type OutputCSVPostProcess struct {
// contains filtered or unexported fields
}
func NewOutputCSVPostProcess ¶
func NewOutputCSVPostProcess(option OutputCSVPostProcessOption) *OutputCSVPostProcess
func (*OutputCSVPostProcess) Process ¶
func (o *OutputCSVPostProcess) Process(store GlobalStore) error
type OutputCSVPostProcessOption ¶
type OutputCSVPostProcessOption struct {
	// output path.
	// if not provided, use `./output.csv` as the default value
	OutputPath string
	// with header.
	// default: false
	WithHeader bool
	// keys to write.
	// if not provided, all keys are written
	Keys []string
	// mapping from key to csv column name.
	// if not provided, the key name is used as the csv column name
	KeysMapping map[string]string
	// value to use if a value does not exist in the item.
	// by default, an empty string is used
	NotExistValue string
}
type OutputJsonPostProcess ¶
type OutputJsonPostProcess struct { StorePath string GetData func(store GlobalStore) interface{} }
func (*OutputJsonPostProcess) Process ¶
func (p *OutputJsonPostProcess) Process(store GlobalStore) error
type Pipeline ¶
type Pipeline interface {
Process(item interface{}, store GlobalStore) error
}
type PostProcess ¶
type PostProcess interface {
Process(store GlobalStore) error
}
type ProxyMiddleware ¶
type ProxyMiddleware struct {
List []string
}
func NewProxyMiddleware ¶
func NewProxyMiddleware(option ProxyMiddlewareOption) (*ProxyMiddleware, error)
func (*ProxyMiddleware) GetProxy ¶
func (p *ProxyMiddleware) GetProxy() string
func (*ProxyMiddleware) RequestCallback ¶
type ProxyMiddlewareOption ¶
type RequestPool ¶
type RequestPool struct { Tasks []Task Total int CompleteCount int NextTask *Task GetTaskChan chan *Task DoneChan chan struct{} CompleteChan chan *Task PreventStop bool Store GlobalStore sync.RWMutex }
func NewRequestPool ¶
func NewRequestPool(option RequestPoolOption, store GlobalStore) *RequestPool
func (*RequestPool) Close ¶
func (p *RequestPool) Close()
func (*RequestPool) GetCompleteCount ¶
func (p *RequestPool) GetCompleteCount() (int, error)
func (*RequestPool) GetDoneChan ¶
func (p *RequestPool) GetDoneChan() chan struct{}
func (*RequestPool) GetOneTask ¶
func (p *RequestPool) GetOneTask(e *Engine) <-chan *Task
func (*RequestPool) GetTotal ¶
func (p *RequestPool) GetTotal() (int, error)
func (*RequestPool) GetUnRequestCount ¶
func (p *RequestPool) GetUnRequestCount() (int, error)
func (*RequestPool) GetUnRequestedTask ¶
func (p *RequestPool) GetUnRequestedTask() (target *Task)
find an unrequested task
func (*RequestPool) OnTaskDone ¶
func (p *RequestPool) OnTaskDone(task *Task)
get task from pool task
func (*RequestPool) SetPrevent ¶
func (p *RequestPool) SetPrevent(isPrevent bool)
type RequestPoolOption ¶
type StatusOutputPlugin ¶
type StatusOutputPlugin struct { // disable log output LogOutput bool }
StatusOutputPlugin is a plugin that logs the engine status.
func (*StatusOutputPlugin) Run ¶
func (p *StatusOutputPlugin) Run(e *Engine)
type TaskPool ¶
type TaskPool interface { AddURLs(urls ...string) AddTasks(task ...*Task) GetOneTask(e *Engine) <-chan *Task GetUnRequestedTask() (target *Task) OnTaskDone(task *Task) GetDoneChan() chan struct{} Close() SetPrevent(isPrevent bool) GetTotal() (int, error) GetUnRequestCount() (int, error) GetCompleteCount() (int, error) }
type UserAgentMiddleware ¶
type UserAgentMiddleware struct {
List []string
}
func NewUserAgentMiddleware ¶
func NewUserAgentMiddleware(option UserAgentMiddlewareOption) (*UserAgentMiddleware, error)
func (*UserAgentMiddleware) GetUserAgent ¶
func (p *UserAgentMiddleware) GetUserAgent() string
func (*UserAgentMiddleware) RequestCallback ¶
type UserAgentMiddlewareOption ¶
type UserAgentMiddlewareOption struct {
	// set the user agent list,
	// if both UserAgentList and UserAgentFilePath are provided, the two lists are combined
	UserAgentList []string
	// read user agents from file, use `./ua.txt` by default,
	// if both UserAgentList and UserAgentFilePath are provided, the two lists are combined
	UserAgentFilePath string
}
Click to show internal directories.
Click to hide internal directories.