Documentation
Overview ¶
Package etc_config implements config initialization for one spider.
Index ¶
- func Conf() *goutils.Config
- func ReadHeaderFromFile(headerFile string) http.Header
- func StartConf(configFilePath string) *goutils.Config
- type CollectPipeline
- type CollectPipelinePageItems
- type Downloader
- type Page
- func (self *Page) AddField(key string, value string)
- func (self *Page) AddTargetRequest(req *Request) *Page
- func (self *Page) AddTargetRequests(reqs []*Request) *Page
- func (self *Page) Errormsg() string
- func (self *Page) GetBodyStr() string
- func (self *Page) GetCookies() []*http.Cookie
- func (self *Page) GetHeader() http.Header
- func (self *Page) GetHtmlParser() *goquery.Document
- func (self *Page) GetJson() *simplejson.Json
- func (self *Page) GetPageItems() *PageItems
- func (self *Page) GetRequest() *Request
- func (self *Page) GetSkip() bool
- func (self *Page) GetTargetRequests() []*Request
- func (self *Page) GetUrlTag() string
- func (self *Page) IsSucc() bool
- func (self *Page) ResetHtmlParser() *goquery.Document
- func (self *Page) SetBodyStr(body string) *Page
- func (self *Page) SetCookies(cookies []*http.Cookie)
- func (self *Page) SetHeader(header http.Header)
- func (self *Page) SetHtmlParser(doc *goquery.Document) *Page
- func (self *Page) SetJson(js *simplejson.Json) *Page
- func (self *Page) SetRequest(r *Request) *Page
- func (self *Page) SetSkip(skip bool)
- func (self *Page) SetStatus(isfail bool, errormsg string)
- type PageItems
- type PageProcesser
- type Pipeline
- type Request
- func (self *Request) AddHeaderFile(headerFile string) *Request
- func (self *Request) AddProxyHost(host string) *Request
- func (self *Request) GetBaseUrl() string
- func (self *Request) GetCookies() []*http.Cookie
- func (self *Request) GetHeader() http.Header
- func (self *Request) GetMeta() interface{}
- func (self *Request) GetMethod() string
- func (self *Request) GetPostdata() string
- func (self *Request) GetProxyHost() string
- func (self *Request) GetResponceType() string
- func (self *Request) GetUrl() string
- func (self *Request) GetUrlTag() string
- type ResourceManage
- type Scheduler
- type Spider
- func (self *Spider) AddPipeline(p Pipeline) *Spider
- func (self *Spider) AddRequest(req *Request) *Spider
- func (self *Spider) AddRequests(reqs []*Request) *Spider
- func (self *Spider) Close()
- func (self *Spider) CloseFileLog() *Spider
- func (self *Spider) CloseStrace() *Spider
- func (self *Spider) Get(req *Request) *PageItems
- func (self *Spider) GetAll(reqs []*Request) []*PageItems
- func (self *Spider) GetAllByRequest(reqs []*Request) []*PageItems
- func (self *Spider) GetByRequest(req *Request) *PageItems
- func (self *Spider) GetDownloader() Downloader
- func (self *Spider) GetExitWhenComplete() bool
- func (self *Spider) GetScheduler() Scheduler
- func (self *Spider) OpenFileLog(filePath string) *Spider
- func (self *Spider) OpenFileLogDefault() *Spider
- func (self *Spider) OpenStrace() *Spider
- func (self *Spider) Run()
- func (self *Spider) SetDownloader(d Downloader) *Spider
- func (self *Spider) SetExitWhenComplete(e bool) *Spider
- func (self *Spider) SetScheduler(s Scheduler) *Spider
- func (self *Spider) SetSleepTime(sleeptype string, s uint, e uint) *Spider
- func (self *Spider) Taskname() string
- type SpiderOptions
- type Task
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ReadHeaderFromFile ¶
Types ¶
type CollectPipeline ¶
type CollectPipeline interface {
    Pipeline
    // GetCollected returns the result saved temporarily in the process's memory.
    GetCollected() []*PageItems
}
The CollectPipeline interface collects results temporarily in the process's memory.
type CollectPipelinePageItems ¶
type CollectPipelinePageItems struct {
// contains filtered or unexported fields
}
func NewCollectPipelinePageItems ¶
func NewCollectPipelinePageItems() *CollectPipelinePageItems
func (*CollectPipelinePageItems) GetCollected ¶
func (self *CollectPipelinePageItems) GetCollected() []*PageItems
func (*CollectPipelinePageItems) Process ¶
func (self *CollectPipelinePageItems) Process(items *PageItems, t Task)
type Downloader ¶
The Downloader interface. Implement it by providing the Download function, which must return a pointer to a Page instance holding the result downloaded for the given Request.
type Page ¶
type Page struct {
// contains filtered or unexported fields
}
Page represents a crawled entity.
func (*Page) AddTargetRequest ¶
AddTargetRequest adds one new Request waiting to be crawled.
func (*Page) AddTargetRequests ¶
AddTargetRequests adds new Requests waiting to be crawled.
func (*Page) GetBodyStr ¶
GetBodyStr returns the plain string crawled.
func (*Page) GetCookies ¶
GetCookies returns the cookies of the HTTP response.
func (*Page) GetHtmlParser ¶
GetHtmlParser returns the goquery object bound to the crawl result.
func (*Page) GetPageItems ¶
GetPageItems returns the PageItems object that records the KV pairs parsed in the PageProcesser.
func (*Page) GetRequest ¶
GetRequest returns the request object of this page.
func (*Page) GetTargetRequests ¶
GetTargetRequests returns the target requests that will be put into the Scheduler.
func (*Page) ResetHtmlParser ¶
ResetHtmlParser rebuilds and returns the goquery object bound to the crawl result.
func (*Page) SetBodyStr ¶
SetBodyStr saves the plain crawled string in the Page.
func (*Page) SetCookies ¶
SetCookies saves the cookies of the HTTP response.
func (*Page) SetHtmlParser ¶
SetHtmlParser saves the goquery object bound to the crawl result.
func (*Page) SetRequest ¶
SetRequest saves the request object of this page.
type PageItems ¶
type PageItems struct {
// contains filtered or unexported fields
}
PageItems represents an entity that saves the result parsed by the PageProcesser and output at the end.
func NewPageItems ¶
NewPageItems returns an initialized PageItems object.
func (*PageItems) GetRequest ¶
GetRequest returns the request of the PageItems.
type PageProcesser ¶
type PageProcesser interface {
    Process(p *Page)
    Finish()
}
PageProcesser is the interface for handling a page after it is downloaded; developers must implement it themselves.
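The shape of such an implementation can be sketched self-contained; the `Page` below is a simplified stand-in for the library type (real processers would call p.GetHtmlParser() and goquery instead of raw string scanning), and `TitleProcesser` is a hypothetical example:

```go
package main

import (
	"fmt"
	"strings"
)

// Simplified stand-in for the library's Page: a body string plus a
// KV map playing the role of PageItems.
type Page struct {
	body  string
	items map[string]string
}

func (p *Page) GetBodyStr() string   { return p.body }
func (p *Page) AddField(k, v string) { p.items[k] = v }

// TitleProcesser has the PageProcesser shape: Process handles each
// downloaded page, Finish runs once when the spider is done.
type TitleProcesser struct{}

func (t *TitleProcesser) Process(p *Page) {
	// crude title extraction; a real processer would use goquery
	body := p.GetBodyStr()
	if i := strings.Index(body, "<title>"); i >= 0 {
		if j := strings.Index(body, "</title>"); j > i {
			p.AddField("title", body[i+len("<title>"):j])
		}
	}
}

func (t *TitleProcesser) Finish() {
	fmt.Println("spider finished")
}

func main() {
	p := &Page{body: "<html><title>demo</title></html>", items: map[string]string{}}
	(&TitleProcesser{}).Process(p)
	fmt.Println(p.items["title"]) // demo
}
```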
type Pipeline ¶
type Pipeline interface {
    // Process implements result persistence.
    // The items holds the crawled result.
    // The t holds information about this crawl task.
    Process(items *PageItems, t Task)
}
The Pipeline interface can be implemented to customize how results are persisted. It is the final destination of the scraped data; developers implement it themselves, and the pipeline folder contains examples.
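A custom pipeline is a single method. A self-contained sketch with simplified stand-ins for the library's PageItems and Task types (`ConsolePipeline` and `demoTask` are hypothetical names):

```go
package main

import "fmt"

// Simplified stand-ins for the library's PageItems and Task types.
type PageItems struct{ kv map[string]string }
type Task interface{ Taskname() string }

// ConsolePipeline satisfies the Pipeline shape by printing each
// scraped KV pair; a real pipeline would persist them (file, DB, ...).
type ConsolePipeline struct{}

func (c *ConsolePipeline) Process(items *PageItems, t Task) {
	for k, v := range items.kv {
		fmt.Printf("%s: %s=%s\n", t.Taskname(), k, v)
	}
}

type demoTask struct{}

func (demoTask) Taskname() string { return "demo" }

func main() {
	items := &PageItems{kv: map[string]string{"title": "hello"}}
	(&ConsolePipeline{}).Process(items, demoTask{})
}
```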
type Request ¶
type Request struct {
    Url string
    // Response type: html json jsonp text
    RespType string
    // GET POST
    Method string
    // POST data
    Postdata string
    // name for marking the url and distinguishing different urls in PageProcesser and Pipeline
    Urltag string
    // http header
    Header http.Header
    // http cookies
    Cookies []*http.Cookie
    // proxy host, example: 'localhost:80'
    ProxyHost string
    Meta interface{}
}
Request represents an object waiting to be crawled.
func NewRequest ¶
func (*Request) AddHeaderFile ¶
AddHeaderFile points to a JSON file, e.g. xxx.json:
{
    "User-Agent": "curl/7.19.3 (i386-pc-win32) libcurl/7.19.3 OpenSSL/1.0.0d",
    "Referer": "http://weixin.sogou.com/gzh?openid=oIWsFt6Sb7aZmuI98AU7IXlbjJps",
    "Cookie": ""
}
func (*Request) AddProxyHost ¶
@host: proxy host, e.g. http://localhost:8765/
func (*Request) GetBaseUrl ¶
GetBaseUrl returns the URL path: for http://www.79xs.com/Html/Book/147/147144/Index.html it returns http://www.79xs.com/Html/Book/147/147144/.
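That trimming can be reproduced with plain string slicing; `baseURL` below is an illustrative helper, not the library function:

```go
package main

import (
	"fmt"
	"strings"
)

// baseURL drops everything after the last "/" in the URL,
// mirroring what GetBaseUrl is documented to return.
func baseURL(raw string) string {
	i := strings.LastIndex(raw, "/")
	// don't trim the scheme's "//" when there is no path
	if i <= strings.Index(raw, "//")+1 {
		return raw
	}
	return raw[:i+1]
}

func main() {
	fmt.Println(baseURL("http://www.79xs.com/Html/Book/147/147144/Index.html"))
	// http://www.79xs.com/Html/Book/147/147144/
}
```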
func (*Request) GetCookies ¶
func (*Request) GetPostdata ¶
func (*Request) GetProxyHost ¶
func (*Request) GetResponceType ¶
type ResourceManage ¶
type ResourceManage interface {
    // start the resource manager
    Start()
    // free the resource manager
    Free()
    // add a task to the resource manager
    AddTask(func(*Request), *Request)
    // get the amount of resources in the resource manager
    Has() int
}
ResourceManage is the resource-management interface.
type Spider ¶
type Spider struct {
// contains filtered or unexported fields
}
func NewSpider ¶
func NewSpider(options SpiderOptions) *Spider
2016-01-07: Creates a spider project; everything starts here. First you supply the spider's various options: which downloader, which scheduler, which resource manager, which pipelines, and the page processer. A set of ready-made implementations is also provided for reference and use; look for them in the corresponding folders.
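A hedged sketch of what that wiring might look like; the field names follow the SpiderOptions definition below, while `MyProcesser` and the choice of concrete implementations are hypothetical and must come from your own code and the repository's folders:

```go
// Hypothetical wiring; MyProcesser is your own PageProcesser
// implementation, and the remaining option fields should be filled
// with implementations from the repository's corresponding folders.
sp := spider.NewSpider(spider.SpiderOptions{
	TaskName:        "demo",
	PageProcesser:   &MyProcesser{},
	MaxGoroutineNum: 10,
	// Downloader, Scheduler, ResourceManage, Pipelines: pick
	// implementations from the corresponding folders
})
sp.Run()
```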
func (*Spider) AddPipeline ¶
func (*Spider) AddRequest ¶
AddRequest adds a Request to the Scheduler.
func (*Spider) AddRequests ¶
func (*Spider) CloseFileLog ¶
CloseFileLog closes the file log.
func (*Spider) CloseStrace ¶
CloseStrace closes strace output.
func (*Spider) GetAllByRequest ¶
GetAllByRequest deals with several requests and returns the PageItems slice.
func (*Spider) GetByRequest ¶
GetByRequest deals with one request and returns its PageItems, honoring the request's other settings.
func (*Spider) GetDownloader ¶
func (self *Spider) GetDownloader() Downloader
func (*Spider) GetExitWhenComplete ¶
func (*Spider) GetScheduler ¶
func (*Spider) OpenFileLog ¶
OpenFileLog initializes the log path and opens the log. Once opened, error info and other useful info from the spider is logged to the file at filePath. Log with mlog.LogInst().LogError("info") or mlog.LogInst().LogInfo("info"). The spider's log is closed by default. The filePath must be an absolute path.
func (*Spider) OpenFileLogDefault ¶
OpenFileLogDefault opens the file log with a default file path like "WD/log/log.2014-9-1".
func (*Spider) OpenStrace ¶
OpenStrace enables strace output of progress info on the screen. The spider's strace is open by default.
func (*Spider) SetDownloader ¶
func (self *Spider) SetDownloader(d Downloader) *Spider
func (*Spider) SetExitWhenComplete ¶
SetExitWhenComplete sets whether the spider exits when every crawl task is done. If you want to keep the spider in memory all the time and add URLs from outside, set it to false.
func (*Spider) SetScheduler ¶
func (*Spider) SetSleepTime ¶
SetSleepTime sets the sleep time after each crawl task, in milliseconds. If sleeptype is "fixed", s is the sleep time and e is unused. If sleeptype is "rand", the sleep time is random between s and e.
type SpiderOptions ¶
type SpiderOptions struct {
    // task name
    TaskName string
    // PageProcesser implementation
    PageProcesser PageProcesser
    // Downloader implementation
    Downloader Downloader
    // Scheduler implementation
    Scheduler Scheduler
    // Pipeline implementations; put the pipeline objects directly into this list
    Pipelines []Pipeline
    // ResourceManage implementation
    ResourceManage ResourceManage
    // maximum number of goroutines, used for the goroutine pool
    MaxGoroutineNum uint
}
SpiderOptions holds the spider's configuration options.