crawler

package
v0.0.0-...-f2a16fb
Published: Apr 5, 2016 License: MIT Imports: 10 Imported by: 0

Documentation

Index

Constants

const (
	DOWNLOADER_STATUS_STOP = iota // stop requested
	DOWNLOADER_STATUS_RUNING      // download loop is running
	DOWNLOADER_STATUS_PAUSE       // paused: loop alive but idle
	DOWNLOADER_STATUS_STOPED      // download loop has actually exited
)
const (
	SCRAPY_STATUS_STOP = iota // stop requested
	SCRAPY_STATUS_STOPED      // scrape loop has actually exited
	SCRAPY_STATUS_RUNING      // scrape loop is running
	SCRAPY_STATUS_PAUSE       // paused: loop alive but idle
)

Variables

This section is empty.

Functions

This section is empty.

Types

type Crawler

type Crawler struct {
	SpiderMap     map[string]*base_spider.Spider // all registered spiders
	RequestQuene  *RequestQuene                  // all waiting requests
	ResponseQuene *ResponseQuene                 // all responses waiting to be scraped
	Downloader    *Downloader                    // the download tool
	Scraper       *Scraper                       // the scrape tool
}

Crawler coordinates the spiders, the request and response queues, the downloader, and the scraper.

func NewCrawler

func NewCrawler(resultQuene *ResultQuene, settings *util.Settings) *Crawler

resultQuene is used by the reporter; make sure it is the same pointer.

func (*Crawler) CloseSpider

func (this *Crawler) CloseSpider(spiderName string)

func (*Crawler) GetStartRequest

func (this *Crawler) GetStartRequest(spiderName string) []*http.Request

func (*Crawler) Pause

func (this *Crawler) Pause()

func (*Crawler) Start

func (this *Crawler) Start()

func (*Crawler) StartSpider

func (this *Crawler) StartSpider(spiderName string)

func (*Crawler) Stop

func (this *Crawler) Stop()

func (*Crawler) UnPause

func (this *Crawler) UnPause()
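
A minimal sketch of the crawler lifecycle, assuming this package is imported as crawler, a *util.Settings value has been prepared elsewhere (its fields are not shown in this index), and "example" is a hypothetical key in SpiderMap:

resultQuene := crawler.NewResultQuene()        // shared with the reporter
c := crawler.NewCrawler(resultQuene, settings) // settings: *util.Settings, assumed prepared elsewhere
c.Start()                // start the downloader and scraper
c.StartSpider("example") // begin crawling the spider registered as "example"
c.Pause()                // suspend work without exiting the loops
c.UnPause()              // resume
c.Stop()                 // shut down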

type CrawlerStatus

type CrawlerStatus struct {
	CrawledSpider []*SpiderStatus
	RunningSpider map[string]*SpiderStatus
}

CrawlerStatus tracks the crawler's crawled spiders and running spiders.

func NewCrawlerStatus

func NewCrawlerStatus() *CrawlerStatus

func (*CrawlerStatus) CanWeStop

func (this *CrawlerStatus) CanWeStop(spiderName string) bool

CanWeStop reports whether the named spider can be stopped.

func (*CrawlerStatus) CloseSpider

func (this *CrawlerStatus) CloseSpider(spiderName string) *SpiderStatus

When a spider has no more requests, close it: remove it from RunningSpider and add it to CrawledSpider.

func (*CrawlerStatus) Crawled

func (this *CrawlerStatus) Crawled(spiderName string)

Crawled records a crawl result: Running -1, Crawled +1.

func (*CrawlerStatus) Distribute

func (this *CrawlerStatus) Distribute(spiderName string)

When the cluster distributes a request: Waiting -1, Running +1.

func (*CrawlerStatus) IsSpiderRunning

func (this *CrawlerStatus) IsSpiderRunning(spiderName string) bool

func (*CrawlerStatus) Push

func (this *CrawlerStatus) Push(spiderName string)

Push adds a waiting request for spiderName: Waiting +1.

func (*CrawlerStatus) StartSpider

func (this *CrawlerStatus) StartSpider(spiderName string)

StartSpider adds a spider to the running map. A usage sketch follows below.
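
A minimal sketch of the bookkeeping flow these methods describe, with the counter effects taken from the method comments above:

status := NewCrawlerStatus()
status.StartSpider("example") // register "example" in RunningSpider
status.Push("example")        // a request is queued: Waiting +1
status.Distribute("example")  // the request is handed out: Waiting -1, Running +1
status.Crawled("example")     // its result came back: Running -1, Crawled +1
if status.CanWeStop("example") {
	finished := status.CloseSpider("example") // move to CrawledSpider
	_ = finished                              // *SpiderStatus with the final counters
}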

type Downloader

type Downloader struct {
	Status           int
	RequestQuene     *RequestQuene
	ResponseQuene    *ResponseQuene
	ClientList       []*http.Client
	DownloadInterval int
}

Downloader pops queued requests, downloads them, and pushes the responses for scraping.

func NewDownloader

func NewDownloader(requestQuene *RequestQuene, responseQuene *ResponseQuene, downloadInterval int) *Downloader

func (*Downloader) Download

func (this *Downloader) Download()

Download runs an endless loop: pop a request, download it, and push the response to the response queue.

func (*Downloader) IsStop

func (this *Downloader) IsStop() bool

func (*Downloader) Pause

func (this *Downloader) Pause()

func (*Downloader) Start

func (this *Downloader) Start()

DOWNLOADER_STATUS_STOPED means the download loop has actually exited.

func (*Downloader) Stop

func (this *Downloader) Stop()

func (*Downloader) UnPause

func (this *Downloader) UnPause()
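
A minimal sketch of driving the downloader, assuming Download blocks until stopped (the endless loop described above), that IsStop becomes true once the loop reaches DOWNLOADER_STATUS_STOPED, and that DownloadInterval is in milliseconds (the unit is not documented here):

d := NewDownloader(requestQuene, responseQuene, 1000) // interval assumed to be milliseconds
d.Start()
go d.Download() // loop: pop a request, fetch it, push the response

// ... later, shut down and wait for the loop to exit ...
d.Stop()
for !d.IsStop() { // assumed true once status is DOWNLOADER_STATUS_STOPED
	time.Sleep(10 * time.Millisecond)
}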

type RequestQuene

type RequestQuene struct {
	RequestList *list.List
}

func NewRequestQuene

func NewRequestQuene() *RequestQuene

func (*RequestQuene) IsEmpty

func (this *RequestQuene) IsEmpty() bool

func (*RequestQuene) Pop

func (this *RequestQuene) Pop() *http.Request

func (*RequestQuene) Push

func (this *RequestQuene) Push(request *http.Request)
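
RequestQuene wraps a container/list.List. The implementation is not shown in this index; a minimal sketch of the FIFO pattern such a wrapper typically uses:

func (this *RequestQuene) Push(request *http.Request) {
	this.RequestList.PushBack(request) // enqueue at the tail
}

func (this *RequestQuene) Pop() *http.Request {
	front := this.RequestList.Front()
	if front == nil {
		return nil // empty queue
	}
	this.RequestList.Remove(front)
	return front.Value.(*http.Request) // dequeue from the head
}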

type ResponseQuene

type ResponseQuene struct {
	ResponseList *list.List
}

func NewResponseQuene

func NewResponseQuene() *ResponseQuene

func (*ResponseQuene) Pop

func (this *ResponseQuene) Pop() *http.Response

For now only one goroutine pops responses, so no lock is added.

func (*ResponseQuene) Push

func (this *ResponseQuene) Push(response *http.Response)
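
The Pop comment above relies on a single consuming goroutine. If that ever changed, a mutex-guarded variant would be needed; a hypothetical sketch, not part of this package:

type SafeResponseQuene struct {
	mu           sync.Mutex
	ResponseList *list.List
}

func (this *SafeResponseQuene) Pop() *http.Response {
	this.mu.Lock() // serialize concurrent consumers
	defer this.mu.Unlock()
	front := this.ResponseList.Front()
	if front == nil {
		return nil
	}
	this.ResponseList.Remove(front)
	return front.Value.(*http.Response)
}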

type ResultQuene

type ResultQuene struct {
	ResultList *list.List
}

func NewResultQuene

func NewResultQuene() *ResultQuene

func (*ResultQuene) Pop

func (this *ResultQuene) Pop() *ScrapeResult

func (*ResultQuene) Push

func (this *ResultQuene) Push(scrapeResult *ScrapeResult)

type ScrapeResult

type ScrapeResult struct {
	Request         *http.Request
	CrawlResult     string // empty on success, otherwise the error reason
	ScrapedRequests []*http.Request
}
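
A minimal sketch of a reporter draining the shared ResultQuene, using the CrawlResult convention noted above:

result := resultQuene.Pop()
if result != nil {
	if result.CrawlResult == "" { // empty means success
		for _, req := range result.ScrapedRequests {
			requestQuene.Push(req) // feed discovered links back to the downloader
		}
	} else {
		log.Printf("crawl of %s failed: %s", result.Request.URL, result.CrawlResult)
	}
}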

type Scraper

type Scraper struct {
	Status        int
	ResultQuene   *ResultQuene
	ResponseQuene *ResponseQuene
	SpiderMap     map[string]*spiders.Spider
}

func NewScraper

func NewScraper(resultQuene *ResultQuene, responseQuene *ResponseQuene, spiderMap map[string]*spiders.Spider) *Scraper

func (*Scraper) Pause

func (this *Scraper) Pause()

func (*Scraper) Scrapy

func (this *Scraper) Scrapy()

Scrapy runs an endless loop: pop a response, scrape it, and push any scraped requests to the queue.

func (*Scraper) Start

func (this *Scraper) Start()

func (*Scraper) Stop

func (this *Scraper) Stop()

func (*Scraper) UnPause

func (this *Scraper) UnPause()
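
A minimal sketch of wiring the scraper next to the downloader, assuming both share the same ResponseQuene and a spiderMap has been built elsewhere:

s := NewScraper(resultQuene, responseQuene, spiderMap)
s.Start()
go s.Scrapy() // loop: pop a response, scrape it, push scraped requests

// ...
s.Pause()   // suspend without exiting the loop
s.UnPause() // resume
s.Stop()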

type SpiderStatus

type SpiderStatus struct {
	Name      string
	Crawled   int
	Running   int
	Waiting   int
	StartTime time.Time
	EndTime   time.Time
}

func NewSpiderStatus

func NewSpiderStatus(name string) *SpiderStatus
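
A minimal sketch of summarizing a spider's progress, assuming NewSpiderStatus records StartTime and the counters are kept up to date by CrawlerStatus:

st := NewSpiderStatus("example")
// ... counters updated elsewhere as the crawl proceeds ...
fmt.Printf("%s: crawled=%d running=%d waiting=%d elapsed=%s\n",
	st.Name, st.Crawled, st.Running, st.Waiting, time.Since(st.StartTime))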
