Documentation ¶
Index ¶
- Constants
- Variables
- func AddUrlToCrawl(c *gin.Context)
- func ArbitrateConfigs(c *config)
- func Authorize(c *gin.Context) bool
- func DumpHttp(req *UrlRequest)
- func GBKtoUTF(input string) (string, error)
- func GetProfile(c *gin.Context)
- func Help(c *gin.Context)
- func KeyForCrawlByDay() string
- func KeyForUrlStore(url string) string
- func KillMaster(c *gin.Context)
- func LoadConfig()
- func NewCluster()
- func NewNode()
- func Response(c *gin.Context, v interface{})
- func RestartCrawler(c *gin.Context)
- func StartCrawler(c *gin.Context)
- func StopCrawler(c *gin.Context)
- type Cluster
- func (c *Cluster) AddNode(node *NodeInfo)
- func (c *Cluster) BecomeMaster()
- func (c *Cluster) BecomeSlaver()
- func (c *Cluster) Discover()
- func (c *Cluster) GetNode(url string) string
- func (c *Cluster) IsMember(node *NodeInfo) bool
- func (c *Cluster) MonitorMaster()
- func (c *Cluster) PushRequest(req *UrlRequest)
- func (c *Cluster) ResetTimer()
- func (c *Cluster) RestartDistributor()
- func (c *Cluster) StartDistributor()
- func (c *Cluster) StartKeeper()
- func (c *Cluster) StopDistributor()
- func (c *Cluster) StopKeeper()
- func (c *Cluster) StopTheWorld()
- func (c *Cluster) UpdateSlaverStatus(node *NodeInfo, v bool)
- type Crawler
- func (c *Crawler) AddParser(name string, p Parser) *Crawler
- func (c *Crawler) AddSeedUrls(urls ...string) *Crawler
- func (c *Crawler) GetParam(key string) interface{}
- func (c *Crawler) GetParser(name string) Parser
- func (c *Crawler) PostRequest(url, parser string, cookieJar int) *UrlRequest
- func (c *Crawler) Request(url, parser string, cookieJar int) *UrlRequest
- func (c *Crawler) SetDepth(depth int32) *Crawler
- func (c *Crawler) SetInSite(inSite bool) *Crawler
- func (c *Crawler) SetParam(key string, value interface{}) *Crawler
- func (c *Crawler) SetRetry(retry int32) *Crawler
- func (c *Crawler) SetStorage(st Storage) *Crawler
- func (c *Crawler) SetTTL(ttl int32) *Crawler
- func (c *Crawler) SetTimeout(to int32) *Crawler
- func (c *Crawler) StartWith(call func() []*UrlRequest) *Crawler
- type CrawlerStatistic
- type DefaultStore
- type Distributor
- type Extractor
- type Fetcher
- type HttpServer
- type Node
- func (n *Node) AddCrawler(c *Crawler)
- func (n *Node) AddRequest(req *UrlRequest)
- func (n *Node) GetCrawler(name string) *Crawler
- func (n *Node) GetName() string
- func (n *Node) GetStatistic() (*Statistic, error)
- func (n *Node) IsMaster() bool
- func (n *Node) RemCrawler(name string)
- func (n *Node) Restart()
- func (n *Node) Start()
- func (n *Node) Stop()
- type NodeInfo
- type Parser
- type RedisClient
- type RedisInstance
- type Reporter
- type RequestChan
- type ResponseChan
- type RpcClient
- func (r *RpcClient) AddClient(node *NodeInfo)
- func (r *RpcClient) Distribute(req *UrlRequest) error
- func (r *RpcClient) Join(local, node *NodeInfo) error
- func (r *RpcClient) KeepAlive(remote *NodeInfo) error
- func (r *RpcClient) RemClient(node *NodeInfo)
- func (r *RpcClient) ReportRequest(req *UrlRequest) error
- func (r *RpcClient) SyncStatistic(node *NodeInfo) (*Statistic, error)
- type RpcServer
- func (r *RpcServer) Distribute(req *UrlRequest) error
- func (r *RpcServer) Join(remote *NodeInfo) error
- func (r *RpcServer) KeepAlive(remote *NodeInfo) error
- func (r *RpcServer) Register()
- func (r *RpcServer) RegisterType()
- func (r *RpcServer) ReportRequest(req *UrlRequest) error
- func (r *RpcServer) Start()
- func (r *RpcServer) Stop()
- func (r *RpcServer) SyncStatistic(node *NodeInfo) (*Statistic, error)
- type SlaverStatus
- type Statistic
- func (s *Statistic) AddCrawledCount(name string)
- func (s *Statistic) AddCrawlerStatistic(name string) *Statistic
- func (s *Statistic) AddNode(Node *NodeInfo) *Statistic
- func (s *Statistic) AddTotalCount(name string)
- func (s *Statistic) BeginNow() *Statistic
- func (s *Statistic) GetCrawlerStatistic(name string) *CrawlerStatistic
- func (s *Statistic) GetStatistic() *Statistic
- func (s *Statistic) RemCrawlerStatistic(name string) *Statistic
- func (s *Statistic) SetClusterName(name string) *Statistic
- func (s *Statistic) SetMaster(Node *NodeInfo) *Statistic
- func (s *Statistic) Stop() *Statistic
- func (s *Statistic) UpdateNodeAlive(Node *NodeInfo, v bool) *Statistic
- type Storage
- type UrlRequest
- func (u *UrlRequest) Incr() *UrlRequest
- func (u *UrlRequest) SetDepth(depth int32) *UrlRequest
- func (u *UrlRequest) SetHeader(header http.Header) *UrlRequest
- func (u *UrlRequest) SetParams(params string) *UrlRequest
- func (u *UrlRequest) SetProxy(proxy string) *UrlRequest
- func (u *UrlRequest) SetRetry(retry int32) *UrlRequest
- func (u *UrlRequest) ToRequest() (*http.Request, error)
- type UrlResponse
- type Worker
Constants ¶
const (
	KeeperPeriod        = time.Second * 5
	HeartBeatInterval   = time.Millisecond
	MonitorMasterPeriod = time.Second * 12
)
const (
	Pretty        = "pretty"
	StartSuccess  = "Start Successfully"
	StopSuccess   = "Stop Successfully"
	AuthFailed    = "You do not have the authorization!"
	AddSuccess    = "Add Successfully"
	GetStatFailed = "Fail to get Statistic Info"
	Success       = "Success!"
)
const (
	KeyForStore = "url:%s"
	Expiration  = time.Second * 3600 * 24 * 7
)
const (
	STOP    = "stop"
	RUNNING = "running"
)
const (
	KeyForCrawledUrls = "crawledurls"
)
const (
	Message = "You know, for data"
)
Variables ¶
var (
	ErrNotClusterMember = errors.New("Not the cluster member")
	ErrNotMaster        = errors.New("I am not the master, thank you!")
	ErrNoneMaster       = errors.New("Master is not found")
	ErrDistributeUrl    = errors.New("Distribute the url to the wrong node")
)
var Configs = new(config)
var OneCluster sync.Once
var OneNode sync.Once
OneNode guards creation of the singleton DefaultNode; there is only one Node instance per Go process.
var Stat = NewStatistic()
Functions ¶
func AddUrlToCrawl ¶
func AddUrlToCrawl(c *gin.Context)
func ArbitrateConfigs ¶
func ArbitrateConfigs(c *config)
func GetProfile ¶
func GetProfile(c *gin.Context)
func KeyForCrawlByDay ¶
func KeyForCrawlByDay() string
func KeyForUrlStore ¶
func KeyForUrlStore(url string) string
func KillMaster ¶
func KillMaster(c *gin.Context)
func LoadConfig ¶
func LoadConfig()
func NewCluster ¶
func NewCluster()
func RestartCrawler ¶
func RestartCrawler(c *gin.Context)
func StartCrawler ¶
func StartCrawler(c *gin.Context)
func StopCrawler ¶
func StopCrawler(c *gin.Context)
Types ¶
type Cluster ¶
type Cluster struct {
	ClusterName string
	Local       *Node
	Master      *NodeInfo
	// Nodes describes the slavers' status: if true, the slaver is
	// active; otherwise, the slaver is down.
	Nodes map[*NodeInfo]bool
	// contains filtered or unexported fields
}
var DefaultCluster *Cluster
func GetClusterInstance ¶
func GetClusterInstance() *Cluster
func (*Cluster) BecomeMaster ¶
func (c *Cluster) BecomeMaster()
BecomeMaster makes the current node the master and starts up the tasks that belong to the master.
func (*Cluster) BecomeSlaver ¶
func (c *Cluster) BecomeSlaver()
The functions below are for slavers.
func (*Cluster) Discover ¶
func (c *Cluster) Discover()
Discover scans the node list and calls the Join RPC method on each node. If the call returns an error, that remote node is not the master; otherwise the master is set to that node. If no node in the list is the master, the current node makes itself master.
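A minimal sketch of that flow, using the documented RpcClient.Join signature; the candidates list and the self-promotion fallback are assumptions for illustration:

	// Discovery sketch: probe each candidate with Join; the node that
	// accepts the call is the master. If none accepts, promote ourselves.
	func discover(rpc *RpcClient, local *NodeInfo, candidates []*NodeInfo) (master *NodeInfo) {
		for _, node := range candidates {
			if err := rpc.Join(local, node); err == nil {
				return node // the remote accepted us: it is the master
			}
		}
		return local // no master found: the current node takes over
	}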
func (*Cluster) MonitorMaster ¶
func (c *Cluster) MonitorMaster()
MonitorMaster checks heartbeat packets from the master. If no heartbeat arrives for 12 seconds, it stops the world and discovers the new master; when a new master is elected, it restarts the world.
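A minimal watchdog sketch of this behavior; heartbeats is a hypothetical channel fed by the master's keep-alive packets, and the 12-second window is the documented MonitorMasterPeriod:

	func watchMaster(heartbeats <-chan struct{}, onMasterLost func()) {
		timer := time.NewTimer(MonitorMasterPeriod)
		defer timer.Stop()
		for {
			select {
			case <-heartbeats:
				// Heartbeat arrived in time: rearm the watchdog.
				if !timer.Stop() {
					<-timer.C
				}
				timer.Reset(MonitorMasterPeriod)
			case <-timer.C:
				// No heartbeat for 12s: stop the world and rediscover.
				onMasterLost()
				return
			}
		}
	}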
func (*Cluster) PushRequest ¶
func (c *Cluster) PushRequest(req *UrlRequest)
func (*Cluster) ResetTimer ¶
func (c *Cluster) ResetTimer()
func (*Cluster) RestartDistributor ¶
func (c *Cluster) RestartDistributor()
func (*Cluster) StartDistributor ¶
func (c *Cluster) StartDistributor()
func (*Cluster) StartKeeper ¶
func (c *Cluster) StartKeeper()
The master must detect the slavers; if a slaver is down, it is removed. This func is invoked only by the master.
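A sketch of such a keeper loop built on the documented KeeperPeriod; lastSeen and remove are hypothetical helpers standing in for the unexported internals:

	func keeper(nodes []*NodeInfo, lastSeen func(*NodeInfo) time.Time, remove func(*NodeInfo)) {
		ticker := time.NewTicker(KeeperPeriod)
		defer ticker.Stop()
		for range ticker.C {
			for _, n := range nodes {
				// A slaver that missed a whole keep-alive window is down.
				if time.Since(lastSeen(n)) > KeeperPeriod {
					remove(n)
				}
			}
		}
	}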
func (*Cluster) StopDistributor ¶
func (c *Cluster) StopDistributor()
func (*Cluster) StopTheWorld ¶
func (c *Cluster) StopTheWorld()
func (*Cluster) UpdateSlaverStatus ¶
func (c *Cluster) UpdateSlaverStatus(node *NodeInfo, v bool)
type Crawler ¶
type Crawler struct {
	// Name is the unique identifier of the crawler.
	Name string
	// SeedUrls holds the entrance URLs of the websites to crawl.
	SeedUrls []string
	// Depth defines how deep the website is crawled.
	Depth int32
	// If InSite is true, the crawler only crawls pages on the same host.
	InSite bool
	// Timeout is the timeout of each request to the target website.
	Timeout int32
	// When a request fails, it is retried up to Retry times.
	Retry int32
	// TTL is the interval between two URL fetches, used by the fetcher.
	TTL       int32
	ParserMap map[string]Parser
	// MetaData carries extra data for the HTTP request, such as Header
	// and PostForm.
	MetaData map[string]interface{}
	// contains filtered or unexported fields
}
Crawler implements the main work of the node and defines its primitive info. If the current node is a slaver, the node manages three entities: fetcher, extractor, and reporter. If the current node is the master, a distributor is appended as well. A usage sketch follows the builder methods below.
func NewCrawler ¶
func (*Crawler) AddSeedUrls ¶
func (c *Crawler) AddSeedUrls(urls ...string) *Crawler
AddSeedUrls sets the seed URLs of the crawler.
func (*Crawler) PostRequest ¶
func (c *Crawler) PostRequest(url, parser string, cookieJar int) *UrlRequest
func (*Crawler) SetStorage ¶
func (c *Crawler) SetStorage(st Storage) *Crawler
SetStorage customizes the storage strategy.
func (*Crawler) SetTimeout ¶
func (c *Crawler) SetTimeout(to int32) *Crawler
SetTimeout sets the timeout of the crawler.
func (*Crawler) StartWith ¶
func (c *Crawler) StartWith(call func() []*UrlRequest) *Crawler
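The builder methods above return the crawler itself, so configuration chains. A usage sketch; NewCrawler's exact signature is not shown on this page (a single name argument is assumed), and listParser stands for any Parser you define:

	c := NewCrawler("news"). // assumed signature: NewCrawler(name string) *Crawler
		AddSeedUrls("https://example.com").
		SetDepth(3).
		SetInSite(true).
		SetTimeout(10).
		SetRetry(2).
		AddParser("list", listParser)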
type CrawlerStatistic ¶
type CrawlerStatistic struct {
	Name               string `json:"CrawlerName"`
	CrawledUrlsCount   uint64 `json:"CrawledUrlsCount"`
	TotalCount         uint64 `json:"TotalUrlsCount"`
	ToCrawledUrlsCount uint64 `json:"ToCrawledUrlsCount"`
}
CrawlerStatistic describes the crawler's basic info; it is used by the RESTful API to monitor the current state of the crawler.
func NewCrawlerStatistic ¶
func NewCrawlerStatistic(name string) *CrawlerStatistic
type DefaultStore ¶
type DefaultStore struct {
*RedisClient
}
By default, responses are stored in Redis.
func NewDefaultStore ¶
func NewDefaultStore(r *RedisClient) *DefaultStore
func (*DefaultStore) Store ¶
func (d *DefaultStore) Store(resp *UrlResponse)
type Distributor ¶
type Distributor struct {
	Requests RequestChan
	// contains filtered or unexported fields
}
func NewDistributor ¶
func NewDistributor() *Distributor
func (*Distributor) Run ¶
func (r *Distributor) Run()
func (*Distributor) Status ¶
func (r *Distributor) Status() string
func (*Distributor) Stop ¶
func (r *Distributor) Stop()
type Extractor ¶
type Extractor struct {
// contains filtered or unexported fields
}
func NewExtractor ¶
func NewExtractor(in ResponseChan, out RequestChan) *Extractor
type Fetcher ¶
type Fetcher struct {
// contains filtered or unexported fields
}
Fetcher is an executor that fetches UrlRequests and produces UrlResponses.
func NewFetcher ¶
func NewFetcher(in chan []*UrlRequest, out chan []*UrlResponse) *Fetcher
func (*Fetcher) Add ¶
func (f *Fetcher) Add(req *UrlRequest)
type HttpServer ¶
type HttpServer struct {
// contains filtered or unexported fields
}
func NewHttpServer ¶
func NewHttpServer(node *Node) *HttpServer
func (*HttpServer) Register ¶
func (h *HttpServer) Register()
func (*HttpServer) Serve ¶
func (h *HttpServer) Serve()
type Node ¶
type Node struct {
	Info *NodeInfo
	// contains filtered or unexported fields
}
There is only one Node instance per Go process.
var DefaultNode *Node
func GetNodeInstance ¶
func GetNodeInstance() *Node
func (*Node) AddCrawler ¶
func (n *Node) AddCrawler(c *Crawler)
func (*Node) AddRequest ¶
func (n *Node) AddRequest(req *UrlRequest)
func (*Node) GetCrawler ¶
func (n *Node) GetCrawler(name string) *Crawler
func (*Node) GetStatistic ¶
func (n *Node) GetStatistic() (*Statistic, error)
func (*Node) RemCrawler ¶
func (n *Node) RemCrawler(name string)
type Parser ¶
type Parser func(resp *UrlResponse) (urls []*UrlRequest)
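A Parser turns a fetched UrlResponse into follow-up requests. A minimal sketch using only the documented NewUrlRequest signature and the UrlRequest/UrlResponse fields below; the regexp-based link extraction is illustrative and needs the standard regexp package:

	var hrefRe = regexp.MustCompile(`href="(https?://[^"]+)"`)

	// linkParser queues every absolute link found in the body for the
	// same crawler, to be handled again by "linkParser".
	func linkParser(resp *UrlResponse) (urls []*UrlRequest) {
		for _, m := range hrefRe.FindAllStringSubmatch(resp.Body, -1) {
			urls = append(urls, NewUrlRequest(m[1], "GET", resp.Src.Crawler, "linkParser", resp.Src.CookieJar))
		}
		return urls
	}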
type RedisClient ¶
type RedisClient struct {
	Clients map[string]*redis.Client
	// contains filtered or unexported fields
}
var DefaultRedisClient *RedisClient
func GetRedisClient ¶
func GetRedisClient() *RedisClient
func (*RedisClient) AddClient ¶
func (r *RedisClient) AddClient(name string, re *RedisInstance)
func (*RedisClient) GetClient ¶
func (r *RedisClient) GetClient(key string) *redis.Client
type RedisInstance ¶
type Reporter ¶
type Reporter struct {
// contains filtered or unexported fields
}
func NewReporter ¶
func NewReporter(pop RequestChan) *Reporter
type RequestChan ¶
type RequestChan chan []*UrlRequest
see http://gowithconfidence.tumblr.com/post/31426832143/stacked-channels
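The linked post describes the stacked-channel idiom: a buffered channel of slices where a sender pops the pending batch (if any), appends to it, and pushes it back, so a receiver drains the whole backlog in one receive. A sketch, assuming the channel has capacity 1 as in that idiom:

	func push(ch RequestChan, req *UrlRequest) {
		var batch []*UrlRequest
		select {
		case batch = <-ch: // pop the batch already queued
		default: // nothing queued yet: start a new batch
		}
		ch <- append(batch, req)
	}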
func NewRequestChan ¶
func NewRequestChan() RequestChan
type ResponseChan ¶
type ResponseChan chan []*UrlResponse
func NewResponseChan ¶
func NewResponseChan() ResponseChan
type RpcClient ¶
type RpcClient struct {
// contains filtered or unexported fields
}
func NewRpcClient ¶
func NewRpcClient() *RpcClient
func (*RpcClient) Distribute ¶
func (r *RpcClient) Distribute(req *UrlRequest) error
RPC method on the client side, used by the master to distribute a request to the slavers.
func (*RpcClient) ReportRequest ¶
func (r *RpcClient) ReportRequest(req *UrlRequest) error
RPC method on the client side, used by slavers to report new requests to the master.
type RpcServer ¶
type RpcServer struct {
// contains filtered or unexported fields
}
func NewRpcServer ¶
func NewRpcServer() *RpcServer
func (*RpcServer) Distribute ¶
func (r *RpcServer) Distribute(req *UrlRequest) error
RPC method on the server side, used by a slaver to receive a request distributed by the master and add it to the crawler so its content is fetched.
func (*RpcServer) Join ¶
func (r *RpcServer) Join(remote *NodeInfo) error
RPC method on the server side, for either the master or a slaver: if this node is the master, it adds the remote node and returns nil; otherwise it returns an error.
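A sketch of that decision, using only the documented ErrNotMaster variable and the Cluster/Node methods listed above:

	func join(c *Cluster, remote *NodeInfo) error {
		if !c.Local.IsMaster() {
			return ErrNotMaster // caller keeps probing other nodes
		}
		c.AddNode(remote)
		return nil
	}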
func (*RpcServer) Register ¶
func (r *RpcServer) Register()
Register registers all the RPC services; they may be invoked by either the master or the slavers.
func (*RpcServer) RegisterType ¶
func (r *RpcServer) RegisterType()
func (*RpcServer) ReportRequest ¶
func (r *RpcServer) ReportRequest(req *UrlRequest) error
RPC method on the server side, used by the master to receive requests from slavers and store them for distribution.
type SlaverStatus ¶
type Statistic ¶
type Statistic struct {
	ClusterName string                       `json:"ClusterName"`
	Running     bool                         `json:"Running"`
	BeginAt     string                       `json:"Begin At"`
	Elapse      string                       `json:"Elapse"`
	NodeNum     int                          `json:"NodeNumber"`
	Master      *NodeInfo                    `json:"Master"`
	Slavers     []*SlaverStatus              `json:"slavers"`
	Crawler     map[string]*CrawlerStatistic `json:"CrawlerStatistics"`
	Message     string                       `json:"Message"`
}
Statistic holds the statistic information of the cluster. It is updated by the master node; when slavers need it, they must call the SyncStatistic RPC to sync it.
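A hedged slaver-side sync example using the documented RpcClient.SyncStatistic signature; rpc and masterInfo stand for your client and the current master's NodeInfo, and the standard log package is assumed:

	stat, err := rpc.SyncStatistic(masterInfo)
	if err != nil {
		log.Printf("sync statistic failed: %v", err)
	} else {
		log.Printf("cluster %s: %d nodes", stat.ClusterName, stat.NodeNum)
	}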
func NewStatistic ¶
func NewStatistic() *Statistic
func (*Statistic) AddCrawledCount ¶
func (s *Statistic) AddCrawledCount(name string)
func (*Statistic) AddCrawlerStatistic ¶
func (s *Statistic) AddCrawlerStatistic(name string) *Statistic
func (*Statistic) AddTotalCount ¶
func (s *Statistic) AddTotalCount(name string)
func (*Statistic) GetCrawlerStatistic ¶
func (s *Statistic) GetCrawlerStatistic(name string) *CrawlerStatistic
func (*Statistic) GetStatistic ¶
func (s *Statistic) GetStatistic() *Statistic
GetStatistic gets the current info of the crawler cluster; it is always invoked by the master node.
func (*Statistic) RemCrawlerStatistic ¶
func (s *Statistic) RemCrawlerStatistic(name string) *Statistic
func (*Statistic) SetClusterName ¶
func (s *Statistic) SetClusterName(name string) *Statistic
type Storage ¶
type Storage interface {
Store(resp *UrlResponse)
}
You can customize the storage strategy in your application by implementing the Storage interface, as sketched below.
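For example, a minimal sketch of a custom Storage that appends each crawled page to a local file; FileStore is hypothetical, error handling is trimmed, and the standard os and fmt packages are assumed:

	type FileStore struct {
		Path string
	}

	func (f *FileStore) Store(resp *UrlResponse) {
		fd, err := os.OpenFile(f.Path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
		if err != nil {
			return
		}
		defer fd.Close()
		fmt.Fprintf(fd, "%s\n%s\n", resp.Src.Url, resp.Body)
	}

Wire it in with crawler.SetStorage(&FileStore{Path: "crawl.log"}).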
type UrlRequest ¶
type UrlRequest struct {
	Url     string
	Method  string
	Headers http.Header
	// Params holds key-value pairs, URL-encoded.
	Params    string
	Proxy     string
	Node      string
	CookieJar int
	Crawler   string
	Parser    string
	Depth     int32
	Retry     int32
}
func NewUrlRequest ¶
func NewUrlRequest(url, method, crawler, parser string, cookie int) *UrlRequest
func (*UrlRequest) Incr ¶
func (u *UrlRequest) Incr() *UrlRequest
func (*UrlRequest) SetDepth ¶
func (u *UrlRequest) SetDepth(depth int32) *UrlRequest
func (*UrlRequest) SetHeader ¶
func (u *UrlRequest) SetHeader(header http.Header) *UrlRequest
func (*UrlRequest) SetParams ¶
func (u *UrlRequest) SetParams(params string) *UrlRequest
func (*UrlRequest) SetProxy ¶
func (u *UrlRequest) SetProxy(proxy string) *UrlRequest
func (*UrlRequest) SetRetry ¶
func (u *UrlRequest) SetRetry(retry int32) *UrlRequest
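The setters above also chain. A request-building sketch using only signatures listed on this page:

	req := NewUrlRequest("https://example.com/api", "POST", "news", "apiParser", 0).
		SetHeader(http.Header{"Content-Type": []string{"application/x-www-form-urlencoded"}}).
		SetParams("q=go&page=1").
		SetProxy("http://127.0.0.1:8080").
		SetRetry(3)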
type UrlResponse ¶
type UrlResponse struct {
	Src  *UrlRequest
	Resp *http.Response
	Body string
}
func NewResponse ¶
func NewResponse(req *UrlRequest, resp *http.Response) *UrlResponse