Documentation ¶
Overview ¶
Package protofiles is a generated protocol buffer package.
It is generated from these files:
protofiles/ideacrawler.proto
It has these top-level messages:
Status KVP DomainOpt Subscription PageRequest PageHTML
Index ¶
- Variables
- func RegisterIdeaCrawlerServer(s *grpc.Server, srv IdeaCrawlerServer)
- type DomainOpt
- func (*DomainOpt) Descriptor() ([]byte, []int)
- func (m *DomainOpt) GetCallbackUrlRegexp() string
- func (m *DomainOpt) GetCallbackXpathMatch() []*KVP
- func (m *DomainOpt) GetCallbackXpathRegexp() []*KVP
- func (m *DomainOpt) GetCancelOnDisconnect() bool
- func (m *DomainOpt) GetCheckContent() bool
- func (m *DomainOpt) GetCheckLoginAfterEachPage() bool
- func (m *DomainOpt) GetChrome() bool
- func (m *DomainOpt) GetChromeBinary() string
- func (m *DomainOpt) GetDepth() int32
- func (m *DomainOpt) GetDomLoadTime() int32
- func (m *DomainOpt) GetDomainDropPriority() bool
- func (m *DomainOpt) GetDropDomains() []string
- func (m *DomainOpt) GetFirstrun() *google_protobuf.Timestamp
- func (m *DomainOpt) GetFollowOtherDomains() bool
- func (m *DomainOpt) GetFollowUrlRegexp() string
- func (m *DomainOpt) GetFrequency() *google_protobuf1.Duration
- func (m *DomainOpt) GetImpolite() bool
- func (m *DomainOpt) GetKeepDomains() []string
- func (m *DomainOpt) GetLogin() bool
- func (m *DomainOpt) GetLoginJS() string
- func (m *DomainOpt) GetLoginParseFields() bool
- func (m *DomainOpt) GetLoginParseXpath() []*KVP
- func (m *DomainOpt) GetLoginPayload() []*KVP
- func (m *DomainOpt) GetLoginSuccessCheck() *KVP
- func (m *DomainOpt) GetLoginUrl() string
- func (m *DomainOpt) GetLoginUsingSelenium() bool
- func (m *DomainOpt) GetMaxConcurrentRequests() int32
- func (m *DomainOpt) GetMaxDelay() int32
- func (m *DomainOpt) GetMaxIdleTime() int32
- func (m *DomainOpt) GetMinDelay() int32
- func (m *DomainOpt) GetNetworkIface() string
- func (m *DomainOpt) GetNoFollow() bool
- func (m *DomainOpt) GetPrefetch() bool
- func (m *DomainOpt) GetRepeat() bool
- func (m *DomainOpt) GetSeedUrl() string
- func (m *DomainOpt) GetUnsafeNormalizeURL() bool
- func (m *DomainOpt) GetUseragent() string
- func (*DomainOpt) ProtoMessage()
- func (m *DomainOpt) Reset()
- func (m *DomainOpt) String() string
- type IdeaCrawlerClient
- type IdeaCrawlerServer
- type IdeaCrawler_AddDomainAndListenClient
- type IdeaCrawler_AddDomainAndListenServer
- type IdeaCrawler_AddPagesClient
- type IdeaCrawler_AddPagesServer
- type KVP
- type PageHTML
- func (*PageHTML) Descriptor() ([]byte, []int)
- func (m *PageHTML) GetContent() []byte
- func (m *PageHTML) GetError() string
- func (m *PageHTML) GetHttpstatuscode() int32
- func (m *PageHTML) GetMetaStr() string
- func (m *PageHTML) GetSub() *Subscription
- func (m *PageHTML) GetSuccess() bool
- func (m *PageHTML) GetUrl() string
- func (*PageHTML) ProtoMessage()
- func (m *PageHTML) Reset()
- func (m *PageHTML) String() string
- type PageReqType
- type PageRequest
- func (*PageRequest) Descriptor() ([]byte, []int)
- func (m *PageRequest) GetJs() string
- func (m *PageRequest) GetMetaStr() string
- func (m *PageRequest) GetNoCallback() bool
- func (m *PageRequest) GetReqtype() PageReqType
- func (m *PageRequest) GetSub() *Subscription
- func (m *PageRequest) GetUrl() string
- func (*PageRequest) ProtoMessage()
- func (m *PageRequest) Reset()
- func (m *PageRequest) String() string
- type Status
- type SubType
- type Subscription
- func (*Subscription) Descriptor() ([]byte, []int)
- func (m *Subscription) GetDatetime() *google_protobuf.Timestamp
- func (m *Subscription) GetDomainname() string
- func (m *Subscription) GetSeqnum() int32
- func (m *Subscription) GetSubcode() string
- func (m *Subscription) GetSubtype() SubType
- func (*Subscription) ProtoMessage()
- func (m *Subscription) Reset()
- func (m *Subscription) String() string
Constants ¶
This section is empty.
Variables ¶
View Source
// PageReqType_name maps each numeric PageReqType value to its enum name
// (0 "GET", 1 "HEAD", 2 "BUILTINJS", 3 "JSCRIPT"). It is the inverse of
// PageReqType_value.
var PageReqType_name = map[int32]string{
0: "GET",
1: "HEAD",
2: "BUILTINJS",
3: "JSCRIPT",
}
View Source
// PageReqType_value maps each PageReqType enum name to its numeric value.
// It is the inverse of PageReqType_name.
var PageReqType_value = map[string]int32{
"GET": 0,
"HEAD": 1,
"BUILTINJS": 2,
"JSCRIPT": 3,
}
View Source
// SubType_name maps each numeric SubType value to its enum name
// (0 "SEQNUM", 1 "DATETIME"). It is the inverse of SubType_value.
var SubType_name = map[int32]string{
0: "SEQNUM",
1: "DATETIME",
}
View Source
// SubType_value maps each SubType enum name to its numeric value.
// It is the inverse of SubType_name.
var SubType_value = map[string]int32{
"SEQNUM": 0,
"DATETIME": 1,
}
Functions ¶
func RegisterIdeaCrawlerServer ¶
func RegisterIdeaCrawlerServer(s *grpc.Server, srv IdeaCrawlerServer)
Types ¶
type DomainOpt ¶
type DomainOpt struct { SeedUrl string `protobuf:"bytes,1,opt,name=seedUrl" json:"seedUrl,omitempty"` // crawl delay in seconds MinDelay int32 `protobuf:"varint,2,opt,name=minDelay" json:"minDelay,omitempty"` MaxDelay int32 `protobuf:"varint,3,opt,name=maxDelay" json:"maxDelay,omitempty"` // don't follow any pages, just send back responses for the received URLs. NoFollow bool `protobuf:"varint,4,opt,name=noFollow" json:"noFollow,omitempty"` // only pages matching reqUrlRegexp will be shipped back to the client. // only matching pages will be saved to s3 as well. CallbackUrlRegexp string `protobuf:"bytes,5,opt,name=callbackUrlRegexp" json:"callbackUrlRegexp,omitempty"` // only pages matching followUrlRegexp will be followed and sublinks added to fetcher. FollowUrlRegexp string `protobuf:"bytes,6,opt,name=followUrlRegexp" json:"followUrlRegexp,omitempty"` MaxConcurrentRequests int32 `protobuf:"varint,7,opt,name=maxConcurrentRequests" json:"maxConcurrentRequests,omitempty"` // TODO Useragent string `protobuf:"bytes,8,opt,name=useragent" json:"useragent,omitempty"` Impolite bool `protobuf:"varint,9,opt,name=impolite" json:"impolite,omitempty"` // TODO Depth int32 `protobuf:"varint,10,opt,name=depth" json:"depth,omitempty"` // TODO: maybe just remove all scheduling features, immediate jobs only Repeat bool `protobuf:"varint,11,opt,name=repeat" json:"repeat,omitempty"` // needs min limit of 5mins, ideally 1hour Frequency *google_protobuf1.Duration `protobuf:"bytes,12,opt,name=frequency" json:"frequency,omitempty"` // time of first run, if this is saturday 10pm, frequency is 2 weeks. ideally atleast 10 mins away. // it will continue to run at that time every 2 weeks Firstrun *google_protobuf.Timestamp `protobuf:"bytes,13,opt,name=firstrun" json:"firstrun,omitempty"` // Callback check order - // (1) - callbackUrlRegexp // (2) - callbackXpathMatch // (3) - callbackXpathRegexp // Any one has to match. // provide multiple xpaths as keys and expected values as value. 
Pages are // sent back to client only if all values are found in page. CallbackXpathMatch []*KVP `protobuf:"bytes,14,rep,name=callbackXpathMatch" json:"callbackXpathMatch,omitempty"` // TODO keepKeywords and followOtherDomains still need to be implemented // keep page only if these keywords are present // repeated string keepKeywords = 14; // drop pages if these keywords are present CallbackXpathRegexp []*KVP `protobuf:"bytes,15,rep,name=callbackXpathRegexp" json:"callbackXpathRegexp,omitempty"` // in seconds, it is the time to wait for a new // page, before stopping the job; affects workerIdleTTL of fetchbot. // min value is 600, it is also default. MaxIdleTime int32 `protobuf:"varint,16,opt,name=maxIdleTime" json:"maxIdleTime,omitempty"` FollowOtherDomains bool `protobuf:"varint,17,opt,name=followOtherDomains" json:"followOtherDomains,omitempty"` KeepDomains []string `protobuf:"bytes,18,rep,name=keepDomains" json:"keepDomains,omitempty"` DropDomains []string `protobuf:"bytes,19,rep,name=dropDomains" json:"dropDomains,omitempty"` DomainDropPriority bool `protobuf:"varint,20,opt,name=domainDropPriority" json:"domainDropPriority,omitempty"` // safe url normalizations happen by default. below is only for a few unsafe ones. 
// for list of safe normalizations: https://github.com/PuerkitoBio/purell/blob/master/purell.go#L59 // remove index.php, etc, fragments #section, +FlagsUsuallySafeGreedy from above link UnsafeNormalizeURL bool `protobuf:"varint,21,opt,name=unsafeNormalizeURL" json:"unsafeNormalizeURL,omitempty"` Login bool `protobuf:"varint,22,opt,name=login" json:"login,omitempty"` // currently not possible, assumes false LoginUsingSelenium bool `protobuf:"varint,23,opt,name=loginUsingSelenium" json:"loginUsingSelenium,omitempty"` LoginUrl string `protobuf:"bytes,24,opt,name=loginUrl" json:"loginUrl,omitempty"` // for username, password fields, other form data to send on post request LoginPayload []*KVP `protobuf:"bytes,25,rep,name=loginPayload" json:"loginPayload,omitempty"` // if there are hidden fields in the page that need to be scraped before login LoginParseFields bool `protobuf:"varint,26,opt,name=loginParseFields" json:"loginParseFields,omitempty"` // key is key of hidden fields to parse from form, value is the xpath of field to scrape. LoginParseXpath []*KVP `protobuf:"bytes,27,rep,name=loginParseXpath" json:"loginParseXpath,omitempty"` // to check if login succeeded, provide xpath as key, and expected value as value. // for example, after login, xpath of top right corner, and username as value. // if the xpath is not there of if there is no value match, then we probably didn't login. LoginSuccessCheck *KVP `protobuf:"bytes,28,opt,name=loginSuccessCheck" json:"loginSuccessCheck,omitempty"` // checks login state after downloading each page, using check defined in 'loginSuccessCheck' CheckLoginAfterEachPage bool `protobuf:"varint,29,opt,name=checkLoginAfterEachPage" json:"checkLoginAfterEachPage,omitempty"` // javascript for login in chrome browser. 
LoginJS string `protobuf:"bytes,30,opt,name=loginJS" json:"loginJS,omitempty"` // whether to use chrome, location of chrome binary Chrome bool `protobuf:"varint,31,opt,name=chrome" json:"chrome,omitempty"` ChromeBinary string `protobuf:"bytes,32,opt,name=chromeBinary" json:"chromeBinary,omitempty"` DomLoadTime int32 `protobuf:"varint,33,opt,name=domLoadTime" json:"domLoadTime,omitempty"` // check if this network interface is still active before every request. NetworkIface string `protobuf:"bytes,34,opt,name=networkIface" json:"networkIface,omitempty"` // TODO CancelOnDisconnect bool `protobuf:"varint,35,opt,name=cancelOnDisconnect" json:"cancelOnDisconnect,omitempty"` // if true, sends a HEAD request first ensure content is text/html before sending GET request. CheckContent bool `protobuf:"varint,36,opt,name=checkContent" json:"checkContent,omitempty"` // if prefetch flag is true, downloads resources like img, css, png, svg, js associated with the actual page to mimic browser behaviour. Prefetch bool `protobuf:"varint,37,opt,name=prefetch" json:"prefetch,omitempty"` }
func (*DomainOpt) Descriptor ¶
func (*DomainOpt) GetCallbackUrlRegexp ¶
func (*DomainOpt) GetCallbackXpathMatch ¶
func (*DomainOpt) GetCallbackXpathRegexp ¶
func (*DomainOpt) GetCancelOnDisconnect ¶
func (*DomainOpt) GetCheckContent ¶
func (*DomainOpt) GetCheckLoginAfterEachPage ¶
func (*DomainOpt) GetChromeBinary ¶
func (*DomainOpt) GetDomLoadTime ¶
func (*DomainOpt) GetDomainDropPriority ¶
func (*DomainOpt) GetDropDomains ¶
func (*DomainOpt) GetFirstrun ¶
func (m *DomainOpt) GetFirstrun() *google_protobuf.Timestamp
func (*DomainOpt) GetFollowOtherDomains ¶
func (*DomainOpt) GetFollowUrlRegexp ¶
func (*DomainOpt) GetFrequency ¶
func (m *DomainOpt) GetFrequency() *google_protobuf1.Duration
func (*DomainOpt) GetImpolite ¶
func (*DomainOpt) GetKeepDomains ¶
func (*DomainOpt) GetLoginJS ¶
func (*DomainOpt) GetLoginParseFields ¶
func (*DomainOpt) GetLoginParseXpath ¶
func (*DomainOpt) GetLoginPayload ¶
func (*DomainOpt) GetLoginSuccessCheck ¶
func (*DomainOpt) GetLoginUrl ¶
func (*DomainOpt) GetLoginUsingSelenium ¶
func (*DomainOpt) GetMaxConcurrentRequests ¶
func (*DomainOpt) GetMaxDelay ¶
func (*DomainOpt) GetMaxIdleTime ¶
func (*DomainOpt) GetMinDelay ¶
func (*DomainOpt) GetNetworkIface ¶
func (*DomainOpt) GetNoFollow ¶
func (*DomainOpt) GetPrefetch ¶
func (*DomainOpt) GetSeedUrl ¶
func (*DomainOpt) GetUnsafeNormalizeURL ¶
func (*DomainOpt) GetUseragent ¶
func (*DomainOpt) ProtoMessage ¶
func (*DomainOpt) ProtoMessage()
type IdeaCrawlerClient ¶
type IdeaCrawlerClient interface { // rpc AddDomain(DomainOpt) returns (Subscription) {} // rpc AddDomains(stream DomainOpt) returns (stream Subscription) {} AddDomainAndListen(ctx context.Context, in *DomainOpt, opts ...grpc.CallOption) (IdeaCrawler_AddDomainAndListenClient, error) AddPages(ctx context.Context, opts ...grpc.CallOption) (IdeaCrawler_AddPagesClient, error) CancelJob(ctx context.Context, in *Subscription, opts ...grpc.CallOption) (*Status, error) }
func NewIdeaCrawlerClient ¶
func NewIdeaCrawlerClient(cc *grpc.ClientConn) IdeaCrawlerClient
type IdeaCrawlerServer ¶
type IdeaCrawlerServer interface { // rpc AddDomain(DomainOpt) returns (Subscription) {} // rpc AddDomains(stream DomainOpt) returns (stream Subscription) {} AddDomainAndListen(*DomainOpt, IdeaCrawler_AddDomainAndListenServer) error AddPages(IdeaCrawler_AddPagesServer) error CancelJob(context.Context, *Subscription) (*Status, error) }
type IdeaCrawler_AddDomainAndListenClient ¶
type IdeaCrawler_AddDomainAndListenClient interface { Recv() (*PageHTML, error) grpc.ClientStream }
type IdeaCrawler_AddDomainAndListenServer ¶
type IdeaCrawler_AddDomainAndListenServer interface { Send(*PageHTML) error grpc.ServerStream }
type IdeaCrawler_AddPagesClient ¶
type IdeaCrawler_AddPagesClient interface { Send(*PageRequest) error CloseAndRecv() (*Status, error) grpc.ClientStream }
type IdeaCrawler_AddPagesServer ¶
type IdeaCrawler_AddPagesServer interface { SendAndClose(*Status) error Recv() (*PageRequest, error) grpc.ServerStream }
type KVP ¶
// KVP is a string key/value pair message. DomainOpt uses it for its xpath
// match lists and its login payload and check fields.
type KVP struct {
	Key   string `protobuf:"bytes,1,opt,name=key" json:"key,omitempty"`
	Value string `protobuf:"bytes,2,opt,name=value" json:"value,omitempty"`
}
func (*KVP) Descriptor ¶
func (*KVP) ProtoMessage ¶
func (*KVP) ProtoMessage()
type PageHTML ¶
type PageHTML struct { Success bool `protobuf:"varint,1,opt,name=success" json:"success,omitempty"` Error string `protobuf:"bytes,2,opt,name=error" json:"error,omitempty"` Sub *Subscription `protobuf:"bytes,3,opt,name=sub" json:"sub,omitempty"` Url string `protobuf:"bytes,4,opt,name=url" json:"url,omitempty"` Httpstatuscode int32 `protobuf:"varint,5,opt,name=httpstatuscode" json:"httpstatuscode,omitempty"` Content []byte `protobuf:"bytes,6,opt,name=content,proto3" json:"content,omitempty"` MetaStr string `protobuf:"bytes,7,opt,name=metaStr" json:"metaStr,omitempty"` }
func (*PageHTML) Descriptor ¶
func (*PageHTML) GetContent ¶
func (*PageHTML) GetHttpstatuscode ¶
func (*PageHTML) GetMetaStr ¶
func (*PageHTML) GetSub ¶
func (m *PageHTML) GetSub() *Subscription
func (*PageHTML) GetSuccess ¶
func (*PageHTML) ProtoMessage ¶
func (*PageHTML) ProtoMessage()
type PageReqType ¶
// PageReqType enumerates the kinds of request a PageRequest can ask for.
// Its numeric values and names are mirrored in PageReqType_name and
// PageReqType_value.
type PageReqType int32

const (
	PageReqType_GET PageReqType = 0
	// Sends a HEAD request first to identify that the page is text/html before
	// downloading. Use it when we are unsure whether the link will send back a
	// large gzip file, etc., which we want to avoid.
	PageReqType_HEAD      PageReqType = 1
	PageReqType_BUILTINJS PageReqType = 2
	PageReqType_JSCRIPT   PageReqType = 3
)
func (PageReqType) EnumDescriptor ¶
func (PageReqType) EnumDescriptor() ([]byte, []int)
func (PageReqType) String ¶
func (x PageReqType) String() string
type PageRequest ¶
type PageRequest struct { Sub *Subscription `protobuf:"bytes,1,opt,name=sub" json:"sub,omitempty"` Reqtype PageReqType `protobuf:"varint,2,opt,name=reqtype,enum=protofiles.PageReqType" json:"reqtype,omitempty"` Url string `protobuf:"bytes,3,opt,name=url" json:"url,omitempty"` Js string `protobuf:"bytes,4,opt,name=js" json:"js,omitempty"` NoCallback bool `protobuf:"varint,5,opt,name=noCallback" json:"noCallback,omitempty"` MetaStr string `protobuf:"bytes,6,opt,name=metaStr" json:"metaStr,omitempty"` }
func (*PageRequest) Descriptor ¶
func (*PageRequest) Descriptor() ([]byte, []int)
func (*PageRequest) GetJs ¶
func (m *PageRequest) GetJs() string
func (*PageRequest) GetMetaStr ¶
func (m *PageRequest) GetMetaStr() string
func (*PageRequest) GetNoCallback ¶
func (m *PageRequest) GetNoCallback() bool
func (*PageRequest) GetReqtype ¶
func (m *PageRequest) GetReqtype() PageReqType
func (*PageRequest) GetSub ¶
func (m *PageRequest) GetSub() *Subscription
func (*PageRequest) GetUrl ¶
func (m *PageRequest) GetUrl() string
func (*PageRequest) ProtoMessage ¶
func (*PageRequest) ProtoMessage()
func (*PageRequest) Reset ¶
func (m *PageRequest) Reset()
func (*PageRequest) String ¶
func (m *PageRequest) String() string
type Status ¶
// Status is the reply message of AddPages and CancelJob: a success flag plus
// an error string.
type Status struct {
	Success bool   `protobuf:"varint,1,opt,name=success" json:"success,omitempty"`
	Error   string `protobuf:"bytes,2,opt,name=error" json:"error,omitempty"`
}
func (*Status) Descriptor ¶
func (*Status) GetSuccess ¶
func (*Status) ProtoMessage ¶
func (*Status) ProtoMessage()
type Subscription ¶
type Subscription struct { Subcode string `protobuf:"bytes,1,opt,name=subcode" json:"subcode,omitempty"` Domainname string `protobuf:"bytes,2,opt,name=domainname" json:"domainname,omitempty"` Subtype SubType `protobuf:"varint,3,opt,name=subtype,enum=protofiles.SubType" json:"subtype,omitempty"` Seqnum int32 `protobuf:"varint,4,opt,name=seqnum" json:"seqnum,omitempty"` Datetime *google_protobuf.Timestamp `protobuf:"bytes,5,opt,name=datetime" json:"datetime,omitempty"` }
func (*Subscription) Descriptor ¶
func (*Subscription) Descriptor() ([]byte, []int)
func (*Subscription) GetDatetime ¶
func (m *Subscription) GetDatetime() *google_protobuf.Timestamp
func (*Subscription) GetDomainname ¶
func (m *Subscription) GetDomainname() string
func (*Subscription) GetSeqnum ¶
func (m *Subscription) GetSeqnum() int32
func (*Subscription) GetSubcode ¶
func (m *Subscription) GetSubcode() string
func (*Subscription) GetSubtype ¶
func (m *Subscription) GetSubtype() SubType
func (*Subscription) ProtoMessage ¶
func (*Subscription) ProtoMessage()
func (*Subscription) Reset ¶
func (m *Subscription) Reset()
func (*Subscription) String ¶
func (m *Subscription) String() string
Click to show internal directories.
Click to hide internal directories.