Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CrawlerConfig ¶
// CrawlerConfig holds the crawler-level settings. Each field carries a
// `config:"..."` struct tag.
type CrawlerConfig struct {
	// MaxGoRoutine is tagged "max_go_routine".
	// NOTE(review): presumably the upper bound on concurrent goroutines —
	// confirm against the code that reads it.
	MaxGoRoutine int `config:"max_go_routine"`

	// Fetch speed control. The field name indicates the value is expressed
	// in milliseconds.
	FetchThresholdInMs int `config:"fetch_threshold_ms"`
}
func GetDefaultCrawlerConfig ¶
func GetDefaultCrawlerConfig() CrawlerConfig
type RoutingParameter ¶
// RoutingParameter carries a single integer Shard value.
// NOTE(review): presumably the shard a task is routed to — confirm with callers.
type RoutingParameter struct {
	Shard int
}
type TaskConfig ¶
type TaskConfig struct { //walking around pattern LinkUrlExtractRegexStr string `link_extract_pattern` LinkUrlExtractRegex *regexp.Regexp LinkUrlExtractRegexGroupIndex int `link_extract_group` LinkUrlMustContain string LinkUrlMustNotContain string //parsing url pattern,when url match this pattern,gopa will not parse urls from response of this url SkipPageParsePatternStr string `skip_page_parse_pattern` SkipPageParsePattern *regexp.Regexp //fetch url pattern FetchUrlPatternStr string `fetch_url_pattern` FetchUrlPattern *regexp.Regexp FetchUrlMustContain string FetchUrlMustNotContain string //saving pattern SavingUrlPatternStr string `save_url_pattern` SavingUrlPattern *regexp.Regexp SavingUrlMustContain string SavingUrlMustNotContain string //Crawling within domain FollowSameDomain bool `follow_same_domain` FollowSubDomain bool `follow_sub_domain` TaskDataPath string //User Cookie Cookie string //Fetch Speed Control FetchDelayThreshold int TaskDBFilename string `task_db_filename` }
Click to show internal directories.
Click to hide internal directories.