crawler

package module
v1.1.5
Published: Oct 11, 2020 License: BSD-3-Clause Imports: 17 Imported by: 0

README

domains-crawler

Domains Project Crawler

Documentation

Constants

const HundredPercent = 100

const (
	OKToCrawlDivisor float64 = 10
)

const (
	One = iota + 1
)

Variables

This section is empty.

Functions

func CrawlURL

func CrawlURL(target string, client CacheAbstraction, profile CrawlingProfile)
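
A minimal usage sketch, assuming the package is imported as crawler, that cacheClient implements CacheAbstraction, and that profile is a populated CrawlingProfile; sketches for both appear under Types below. CrawlURL returns nothing, so the assumption here is that it blocks until the crawl of the single target finishes or the profile's limits stop it.

// Hypothetical call shape; cacheClient and profile are placeholders built
// from the type sketches further down this page.
crawler.CrawlURL("https://example.com", cacheClient, profile)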

func GetMemStats

func GetMemStats() int64
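
The return value is presumably the process's current memory usage in bytes (CrawlingProfile.MaxMemory is documented as bytes). The sketch below only illustrates how it could be combined with the HundredPercent constant and a profile into a pressure percentage; it is not the package's own unexported OKToCrawl logic.

// Assumption: GetMemStats reports bytes currently in use by the process.
usedBytes := crawler.GetMemStats()
pressure := usedBytes * crawler.HundredPercent / profile.MaxMemory // percent of MaxMemory
if pressure >= int64(profile.MaxMemoryPressure) {
	log.Printf("memory pressure at %d%% — backing off", pressure)
}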

func GoodTLD

func GoodTLD(domain string, tldlist, countrylist map[string]string) (value, description string, result bool)
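
A hedged example of the call shape; the map contents are placeholders taken from the commented samples inside CrawlingProfile (GenericTLDList and TLDList), which appear to be the intended arguments.

tlds := map[string]string{"aaa": "generic_aaa"}          // placeholder generic TLDs
countries := map[string]string{"ac": "Ascension Island"} // placeholder country TLDs
value, description, ok := crawler.GoodTLD("example.ac", tlds, countries)
if ok {
	log.Printf("TLD %q accepted: %s", value, description)
}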

func HEADCheck

func HEADCheck(target string, userAgent string) int
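
The int return value is undocumented on this page; a reasonable assumption is that it is the HTTP status code of a HEAD request issued with the given User-Agent.

// Assumption: a 2xx/3xx status means the target is worth a full crawl.
status := crawler.HEADCheck("https://example.com", "domains-crawler-example/0.1")
if status >= 200 && status < 400 {
	// proceed with CrawlURL
}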

func IntSecondsToDuration

func IntSecondsToDuration(seconds int) time.Duration

func RandomSleep added in v1.1.2

func RandomSleep(seconds int)
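
Both helpers work with the whole-second settings in CrawlingProfile; the pairing below is an assumption about their intended use.

// Convert a whole-second profile setting into a time.Duration.
timeout := crawler.IntSecondsToDuration(profile.RequestTimeout) // e.g. 10 -> 10 * time.Second
log.Printf("per-request timeout: %s", timeout)

// Presumably sleeps for a random period of up to RandomDelay seconds (assumption).
crawler.RandomSleep(profile.RandomDelay)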

func SafeDescription

func SafeDescription(description string) (safeDescription string)

func URLConverters

func URLConverters(urlString string) string

Types

type CacheAbstraction

type CacheAbstraction interface {
	HGet(key, field string) (string, error)
	// To be fixed later: HSet should accept value ...interface{}
	HSet(key, field, value string) (int64, error)
	HDel(key string, fields ...string) (int64, error)
	Ping() (string, error)
	Close() error
}
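
The interface mirrors a subset of a Redis-style hash API, so a Redis client wrapper is the obvious production implementation. The in-memory sketch below only shows what satisfying the interface looks like, for example in tests.

// memoryCache is a minimal in-memory stand-in for a Redis hash, useful in tests.
type memoryCache struct {
	data map[string]map[string]string
}

func newMemoryCache() *memoryCache {
	return &memoryCache{data: make(map[string]map[string]string)}
}

func (m *memoryCache) HGet(key, field string) (string, error) {
	return m.data[key][field], nil
}

func (m *memoryCache) HSet(key, field, value string) (int64, error) {
	if m.data[key] == nil {
		m.data[key] = make(map[string]string)
	}
	m.data[key][field] = value
	return 1, nil
}

func (m *memoryCache) HDel(key string, fields ...string) (int64, error) {
	var removed int64
	for _, f := range fields {
		if _, ok := m.data[key][f]; ok {
			delete(m.data[key], f)
			removed++
		}
	}
	return removed, nil
}

func (m *memoryCache) Ping() (string, error) { return "PONG", nil }
func (m *memoryCache) Close() error          { return nil }

// Compile-time check that the sketch satisfies the interface.
var _ crawler.CacheAbstraction = (*memoryCache)(nil)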

type CrawlingProfile

type CrawlingProfile struct {
	DomainWhitelist []string
	// DomainWhitelist = []string{
	// "tumblr.com",
	GenericTLDList map[string]string
	// GenericTLDList = map[string]string{
	// "aaa": "generic_aaa", "aarp": "generic_aarp",
	TLDList map[string]string
	// TLDList = map[string]string{
	//	"ac": "Ascension Island",
	SecondLevelWhiteList []string
	// SecondLevelWhiteList = []string{
	//		"com",
	//		"net",
	AllowedPorts map[string]string
	// 	AllowedPorts = map[string]string{
	//		"80":   "1",
	//		"81":   "1",
	DisallowedPorts map[string]string
	// DisallowedPorts = map[string]string{
	//		// SMTP
	//		"25":   "1",
	//		"465":  "1",
	//		"587":  "1",
	PermanentDomainBlacklist map[string]string
	// PermanentDomainBlacklist = map[string]string{
	//		"localhost": "1",
	//		"127.0.0.1": "1",
	BlockedTLDs map[string]string
	// BlockedTLDs = map[string]string{
	//		"facebook":     "1",
	ConcurrentRequests       int
	RandomDelay              int
	CheckEvery               int
	RequestTimeout           int
	MaxRunTime               int
	MaxResults               int
	MaxDomainsPerParent      int
	MaxTimeToDiscoverDomains int
	GracePeriod              int
	// Bytes
	MaxMemory int64
	// in %
	MaxMemoryPressure int
	// Both HEAD and GET
	UserAgent string
}
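
A hypothetical profile assembled from the commented samples above; every concrete value is a placeholder rather than a recommendation from the package.

profile := crawler.CrawlingProfile{
	DomainWhitelist:          []string{"tumblr.com"},
	GenericTLDList:           map[string]string{"aaa": "generic_aaa", "aarp": "generic_aarp"},
	TLDList:                  map[string]string{"ac": "Ascension Island"},
	SecondLevelWhiteList:     []string{"com", "net"},
	AllowedPorts:             map[string]string{"80": "1", "81": "1"},
	DisallowedPorts:          map[string]string{"25": "1", "465": "1", "587": "1"},
	PermanentDomainBlacklist: map[string]string{"localhost": "1", "127.0.0.1": "1"},
	BlockedTLDs:              map[string]string{"facebook": "1"},
	ConcurrentRequests:       2,
	RandomDelay:              5,
	CheckEvery:               10,
	RequestTimeout:           15,
	MaxRunTime:               600,
	MaxResults:               1000,
	MaxDomainsPerParent:      100,
	MaxTimeToDiscoverDomains: 60,
	GracePeriod:              5,
	MaxMemory:                512 << 20, // 512 MiB, in bytes
	MaxMemoryPressure:        80,        // percent
	UserAgent:                "domains-crawler-example/0.1",
}

The resulting profile can then be passed to CrawlURL or NewCrawler.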

type DomainsCrawler

type DomainsCrawler struct {
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(cache *memcache.CacheType,
	client CacheAbstraction,
	profile CrawlingProfile) (crawler *DomainsCrawler)

func (*DomainsCrawler) CountDomainsParent

func (dc *DomainsCrawler) CountDomainsParent(domain string, client CacheAbstraction) (cnt int64)

func (*DomainsCrawler) Elapsed

func (dc *DomainsCrawler) Elapsed() int64

func (*DomainsCrawler) GetRobots added in v1.0.7

func (dc *DomainsCrawler) GetRobots() error

func (*DomainsCrawler) OKToCrawl

func (dc *DomainsCrawler) OKToCrawl() bool

func (*DomainsCrawler) OnRequest

func (dc *DomainsCrawler) OnRequest(r *colly.Request)

func (*DomainsCrawler) RunTicker

func (dc *DomainsCrawler) RunTicker()

func (*DomainsCrawler) SetTarget

func (dc *DomainsCrawler) SetTarget(target string)

func (*DomainsCrawler) Stop

func (dc *DomainsCrawler) Stop()
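
A speculative lifecycle sketch tying the DomainsCrawler methods together. The *memcache.CacheType value is left as a pre-built variable because its constructor belongs to an imported package not documented on this page, and the call order is an assumption drawn from the method names.

var localCache *memcache.CacheType // built by the memcache package this module imports (not shown here)

dc := crawler.NewCrawler(localCache, cacheClient, profile)
dc.SetTarget("https://example.com")

if err := dc.GetRobots(); err != nil {
	log.Printf("robots.txt fetch failed: %v", err)
}

if dc.OKToCrawl() {
	go dc.RunTicker() // assumption: the ticker enforces MaxRunTime and memory limits in the background
	// ... run the crawl ...
	log.Printf("elapsed: %ds", dc.Elapsed()) // assumption: Elapsed reports seconds since the crawl started
	dc.Stop()
}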

type RoboTester added in v1.0.7

type RoboTester struct {
	// contains filtered or unexported fields
}

func NewRoboTester added in v1.0.7

func NewRoboTester(fullURL, userAgent string) (*RoboTester, error)

func (*RoboTester) GetRobots added in v1.0.7

func (rt *RoboTester) GetRobots(path string) (robots *robotstxt.RobotsData, err error)

func (*RoboTester) Test added in v1.0.7

func (rt *RoboTester) Test(path string) bool
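
A small sketch of the RoboTester flow; the assumption (flagged in the comment) is that Test reports whether the given path is allowed for the configured user agent.

rt, err := crawler.NewRoboTester("https://example.com", "domains-crawler-example/0.1")
if err != nil {
	log.Fatalf("robots.txt setup failed: %v", err)
}
// Assumption: Test returns true when robots.txt permits crawling the path.
if rt.Test("/some/path") {
	// safe to fetch the page
}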

type TargetMetaData

type TargetMetaData struct {
	CrawledTS int64 `json:"crawled_ts"`
}
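
TargetMetaData is the only documented value shape with a JSON tag, so it is presumably what gets stored in the cache for each crawled target (an assumption). Marshalling it is straightforward:

meta := crawler.TargetMetaData{CrawledTS: time.Now().Unix()}
payload, err := json.Marshal(meta) // {"crawled_ts":<unix seconds>}
if err != nil {
	log.Fatal(err)
}
// Hypothetical storage call through the CacheAbstraction client.
_, _ = cacheClient.HSet("example.com", "meta", string(payload))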
