Documentation ¶
Index ¶
- Constants
- type AppState
- type Connector
- type Crawler
- type EdgesSet
- type RMEntry
- type Record
- type RecordManager
- func (rm *RecordManager) AddEdge(fromURL string, toURL string) error
- func (rm *RecordManager) AddRecord(entry RMEntry)
- func (rm *RecordManager) Count() int
- func (rm *RecordManager) Dump() map[string]Record
- func (rm *RecordManager) Exists(rawURL string) bool
- func (rm *RecordManager) Get(rawURL string) (Record, bool)
- func (rm *RecordManager) LoadFromReader(r io.Reader) error
- func (rm *RecordManager) SaveToWriter(w io.Writer, indent bool) error
- func (rm *RecordManager) Update(rawURL string, statusCode int, err error) error
- type Result
- type StatsCLIOutWriter
- func (sm *StatsCLIOutWriter) AddErrorEntry(value string)
- func (sm *StatsCLIOutWriter) AddLatencySample(value time.Duration)
- func (sm *StatsCLIOutWriter) IncDecDepth(value int)
- func (sm *StatsCLIOutWriter) IncDecErrorsCount(value int)
- func (sm *StatsCLIOutWriter) IncDecLinksCount(value int)
- func (sm *StatsCLIOutWriter) IncDecLinksInQueue(value int)
- func (sm *StatsCLIOutWriter) IncDecTotalRequestsCount(value int)
- func (sm *StatsCLIOutWriter) IncDecWorkersRunning(value int)
- func (sm *StatsCLIOutWriter) RunOutputFlusher()
- func (sm *StatsCLIOutWriter) SetAppState(state AppState)
- func (sm *StatsCLIOutWriter) SetDepth(value int)
- func (sm *StatsCLIOutWriter) SetErrorsCount(value int)
- func (sm *StatsCLIOutWriter) SetLinksCount(value int)
- func (sm *StatsCLIOutWriter) SetLinksInQueue(value int)
- func (sm *StatsCLIOutWriter) SetTotalRequestsCount(value int)
- func (sm *StatsCLIOutWriter) SetWorkersRunning(value int)
- type StatsManager
- type Task
- type URLEntity
- type WebClient
Constants ¶
const ( // AppState_Unknown represents the 'unknown' state. AppState_Unknown = iota // AppState_IDLE represents the 'idle' state. AppState_IDLE // AppState_Running represents the 'run' state. AppState_Running // AppState_Finished represents the 'finish' state. AppState_Finished )
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AppState ¶
type AppState int
AppState represents the current state of the App.
type Connector ¶
type Connector interface {
GetLinks(rawURL string) (statusCode int, links []URLEntity, latency time.Duration, err error)
}
Connector describes the connector interface.
type Crawler ¶
type Crawler struct { // Read-only vars InitialURL string Stats bool ShowErrors bool WorkersCount int Depth int StayInSubdomain bool TreeMode bool SubDomain string Retry int // contains filtered or unexported fields }
Crawler brings everything together and is responsible for starting goroutines and managing them.
func NewCrawler ¶
func NewCrawler(connector Connector, initialURL string, retry int, linksWriter io.Writer, stats bool, showErrors bool, stayinsubdomain bool, treemode bool, workersCount int, depth int) (*Crawler, error)
NewCrawler returns a new Crawler.
func (*Crawler) Merger ¶
Merger gets the results (links) from the workers and keeps all the relevant information, feeding the new links to workers via another channel.
func (*Crawler) StatsWriter ¶
StatsWriter writes stats to an io.Writer (e.g. os.Stdout).
type EdgesSet ¶
type EdgesSet map[int]struct{}
func NewEdgesSet ¶
func NewEdgesSet() EdgesSet
func (EdgesSet) MarshalJSON ¶
func (*EdgesSet) UnmarshalJSON ¶
type Record ¶
type Record struct { // Index allows easy referencing of records (used in the edges) Index int `json:"index"` // This indicates whether this is the start of the graph // i.e., URL provided. InitPoint bool `json:"initPoint"` URL string `json:"url"` Host string `json:"host"` Depth int `json:"depth"` // Edges []uint `json:"edges"` // This is supposed to be mimicking a hashset // We use a struct as a value as it's a bit more space efficient Edges EdgesSet `json:"edges"` StatusCode int `json:"statusCode"` ErrString string `json:"errString,omitempty"` }
Record represents an entry in the RecordManager (internal state).
type RecordManager ¶
type RecordManager struct { // Keeps a table of Records. Key is the URL (scheme,authority,path,query) Records map[string]Record IndexCount int }
RecordManager keeps track of links visited and some metadata like depth level and its children.
func NewRecordManager ¶
func NewRecordManager() *RecordManager
NewRecordManager returns a new Record Manager.
func (*RecordManager) AddEdge ¶
func (rm *RecordManager) AddEdge(fromURL string, toURL string) error
AddEdge adds a new edge to a record if not already present.
func (*RecordManager) AddRecord ¶
func (rm *RecordManager) AddRecord(entry RMEntry)
AddRecord adds a record to the RecordManager.
func (*RecordManager) Count ¶
func (rm *RecordManager) Count() int
Count counts the number of records.
func (*RecordManager) Dump ¶
func (rm *RecordManager) Dump() map[string]Record
Dump returns all records in the RecordManager.
func (*RecordManager) Exists ¶
func (rm *RecordManager) Exists(rawURL string) bool
Exists checks whether this URL exists in the table.
func (*RecordManager) Get ¶
func (rm *RecordManager) Get(rawURL string) (Record, bool)
Get returns a record from the Record Manager.
func (*RecordManager) LoadFromReader ¶
func (rm *RecordManager) LoadFromReader(r io.Reader) error
LoadFromReader reads the records from a Reader in JSON format. An *os.File can be passed to read from a file.
func (*RecordManager) SaveToWriter ¶
func (rm *RecordManager) SaveToWriter(w io.Writer, indent bool) error
SaveToWriter dumps the records map into a Writer in JSON format. An *os.File can be passed to write to a file.
type Result ¶
type Result struct { ParentURL string StatusCode int Links []URLEntity // Depth of the ParentURL Depth int Err error }
Result is what workers return in a channel.
type StatsCLIOutWriter ¶
type StatsCLIOutWriter struct {
// contains filtered or unexported fields
}
StatsCLIOutWriter keeps track of stats and writes up-to-date stats to a writer.
func NewStatsCLIOutWriter ¶
func NewStatsCLIOutWriter(writer io.Writer, showErrors bool, totalWorkersCount int, depth int) *StatsCLIOutWriter
NewStatsCLIOutWriter returns a new StatsCLIOutWriter.
func (*StatsCLIOutWriter) AddErrorEntry ¶
func (sm *StatsCLIOutWriter) AddErrorEntry(value string)
func (*StatsCLIOutWriter) AddLatencySample ¶
func (sm *StatsCLIOutWriter) AddLatencySample(value time.Duration)
func (*StatsCLIOutWriter) IncDecDepth ¶
func (sm *StatsCLIOutWriter) IncDecDepth(value int)
func (*StatsCLIOutWriter) IncDecErrorsCount ¶
func (sm *StatsCLIOutWriter) IncDecErrorsCount(value int)
func (*StatsCLIOutWriter) IncDecLinksCount ¶
func (sm *StatsCLIOutWriter) IncDecLinksCount(value int)
func (*StatsCLIOutWriter) IncDecLinksInQueue ¶
func (sm *StatsCLIOutWriter) IncDecLinksInQueue(value int)
func (*StatsCLIOutWriter) IncDecTotalRequestsCount ¶
func (sm *StatsCLIOutWriter) IncDecTotalRequestsCount(value int)
func (*StatsCLIOutWriter) IncDecWorkersRunning ¶
func (sm *StatsCLIOutWriter) IncDecWorkersRunning(value int)
func (*StatsCLIOutWriter) RunOutputFlusher ¶
func (sm *StatsCLIOutWriter) RunOutputFlusher()
RunOutputFlusher writes the updated stats to an io.Writer. Run this in a goroutine.
func (*StatsCLIOutWriter) SetAppState ¶
func (sm *StatsCLIOutWriter) SetAppState(state AppState)
func (*StatsCLIOutWriter) SetDepth ¶
func (sm *StatsCLIOutWriter) SetDepth(value int)
func (*StatsCLIOutWriter) SetErrorsCount ¶
func (sm *StatsCLIOutWriter) SetErrorsCount(value int)
func (*StatsCLIOutWriter) SetLinksCount ¶
func (sm *StatsCLIOutWriter) SetLinksCount(value int)
func (*StatsCLIOutWriter) SetLinksInQueue ¶
func (sm *StatsCLIOutWriter) SetLinksInQueue(value int)
func (*StatsCLIOutWriter) SetTotalRequestsCount ¶
func (sm *StatsCLIOutWriter) SetTotalRequestsCount(value int)
func (*StatsCLIOutWriter) SetWorkersRunning ¶
func (sm *StatsCLIOutWriter) SetWorkersRunning(value int)
type StatsManager ¶
type StatsManager interface { SetAppState(state AppState) SetLinksInQueue(value int) IncDecLinksInQueue(value int) SetLinksCount(value int) IncDecLinksCount(value int) SetErrorsCount(value int) IncDecErrorsCount(value int) SetWorkersRunning(value int) IncDecWorkersRunning(value int) SetTotalRequestsCount(value int) IncDecTotalRequestsCount(value int) SetDepth(value int) IncDecDepth(value int) AddLatencySample(value time.Duration) RunOutputFlusher() }
StatsManager represents a tracker of statistics related to the crawler. This interface is unfortunately quite big as it needs to support several operations on the statistics it keeps track of.
type URLEntity ¶
type URLEntity struct { // NetLoc represents the NetLoc portion of the URL NetLoc string // Raw represents the entire URL Raw string }
URLEntity represents a URL.
func ExtractURL ¶
ExtractURL takes any URL and returns a URL string with scheme, authority, and path, ready to be used as a parent URL.
type WebClient ¶
type WebClient struct {
// contains filtered or unexported fields
}
WebClient is responsible for connecting to the links and managing connections to websites. It implements the Connector interface.
func NewWebClient ¶
NewWebClient returns a new WebClient.