crawler

package
v0.0.0-...-ec3c1f2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 4, 2016 License: Apache-2.0 Imports: 31 Imported by: 1

Documentation

Index

Constants

View Source
// Message strings exported by the package. The name prefix indicates the
// severity of each message: Err (error), Warn (warning), Info (informational)
// and Dbg (debug).
const (
	// HTML parsing and content detection.

	// ErrUnexpectedNodeType reports that a node of an unexpected type was found.
	ErrUnexpectedNodeType = "Unexpected node type"
	// ErrUnexpectedTag reports that an unexpected tag was found.
	ErrUnexpectedTag = "Unexpected tag"
	// ErrEncodingNotFound reports that no encoding was found for the content.
	ErrEncodingNotFound = "Not found encoding for content"
	// ErrBodyNotHTML reports that the body does not contain HTML.
	ErrBodyNotHTML = "Body not HTML"
	// ErrHTMLParse reports an HTML parse failure.
	ErrHTMLParse = "HTML parse error"

	// Reading and decoding the HTTP response.

	// ErrReadGZipResponse reports that the response body could not be read as
	// a gzip archive.
	ErrReadGZipResponse = "Read response body as a gzip archive error"
	// ErrReadResponse reports that the response body could not be read.
	ErrReadResponse = "Read response body error"
	// ErrUnknownContentEncoding reports an unknown value of the
	// "Content-Encoding" HTTP header.
	ErrUnknownContentEncoding = "Unknown http header \"Content-Encoding\""
	// ErrStatusCode reports a wrong status code.
	ErrStatusCode = "Wrong status code"
	// ErrNotFountContentType reports that the headers carry no Content-Type.
	// The identifier is spelled "Fount" (sic); it is exported as-is.
	ErrNotFountContentType = "Not found Content-Type in headers"
	// ErrParseContentType reports a failure to parse the Content-Type header.
	ErrParseContentType = "Parse Content-Type header"
	// ErrRenderHTML reports a failure to render HTML.
	ErrRenderHTML = "Render HTML"

	// Base URL handling and requests.

	// ErrParseBaseURL reports a failure to parse the base URL.
	ErrParseBaseURL = "Parse base URL"
	// ErrResolveBaseURL reports a failure to resolve the base URL by host name
	// (the status code was not 200).
	ErrResolveBaseURL = "Resolve base URL by hostname: StatusCode != 200"
	// ErrGetRequest reports a failed GET request.
	ErrGetRequest = "Get request"
	// ErrReadResponseBody reports a failure to read the response body.
	ErrReadResponseBody = "Read response body"
	// ErrCloseResponseBody reports a failure to close the response body.
	ErrCloseResponseBody = "Close response body"

	// robots.txt construction.

	// ErrCreateRobotsTxtFromDb reports a failure to create robots.txt from
	// database data.
	ErrCreateRobotsTxtFromDb = "Create robots.txt from db data"
	// ErrCreateRobotsTxtFromURL reports a failure to create robots.txt from a
	// URL.
	ErrCreateRobotsTxtFromURL = "Create robots.txt from url"

	// Warning and informational messages.

	// WarnPageNotIndexed notes that the page is not indexed because of a
	// noindex meta tag.
	WarnPageNotIndexed = "Page not indexed (meta tag noindex)"
	// InfoUnsupportedMimeFormat notes an unsupported MIME format.
	InfoUnsupportedMimeFormat = "Unsupported mime format"

	// Debug messages.

	// DbgRequestDuration labels the duration of a request.
	DbgRequestDuration = "Request duration"
	// DbgBodyProcessingDuration labels the duration of body processing.
	DbgBodyProcessingDuration = "Body processing duration"
	// DbgBodySize labels the body size after processing.
	DbgBodySize = "Body size"
)

Variables

This section is empty.

Functions

func NormalizeHostName

func NormalizeHostName(hostName string) string

NormalizeHostName - normalize host name

func NormalizeURL

func NormalizeURL(u *url.URL) string

NormalizeURL - normalize URL

func Run

func Run(logger zap.Logger, baseHosts []string, cnt int) error

Run - start downloading cnt pages

Types

type HTMLMetadata

// HTMLMetadata holds metadata extracted from an HTML page.
type HTMLMetadata struct {
	// URLs is keyed by URL; each value is the host ID for that URL
	// ([URL]hostID). The ID is a sql.NullInt64, so it may be NULL.
	URLs         map[string]sql.NullInt64
	MetaTagIndex bool // NOTE(review): presumably false when the page has a noindex meta tag — confirm
	// contains filtered or unexported fields
}

HTMLMetadata - metadata extracted from HTML

func NewHTMLMetadata

func NewHTMLMetadata(hostMng *hostsManager, urlStr string) (*HTMLMetadata, error)

NewHTMLMetadata - create new HTMLMetadata struct

func RunDataExtrator

func RunDataExtrator(hostMng *hostsManager, node *html.Node, urlStr string) (*HTMLMetadata, error)

RunDataExtrator - extract URLs and other metadata from page

func (*HTMLMetadata) AddURL

func (h *HTMLMetadata) AddURL(link string)

AddURL - add an unparsed URL

func (*HTMLMetadata) ClearURLs

func (h *HTMLMetadata) ClearURLs()

ClearURLs - remove all URLs

func (*HTMLMetadata) GetTitle

func (h *HTMLMetadata) GetTitle() string

GetTitle - get title

func (*HTMLMetadata) SetTitle

func (h *HTMLMetadata) SetTitle(title string, rewrite bool)

SetTitle - set title

func (*HTMLMetadata) WrongURLsToLog

func (h *HTMLMetadata) WrongURLsToLog(logger zap.Logger)

WrongURLsToLog - write all wrong URLs to the log

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL