crawler

package
v0.0.0-...-b275a4c Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 14, 2024 License: BSD-3-Clause Imports: 19 Imported by: 0

Documentation

Index

Constants

View Source
// Values of the unexported labels type, numbered consecutively from 0 by
// iota in the order listed (MANIFEST == 0 ... DATASOURCE == 11).
// They are the argument type of the label/fileType/value parameters taken by
// Read, Manager.Compare, Manager.Download, Manager.Write and
// DirectoryCrawler.Write.
// NOTE(review): what each label selects is not visible here; the meanings
// can only be guessed from the names — confirm against the call sites.
const (
	MANIFEST labels = iota
	MERKLETREE
	SIGZAG
	ASSET
	DIFF
	HISTORY
	DIRECTORY
	WEB
	URL
	URLS
	DOWNLOAD
	DATASOURCE
)

Variables

This section is empty.

Functions

func DownloadUrl

func DownloadUrl(config Config)

DownloadUrl downloads from a single URL.

func ValidateExt

func ValidateExt(url string) bool

func ValidateUrl

func ValidateUrl(url string) (string, bool, error)

func WriteDownloadManifest

func WriteDownloadManifest(res []*UrlResult)

WriteDownloadManifest writes the metadata gathered while downloading to a file.

Types

type Config

// Config carries the settings passed to NewDirectoryCrawler, NewWebCrawler,
// DownloadUrl, Manager.Download and Manager.GenerateManifest.
// NOTE(review): the field meanings below are inferred from the names only;
// confirm against the code that reads them.
type Config struct {
	// presumably identifies the crawl root; note it is an int, not a path — TODO confirm
	Root    int
	// presumably the maximum crawl depth — TODO confirm
	Depth   int
	TagFile string
	// presumably the directory output is written to — TODO confirm
	OutDir  string
	// presumably the single URL used by DownloadUrl — TODO confirm
	Url     string
	// a single string, not a slice; presumably a file or list of URLs — TODO confirm
	Urls    string
}

type Crawler

// Crawler is the single-method interface for anything that can run a crawl.
// Both *DirectoryCrawler and WebCrawler declare a Crawl() error method and
// therefore satisfy it.
type Crawler interface {
	// Crawl runs the crawl and reports failure through the returned error.
	Crawl() error
}

type DirectoryCrawler

// DirectoryCrawler holds the state of a directory crawl. It is created by
// NewDirectoryCrawler and is the receiver for Crawl, FileSignature and Write.
// All fields carry JSON tags, so the struct can be serialised as a whole.
type DirectoryCrawler struct {
	// presumably the directory being crawled (the root passed to NewDirectoryCrawler) — TODO confirm
	Dir         string           `json:"dir"`
	// compiled patterns; how they are applied to paths is not visible here
	Regex       []*regexp.Regexp `json:"regex"`
	Conf        Config           `json:"conf"`
	// one raw digest ([]byte) per entry; presumably filled from FileSignature — TODO confirm
	FileDigests [][]byte         `json:"file_digests"`
	Signatures  []*Sig           `json:"signatures"`
}

func NewDirectoryCrawler

func NewDirectoryCrawler(root string, conf *Config) *DirectoryCrawler

NewDirectoryCrawler instantiates a new directory crawler object

func (*DirectoryCrawler) Crawl

func (d *DirectoryCrawler) Crawl() error

func (*DirectoryCrawler) FileSignature

func (d *DirectoryCrawler) FileSignature(path string) []byte

FileSignature generates a hash value for files found in the path

func (*DirectoryCrawler) Write

func (d *DirectoryCrawler) Write(fileType labels) (string, error)

Write writes the manifest to disk.

type Extension

// Extension is an integer enumeration; its Strings method returns a string
// for a value.
// NOTE(review): the names look like file extensions (plus ROBOTS),
// presumably the set checked by ValidateExt — confirm.
type Extension int
// Extension values, numbered consecutively from 0 by iota in the order
// listed (XLSX == 0 ... ROBOTS == 12).
const (
	XLSX Extension = iota
	XLSB
	XLSM
	CSV
	ARFF
	IPYNB
	PARQUET
	ZIP
	BIN
	PDF
	GZ
	TXT
	ROBOTS
)

func (Extension) Strings

func (e Extension) Strings() string

type History

// History pairs one asset name with a list of Sig records for it.
// Manager.Hist holds a slice of these.
type History struct {
	// Asset names the asset the records belong to.
	Asset   string
	// History lists the Sig records collected for Asset; ordering is not
	// established by anything visible here.
	History []Sig
}

type Manager

// Manager is the value returned by Read and the receiver for Compare, Diff,
// Download, GenerateManifest, History and Write.
type Manager struct {
	// Sig holds signature records.
	Sig    []Sig
	// Hist holds per-asset histories.
	Hist   []History
	// Merkle is a node from the hometree package; presumably the root of a
	// Merkle tree built over the signatures — TODO confirm.
	Merkle hometree.Node
}

func Read

func Read(file string, label labels, timeless bool) Manager

Read reads a manifest.

func (*Manager) Compare

func (m *Manager) Compare(file1 string, file2 string, value labels)

Compare checks the equality of two manifests.

func (*Manager) Diff

func (m *Manager) Diff(m1 string, m2 string, timeless bool) []Sig

Diff checks the difference between two manifests.

func (*Manager) Download

func (m *Manager) Download(config Config, label labels)

Download retrieves a file from a URL and generates its SHA256 digest.

func (*Manager) GenerateManifest

func (m *Manager) GenerateManifest(path string, config Config) (string, string, error)

GenerateManifest walks a directory and writes metadata and cryptographic signature to a file

func (*Manager) History

func (m *Manager) History(asset string, args []string)

History tracks the history of an asset across manifests (presumably those named in args; the original sentence is truncated).

func (*Manager) Write

func (m *Manager) Write(label labels)

Write writes the result of interrogating a manifest to disk.

type Sig

// Sig is one signature record: an asset name, its digest and a timestamp.
// In JSON the fields appear under the keys "asset", "sha256" and
// "timestamp".
type Sig struct {
	Asset     string `json:"asset"`
	// Digest is serialised under the key "sha256", not "digest".
	Digest    string `json:"sha256"`
	// Timestamp is kept as a string; its format is not visible here.
	Timestamp string `json:"timestamp"`
}

type SigTimeless

// SigTimeless has the same fields as Sig but keeps the timestamp out of the
// JSON form, so two records for the same asset and digest serialise
// identically regardless of when they were produced.
// In JSON only the keys "asset" and "sha256" are used.
type SigTimeless struct {
	Asset     string `json:"asset"`
	// Digest is serialised under the key "sha256", not "digest".
	Digest    string `json:"sha256"`
	// Timestamp is excluded from JSON in both directions. The tag has to be
	// "-": the previous value "_" is an ordinary key name to encoding/json,
	// so the timestamp was still written out (under the key "_").
	Timestamp string `json:"-"`
}

type UrlResult

// UrlResult is the per-download record returned by CompareDownloadSHA and
// consumed (as a slice of pointers) by WriteDownloadManifest.
// Every field except File is tagged omitempty, so zero values (empty string,
// 0, false) are left out of the JSON output.
type UrlResult struct {
	File      string `json:"file"`
	Sha256    string `json:"sha256,omitempty"`
	Size      int64  `json:"size,omitempty"`
	// Because of omitempty a false Match is not written at all; in the JSON
	// output "no match" and "not compared" look the same.
	Match     bool   `json:"match,omitempty"`
	Timestamp string `json:"timestamp,omitempty"`
}

func CompareDownloadSHA

func CompareDownloadSHA(s Urls, file *os.File, name string) *UrlResult

CompareDownloadSHA compares the SHA256 digest

type Urls

// Urls describes one download entry: a URL plus an optional SHA256 and size.
// It is the first argument of CompareDownloadSHA.
// Despite the plural name, one value holds a single URL.
type Urls struct {
	Url    string `json:"url"`
	// presumably the expected digest that the download is checked against — TODO confirm
	Sha256 string `json:"sha256,omitempty"`
	// Size is a string here, whereas UrlResult.Size is an int64.
	Size   string `json:"size,omitempty"`
}

type Util

// Util groups the manifest operations Compare, Diff, Write and History.
// *Manager declares methods with exactly these signatures and therefore
// satisfies it.
type Util interface {
	// Compare takes two file names and a label; it returns nothing.
	Compare(file1 string, file2 string, value labels)
	// Diff takes two manifest names and a timeless flag and returns the
	// resulting Sig records.
	Diff(m1 string, m2 string, timeless bool) []Sig
	// Write takes the label of what to write; it returns nothing.
	Write(label labels)
	// History takes an asset name and a list of string arguments.
	History(asset string, args []string)
}

type WebCrawler

// WebCrawler is the web counterpart of DirectoryCrawler. It is created by
// NewWebCrawler and is the (value) receiver for Crawl and Download.
type WebCrawler struct {
	// Conf is stored by value although NewWebCrawler takes a *Config;
	// presumably the constructor copies it — TODO confirm.
	Conf Config
}

func NewWebCrawler

func NewWebCrawler(conf *Config) *WebCrawler

NewWebCrawler instantiates a new web crawler object

func (WebCrawler) Crawl

func (w WebCrawler) Crawl() error

func (WebCrawler) Download

func (w WebCrawler) Download(file *os.File, retries int) error

Download manages the direct call to the URL, retrying the download over successive attempts until timeout.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL