filter

package
v0.11.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 25, 2018 License: Apache-2.0 Imports: 32 Imported by: 3

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type ContentDeduplicationJoint

type ContentDeduplicationJoint struct {
	pipeline.Parameters
}

ContentDeduplicationJoint is used to check the hash of the page body; if a duplicate hash already exists, it will break the pipeline.

func (ContentDeduplicationJoint) Name

func (joint ContentDeduplicationJoint) Name() string

Name returns: content_deduplication

func (ContentDeduplicationJoint) Process

func (joint ContentDeduplicationJoint) Process(c *pipeline.Context) error

Process performs the content hash deduplication

type EmptyJoint

type EmptyJoint struct {
}

EmptyJoint is a placeholder

func (EmptyJoint) Name

func (joint EmptyJoint) Name() string

Name returns empty

func (EmptyJoint) Process

func (joint EmptyJoint) Process(s *pipeline.Context) error

Process does nothing

type ExtractJoint

type ExtractJoint struct {
	pipeline.Parameters
}

func (ExtractJoint) Name

func (joint ExtractJoint) Name() string

func (ExtractJoint) Process

func (joint ExtractJoint) Process(context *pipeline.Context) error

type FetchJoint

type FetchJoint struct {
	pipeline.Parameters
	// contains filtered or unexported fields
}

func (FetchJoint) Name

func (joint FetchJoint) Name() string

func (FetchJoint) Process

func (joint FetchJoint) Process(context *pipeline.Context) error

type FilterCheckJoint

type FilterCheckJoint struct {
	pipeline.Parameters
	//ignore files end with js,css,apk,zip
	SkipPageParsePattern *regexp.Regexp
}

FilterCheckJoint is used to check whether the task URL is already in the filter; if it is not, the URL is added to the task filter so that it won't be added again next time.

func (FilterCheckJoint) Name

func (joint FilterCheckJoint) Name() string

Name returns: filter_check

func (FilterCheckJoint) Process

func (joint FilterCheckJoint) Process(context *pipeline.Context) error

Process performs the filtering and adds the URL to the filter

type HashJoint

type HashJoint struct {
	pipeline.Parameters
}

func (HashJoint) Name

func (joint HashJoint) Name() string

func (HashJoint) Process

func (joint HashJoint) Process(context *pipeline.Context) error

type HtmlToTextJoint

type HtmlToTextJoint struct {
	pipeline.Parameters
}

func (HtmlToTextJoint) Name

func (joint HtmlToTextJoint) Name() string

func (HtmlToTextJoint) Process

func (joint HtmlToTextJoint) Process(context *pipeline.Context) error

type IgnoreTimeoutJoint

type IgnoreTimeoutJoint struct {
	pipeline.Parameters
}

func (IgnoreTimeoutJoint) Name

func (joint IgnoreTimeoutJoint) Name() string

func (IgnoreTimeoutJoint) Process

func (joint IgnoreTimeoutJoint) Process(context *pipeline.Context) error

type IndexJoint

type IndexJoint struct {
}

IndexJoint is used to send snapshot and task info into index

func (IndexJoint) Name

func (joint IndexJoint) Name() string

Name returns index

func (IndexJoint) Process

func (joint IndexJoint) Process(c *pipeline.Context) error

Process wraps the index document and sends it to the queue

type LanguageDetectJoint

type LanguageDetectJoint struct {
}

LanguageDetectJoint is used to detect the language of the webpage

func (LanguageDetectJoint) Name

func (joint LanguageDetectJoint) Name() string

Name returns lang_detect

func (LanguageDetectJoint) Process

func (joint LanguageDetectJoint) Process(c *pipeline.Context) error

Process performs language detection

type ParsePageJoint

type ParsePageJoint struct {
	pipeline.Parameters
}

func (ParsePageJoint) Name

func (joint ParsePageJoint) Name() string

func (ParsePageJoint) Process

func (joint ParsePageJoint) Process(context *pipeline.Context) error

type SaveSnapshotToDBJoint

type SaveSnapshotToDBJoint struct {
	pipeline.Parameters
}

func (SaveSnapshotToDBJoint) Name

func (this SaveSnapshotToDBJoint) Name() string

func (SaveSnapshotToDBJoint) Process

func (this SaveSnapshotToDBJoint) Process(c *pipeline.Context) error

type SaveSnapshotToFileSystemJoint

type SaveSnapshotToFileSystemJoint struct {
	// contains filtered or unexported fields
}

func (SaveSnapshotToFileSystemJoint) Name

func (SaveSnapshotToFileSystemJoint) Process

type TaskDeduplicationJoint

type TaskDeduplicationJoint struct {
}

TaskDeduplicationJoint is used to find whether the task is already in the database

func (TaskDeduplicationJoint) Name

func (joint TaskDeduplicationJoint) Name() string

Name returns task_deduplication

func (TaskDeduplicationJoint) Process

func (joint TaskDeduplicationJoint) Process(c *pipeline.Context) error

Process performs deduplication

type UpdateCheckTimeJoint

type UpdateCheckTimeJoint struct {
	pipeline.Parameters
}

func (UpdateCheckTimeJoint) Name

func (this UpdateCheckTimeJoint) Name() string

func (UpdateCheckTimeJoint) Process

func (this UpdateCheckTimeJoint) Process(c *pipeline.Context) error

type UrlFilterJoint

type UrlFilterJoint struct {
	pipeline.Parameters
}

UrlFilterJoint is used to validate URLs, including host, path, file and file extension

func (UrlFilterJoint) Name

func (joint UrlFilterJoint) Name() string

Name is url_filter

func (UrlFilterJoint) Process

func (joint UrlFilterJoint) Process(context *pipeline.Context) error

Process checks all the URL match rules

type UrlNormalizationJoint

type UrlNormalizationJoint struct {
	pipeline.Parameters
	// contains filtered or unexported fields
}

UrlNormalizationJoint is used to clean up the URL and normalize it

func (UrlNormalizationJoint) Name

func (joint UrlNormalizationJoint) Name() string

Name of this joint is: url_normalization

func (UrlNormalizationJoint) Process

func (joint UrlNormalizationJoint) Process(context *pipeline.Context) error

Process will handle relative URLs and clean up the URL

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL