Documentation ¶
Index ¶
- type ContentDeduplicationJoint
- type EmptyJoint
- type ExtractJoint
- type FetchJoint
- type FilterCheckJoint
- type HashJoint
- type HtmlToTextJoint
- type IgnoreTimeoutJoint
- type IndexJoint
- type LanguageDetectJoint
- type ParsePageJoint
- type SaveSnapshotToDBJoint
- type SaveSnapshotToFileSystemJoint
- type TaskDeduplicationJoint
- type UpdateCheckTimeJoint
- type UrlFilterJoint
- type UrlNormalizationJoint
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ContentDeduplicationJoint ¶
type ContentDeduplicationJoint struct {
pipeline.Parameters
}
ContentDeduplicationJoint is used to check the hash of the page body; if a duplicate hash already exists, it breaks the pipeline.
func (ContentDeduplicationJoint) Name ¶
func (joint ContentDeduplicationJoint) Name() string
Name returns: content_deduplication
type ExtractJoint ¶
type ExtractJoint struct {
pipeline.Parameters
}
func (ExtractJoint) Name ¶
func (joint ExtractJoint) Name() string
type FetchJoint ¶
type FetchJoint struct {
pipeline.Parameters
// contains filtered or unexported fields
}
func (FetchJoint) Name ¶
func (joint FetchJoint) Name() string
type FilterCheckJoint ¶
type FilterCheckJoint struct {
pipeline.Parameters
// ignore files ending with js, css, apk, zip
SkipPageParsePattern *regexp.Regexp
}
FilterCheckJoint is used to check whether the task URL is already in the filter; if it is not, the URL is added to the task filter so that it will not be added again next time.
func (FilterCheckJoint) Name ¶
func (joint FilterCheckJoint) Name() string
Name returns: filter_check
type HashJoint ¶
type HashJoint struct {
pipeline.Parameters
}
type HtmlToTextJoint ¶
type HtmlToTextJoint struct {
pipeline.Parameters
}
func (HtmlToTextJoint) Name ¶
func (joint HtmlToTextJoint) Name() string
type IgnoreTimeoutJoint ¶
type IgnoreTimeoutJoint struct {
pipeline.Parameters
}
func (IgnoreTimeoutJoint) Name ¶
func (joint IgnoreTimeoutJoint) Name() string
type IndexJoint ¶
type IndexJoint struct { }
IndexJoint is used to send snapshot and task info to the index.
type LanguageDetectJoint ¶
type LanguageDetectJoint struct { }
LanguageDetectJoint is used to detect the language of the webpage.
func (LanguageDetectJoint) Name ¶
func (joint LanguageDetectJoint) Name() string
Name returns: lang_detect
type ParsePageJoint ¶
type ParsePageJoint struct {
pipeline.Parameters
}
func (ParsePageJoint) Name ¶
func (joint ParsePageJoint) Name() string
type SaveSnapshotToDBJoint ¶
type SaveSnapshotToDBJoint struct {
pipeline.Parameters
}
func (SaveSnapshotToDBJoint) Name ¶
func (this SaveSnapshotToDBJoint) Name() string
type SaveSnapshotToFileSystemJoint ¶
type SaveSnapshotToFileSystemJoint struct {
// contains filtered or unexported fields
}
func (SaveSnapshotToFileSystemJoint) Name ¶
func (joint SaveSnapshotToFileSystemJoint) Name() string
type TaskDeduplicationJoint ¶
type TaskDeduplicationJoint struct { }
TaskDeduplicationJoint is used to find out whether the task is already in the database.
func (TaskDeduplicationJoint) Name ¶
func (joint TaskDeduplicationJoint) Name() string
Name returns: task_deduplication
type UpdateCheckTimeJoint ¶
type UpdateCheckTimeJoint struct {
pipeline.Parameters
}
func (UpdateCheckTimeJoint) Name ¶
func (this UpdateCheckTimeJoint) Name() string
type UrlFilterJoint ¶
type UrlFilterJoint struct {
pipeline.Parameters
}
UrlFilterJoint is used to validate URLs, including the host, path, file and file extension.
type UrlNormalizationJoint ¶
type UrlNormalizationJoint struct {
pipeline.Parameters
// contains filtered or unexported fields
}
UrlNormalizationJoint is used to clean up and normalize URLs.
func (UrlNormalizationJoint) Name ¶
func (joint UrlNormalizationJoint) Name() string
Name of this joint is: url_normalization