pipe

package
v0.8.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 24, 2017 License: Apache-2.0 Imports: 30 Imported by: 0

Documentation

Index

Constants

View Source
const (
	CONTEXT_CRAWLER_DOMAIN   ParaKey = "CRAWLER_DOMAIN"
	CONTEXT_CRAWLER_TASK     ParaKey = "CRAWLER_TASK"
	CONTEXT_CRAWLER_SNAPSHOT ParaKey = "CRAWLER_SNAPSHOT"

	CONTEXT_PAGE_LINKS ParaKey = "PAGE_LINKS"
)
View Source
const Cookie ParaKey = "cookie"
View Source
const Empty JointKey = "empty"
View Source
const Fetch JointKey = "fetch"
View Source
const Hash JointKey = "hash"
View Source
const HtmlToText JointKey = "html2text"
View Source
const IgnoreTimeout JointKey = "ignore_timeout"
View Source
const InitTask JointKey = "init_task"
View Source
const LoadMetadata JointKey = "load_metadata"
View Source
const ParsePage JointKey = "parse"
View Source
const Proxy ParaKey = "proxy"
View Source
const Publish JointKey = "index"
View Source
const SaveSnapshotToDB JointKey = "save_snapshot_db"
View Source
const SaveSnapshotToFileSystem JointKey = "save_snapshot_fs"
View Source
const SaveTask JointKey = "save_task"
View Source
const TaskID ParaKey = "TASK_ID"
View Source
const UrlCheckFilter JointKey = "url_check_filter"
View Source
const UrlExtFilter JointKey = "url_ext_filter"
View Source
const UrlNormalization JointKey = "url_normalization"

Variables

This section is empty.

Functions

func NewTaskJoint

func NewTaskJoint(task *model.Task) Joint

Types

type EmptyJoint

type EmptyJoint struct {
}

func (EmptyJoint) Name

func (this EmptyJoint) Name() string

func (EmptyJoint) Process

func (this EmptyJoint) Process(s *Context) error

type FetchJoint

type FetchJoint struct {
	Parameters
	// contains filtered or unexported fields
}

func (FetchJoint) Name

func (this FetchJoint) Name() string

func (FetchJoint) Process

func (this FetchJoint) Process(context *Context) error

type HashJoint

type HashJoint struct {
	DictRoot string
	Simhash  bool
}

func (HashJoint) Name

func (this HashJoint) Name() string

func (HashJoint) Process

func (this HashJoint) Process(context *Context) error

type HtmlToTextJoint

type HtmlToTextJoint struct {
	MergeWhitespace bool //merge whitespace and \n
}

func (HtmlToTextJoint) Name

func (this HtmlToTextJoint) Name() string

func (HtmlToTextJoint) Process

func (this HtmlToTextJoint) Process(context *Context) error

type IgnoreTimeoutJoint

type IgnoreTimeoutJoint struct {
	IgnoreTimeoutAfterCount int64
}

func (IgnoreTimeoutJoint) Name

func (this IgnoreTimeoutJoint) Name() string

func (IgnoreTimeoutJoint) Process

func (this IgnoreTimeoutJoint) Process(context *Context) error

type IndexJoint

type IndexJoint struct {
}

func (IndexJoint) Name

func (this IndexJoint) Name() string

func (IndexJoint) Process

func (this IndexJoint) Process(c *Context) error

type InitTaskJoint

type InitTaskJoint struct {
	Parameters
	Task *model.Task
}

func (InitTaskJoint) Name

func (this InitTaskJoint) Name() string

func (InitTaskJoint) Process

func (this InitTaskJoint) Process(context *Context) error

type LoadMetadataJoint

type LoadMetadataJoint struct {
}

LoadMetadataJoint loads metadata from the database.

func (LoadMetadataJoint) Name

func (this LoadMetadataJoint) Name() string

func (LoadMetadataJoint) Process

func (this LoadMetadataJoint) Process(context *Context) error

type ParsePageJoint

type ParsePageJoint struct {
	DispatchLinks    bool
	MaxDepth         int         //max depth of page to follow
	MaxBreadth       int         //max breadth of the domain to follow
	MaxPageOfBreadth map[int]int //max page to fetch in each level's breadth, eg: 1:100;2:50;3:5;4:1

}

func (ParsePageJoint) Name

func (this ParsePageJoint) Name() string

func (ParsePageJoint) Process

func (this ParsePageJoint) Process(context *Context) error

type SaveSnapshotToDBJoint

type SaveSnapshotToDBJoint struct {
	CompressBody bool
	Bucket       string
}

func (SaveSnapshotToDBJoint) Name

func (this SaveSnapshotToDBJoint) Name() string

func (SaveSnapshotToDBJoint) Process

func (this SaveSnapshotToDBJoint) Process(c *Context) error

type SaveSnapshotToFileSystemJoint

type SaveSnapshotToFileSystemJoint struct {
	// contains filtered or unexported fields
}

func (SaveSnapshotToFileSystemJoint) Name

func (SaveSnapshotToFileSystemJoint) Process

func (this SaveSnapshotToFileSystemJoint) Process(c *Context) error

type SaveTaskJoint

type SaveTaskJoint struct {
	IsCreate bool
}

func (SaveTaskJoint) Name

func (this SaveTaskJoint) Name() string

func (SaveTaskJoint) Process

func (this SaveTaskJoint) Process(context *Context) error

type UrlCheckFilterJoint

type UrlCheckFilterJoint struct {
	Parameters
	//ignore files ending with js,css,apk,zip
	SkipPageParsePattern *regexp.Regexp
}

func (UrlCheckFilterJoint) Name

func (this UrlCheckFilterJoint) Name() string

func (UrlCheckFilterJoint) Process

func (this UrlCheckFilterJoint) Process(context *Context) error

type UrlExtFilterJoint

type UrlExtFilterJoint struct {
	//ignore files ending with js,css,apk,zip
	SkipPageParsePattern *regexp.Regexp
}

func (UrlExtFilterJoint) Name

func (this UrlExtFilterJoint) Name() string

func (UrlExtFilterJoint) Process

func (this UrlExtFilterJoint) Process(context *Context) error

type UrlNormalizationJoint

type UrlNormalizationJoint struct {
	FollowAllDomain      bool
	FollowDomainSettings bool
	FollowSubDomain      bool
	// contains filtered or unexported fields
}

func (UrlNormalizationJoint) Name

func (this UrlNormalizationJoint) Name() string

func (UrlNormalizationJoint) Process

func (this UrlNormalizationJoint) Process(context *Context) error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL