Documentation
¶
Index ¶
Constants ¶
View Source
const ( ZhColon = ':' EnColon = ':' //结束提取字符 ENSP = '\u2002' EMSP = '\u2003' NBSP = '\xa0' Space = '\x20' Enter = '\n' Table = '\t' Sep1 = ',' Sep2 = '。' Sep3 = ';' Sep4 = '】' )
View Source
// KeyP is a regular-expression fragment that matches any run (possibly
// empty) of \s whitespace or \xa0 characters. The package note describes it
// as the piece that handles spaces between keywords (code points 32 or 160).
const KeyP = `(\s|\xa0)*`
关键字之间处理空格的正则段 32 or 160
Variables ¶
View Source
// ArrBlock is a list of HTML tag names: the headings h1-h5, p, br, b, div,
// and the list-related tags ul, li, ol, dl, dt and dd.
// NOTE(review): presumably these are the tags treated as block boundaries
// during extraction — confirm against the code that reads this slice.
var ArrBlock = []string{
	// Headings.
	"h1",
	"h2",
	"h3",
	"h4",
	"h5",
	// Paragraphs, breaks and generic containers.
	"p",
	"br",
	"b",
	"div",
	// Lists and definition lists.
	"ul",
	"li",
	"ol",
	"dl",
	"dt",
	"dd",
}
View Source
var DefaultFilter = func(item string) string { if utf8.RuneCountInString(item) > 60 { return "" } if strings.ContainsRune(item, ZhColon) { return "" } idx := strings.IndexFunc(item, func(r rune) bool { if r == Sep1 || r == Sep2 || r == Sep3 || r == Sep4 { return true } return false }) if idx > 0 { item = item[:idx] } return item }
Functions ¶
Types ¶
type Extractor ¶
// Extractor extracts keyword-associated values from HTML and exposes the
// results. According to the package note, a single instance is not safe for
// concurrent use.
type Extractor interface {
	// Computation methods.
	ExtractKeywordsFromHtml(html string) error
	Clear()
	// Methods without side effects.
	GetResult(filter bool) []ResultRow
	GetContent() string
	GetSubject() string
	Filter(item string) string // returns the value after filtering
	GetItemsByWeight(filter bool) *ResultRow // returns the first non-empty result set, following the order of the keyword list
}
单个实例 线程不安全
func NewExtractor ¶
// NewExtractor returns an Extractor for the keyword list keys; optional
// ExtractorOptionFunc values may be supplied in ops.
// NOTE(review): the body is not visible here; presumably each option in ops
// is applied to the new instance — confirm against the implementation.
func NewExtractor(keys []string, ops ...ExtractorOptionFunc) Extractor
arrKeys init
type ExtractorOptionFunc ¶ added in v1.0.3
// ExtractorOptionFunc is the option type accepted by NewExtractor. It is a
// function that receives a pointer to the unexported extract value.
// NOTE(review): extract is presumably the concrete Extractor implementation
// being configured — confirm.
type ExtractorOptionFunc func(o *extract)
func WithFilter ¶ added in v1.0.3
// WithFilter returns an ExtractorOptionFunc built from the filter function fn.
// NOTE(review): the body is not visible here; presumably fn replaces
// DefaultFilter as the function behind Extractor.Filter — confirm.
func WithFilter(fn func(string) string) ExtractorOptionFunc
func WithSubject ¶ added in v1.0.3
// WithSubject returns an ExtractorOptionFunc built from the subject string s.
// NOTE(review): the body is not visible here; presumably s is the value later
// reported by Extractor.GetSubject — confirm.
func WithSubject(s string) ExtractorOptionFunc
Click to show internal directories.
Click to hide internal directories.