Documentation ¶
Index ¶
- Constants
- Variables
- func FilterControlChar(in string) string
- func GBK2UTF8(gbkStr string) (string, error)
- func GetCommentInHtml(html string) []string
- func GetFileContent(filePath string) ([]byte, error)
- func InfoFromMeta(meta []map[string]string) (string, string, []string)
- func Obj2Json(obj interface{}, indent bool) ([]byte, error)
- func ParseTime(tz string, s string) time.Time
- func RawMeta(raw string) []map[string]string
- func SleepRandMS(m int, n int)
- func TimeOfDay(day string) (int64, int64)
- type Doc
- type Meta
Constants ¶
View Source
// Numeric tuning constants exported by the package. The code that consumes
// them is not part of this listing.
const (
	// BlocksWidth is fixed at 3.
	// NOTE(review): presumably a window size counted in text blocks — confirm
	// against the code that uses it.
	BlocksWidth = 3
	Threshold   = 100 // in bytes
)
Variables ¶
View Source
// Precompiled regular expressions and lookup tables for picking HTML apart.
// Every pattern is compiled once, at package initialization, via
// regexp.MustCompile.
var (
	// ReIgnoreBlock maps a label to a pattern matching one kind of HTML
	// construct: the doctype declaration, comments, script / noscript / style
	// elements including their bodies, and link tags.
	// NOTE(review): presumably these spans are removed before text extraction —
	// confirm against the caller.
	ReIgnoreBlock = map[string]*regexp.Regexp{
		"doctype":  regexp.MustCompile(`(?ims)<!DOCTYPE.*?>`),
		"comment":  regexp.MustCompile(`(?ims)<!--.*?-->`),
		"script":   regexp.MustCompile(`(?ims)<script.*?>.*?</script>`),
		"noscript": regexp.MustCompile(`(?ims)<noscript.*?>.*?</noscript>`),
		"style":    regexp.MustCompile(`(?ims)<style.*?>.*?</style>`),
		"link":     regexp.MustCompile(`(?ims)<link.*?>`),
	}

	// ReNewLineBlock maps a tag literal to a pattern matching that tag's
	// opening form. The patterns are prefix matches, so for example the "<p>"
	// entry also matches tags such as <pre ...>.
	ReNewLineBlock = map[string]*regexp.Regexp{
		"<div>": regexp.MustCompile(`(?ims)<div.*?>`),
		"<p>":   regexp.MustCompile(`(?ims)<p.*?>`),
		"<br>":  regexp.MustCompile(`(?ims)<br.*?>`),
		"<hr>":  regexp.MustCompile(`(?ims)<hr.*?>`),
		"<li>":  regexp.MustCompile(`(?ims)<li.*?>`),
	}

	// ReMultiNewLine matches a run of one or more newline characters.
	ReMultiNewLine = regexp.MustCompile(`(?m)\n+`)

	// ReSpaces matches a run of one or more whitespace characters.
	ReSpaces = regexp.MustCompile(`(?m)\s+`)

	// ReTag matches any <...> span, using the shortest possible match.
	ReTag = regexp.MustCompile(`(?ims)<.*?>`)

	// ReImg matches an <img ...> tag.
	ReImg = regexp.MustCompile(`(?ims)<img.*?>`)

	//ReImgSrc = regexp.MustCompile(`(?ims)<img.+?src=\s*?"(.+?)"|'(.+?)'.*?>`)
	//ReImgSrc = regexp.MustCompile(`(?ims).+?src=\s*?"(.+?)"|'(.+?)'`)

	// ReImgSrc captures the value of a src (or data-original) attribute:
	// group 1 holds a double-quoted value, group 2 a single-quoted one.
	// The two quote styles are grouped explicitly. Without the group, the
	// single-quote branch is a top-level alternative and matches any
	// single-quoted string in the input (for example alt='x'), not only an
	// attribute value that follows src= or data-original=.
	ReImgSrc = regexp.MustCompile(`(?ims)(?:.+?src|data-original)=\s*?(?:"(.+?)"|'(.+?)')`)

	// ReTitle captures the text between <title ...> and </title> in group 1.
	ReTitle = regexp.MustCompile(`(?ims)<title.*?>(.+?)</title>`)

	// ReH captures the content of a heading element (<h1>, <h2>, ...) in
	// group 1. The opening and closing digits are not required to agree.
	ReH = regexp.MustCompile(`(?ims)<h\d+.*?>(.*?)</h\d+>`)

	// ReHead captures the content of the <head> element in group 1.
	ReHead = regexp.MustCompile(`(?ims)<head.*?>(.*?)<\/head>`)

	// MonthStr is a pattern fragment matching an English month name or any
	// word starting with its three-letter abbreviation. It is written in lower
	// case and carries no flags of its own; ReDate embeds it under (?i).
	MonthStr = `(?:(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*)`

	// ReDate matches a date written in one of these shapes, in order:
	//   - month name, day (optional ordinal suffix), four-digit year
	//   - day (optional ordinal suffix), month name, four-digit year
	//   - month name, any single character, day
	//   - 19xx/20xx year, month, day, separated by single non-digit characters
	//   - day/month (or month/day), 19xx/20xx year, separated the same way
	//   - Chinese form with an optional "年" year part followed by "月" and "日"
	ReDate = regexp.MustCompile(`(?is)((?:` + MonthStr + `[\.,\-\s]*\d{1,2}(?:st|nd|rd|th)*[\.,\-\s]*(\d{4}))|` +
		`(?:\d{1,2}(?:st|nd|rd|th)*[\.,\-\s]*` + MonthStr + `[\.,\-\s]*(\d{4}))|` +
		MonthStr + `.\d{1,2}|` +
		`(?:(19|20)\d{2}[^0-9]\d{1,2}[^0-9]\d{1,2})|` +
		`(?:\d{1,2}[^0-9]\d{1,2}[^0-9](19|20)\d{2})|` +
		`(?:(\d{4}年){0,1}\d{1,2}月\d{1,2}日))`)

	// ReTime matches a time of day: either hh:mm with optional :ss and an
	// optional am/pm marker, or hh and mm separated by dots/whitespace, in
	// which case an am/pm marker is required.
	ReTime = regexp.MustCompile(`(?is)((?:0?|[12])\d\s*:+\s*[0-5]\d(?:\s*:+\s*[0-5]\d)?(?:\s*[,:.]*\s*(?:am|pm))?|` +
		`(?:0?|[12])\d\s*[.\s]+\s*[0-5]\d(?:\s*[,:.]*\s*(?:am|pm))+)`)

	// ReFavicon captures the href of a self-closing
	// <link rel="shortcut icon" href="..." ... /> tag in group 1. The attribute
	// order and the double quotes must appear exactly as written in the pattern.
	ReFavicon = regexp.MustCompile(`(?ims)<link rel="shortcut icon" href="(.+?)".*?/>`)

	//ReTitleNoNoisy = regexp.MustCompile(`(?ims)^[^|\-/•—_]+`)
	//ReTitleNoNoisy = regexp.MustCompile(`(?ims).*?——+.*|^[^|\-/•—_]+`)

	// ReTitleNoNoisy matches an optional prefix ending in a run of two or more
	// "—" or "-" characters, followed by a run of characters that contains none
	// of | - / • — _.
	// NOTE(review): presumably used to cut site-name suffixes off page titles —
	// confirm against the caller.
	ReTitleNoNoisy = regexp.MustCompile(`(?ims)(.*?(——+|--+))?[^|\-/•—_]+`)

	// IgnoreImgs is a set of image source strings.
	// The listing this was taken from showed the key "" twice; duplicate
	// constant keys in a map literal do not compile, so a single entry is kept.
	// NOTE(review): the real keys were probably lost when the listing was
	// rendered — restore them from the upstream source if available.
	IgnoreImgs = map[string]bool{
		"": true,
	}
)
View Source
// Patterns for taking <meta> tags apart.
var (
	// ReMeta matches a single <meta ...> tag.
	ReMeta = regexp.MustCompile(`(?ims)<meta.*?>`)

	// ReKV matches one key=value attribute. Group 1 is the key, group 2 the
	// value when it is double-quoted, group 3 the value when it is
	// single-quoted.
	// The two quote styles are grouped explicitly. Without the group, the
	// single-quote branch is a top-level alternative: any bare '...' string
	// matches with an empty key, and key='value' never captures its key.
	ReKV = regexp.MustCompile(`(?ims)([^\s]+?)\s*?=\s*?(?:"(.+?)"|'(.+?)')`)
)
Functions ¶
func FilterControlChar ¶
func GetCommentInHtml ¶
func GetFileContent ¶
func SleepRandMS ¶
Types ¶
type Doc ¶
// Doc is a flat record describing one document. Every field is serialized to
// JSON under the snake_case key given in its struct tag.
// NOTE(review): the code that fills these fields is not part of this listing;
// field meanings beyond their names and types should be confirmed there.
type Doc struct {
	Url          string `json:"url"`
	From         string `json:"from"`
	CanonicalUrl string `json:"canonical_url"`
	Title        string `json:"title"`
	Text         string `json:"text"`
	Html         string `json:"html"`
	Language     string `json:"language"`
	Location     string `json:"location"`
	Favicon      string `json:"favicon"`
	// Images is the only list-valued field; it holds strings.
	Images []string `json:"images"`
	// Tags is a single string, not a list.
	Tags      string `json:"tags"`
	Author    string `json:"author"`
	Published string `json:"published"`
	// PublishedParsed is a time.Time value.
	// NOTE(review): presumably the parsed form of Published — confirm.
	PublishedParsed time.Time `json:"published_parsed"`
	// Debug carries arbitrary key/value data and is left out of the JSON
	// output when the map is empty (omitempty).
	Debug map[string]interface{} `json:"debug,omitempty"`
}
Click to show internal directories.
Click to hide internal directories.