copple

package module
v0.0.0-...-f2ff751 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 5, 2022 License: LGPL-2.1 Imports: 20 Imported by: 0

Documentation

Index

Constants

View Source
// BlocksWidth is a tuning constant for the extractor.
// NOTE(review): presumably the number of adjacent text blocks considered
// together — confirm against the parser.
const BlocksWidth = 3

// Threshold is a size limit expressed in bytes.
const Threshold = 100

Variables

View Source
// Package-level regular expressions and lookup tables used when scanning raw
// HTML. Most patterns carry the (?ims) flags: case-insensitive matching,
// multi-line anchors, and "." matching newlines.
var (
	// ReIgnoreBlock maps a block name to a pattern matching that kind of
	// non-content markup: doctype, comments, script, noscript, style and link.
	// NOTE(review): presumably these matches are stripped before text
	// extraction — confirm against the parser.
	ReIgnoreBlock = map[string]*regexp.Regexp{
		"doctype":  regexp.MustCompile(`(?ims)<!DOCTYPE.*?>`),
		"comment":  regexp.MustCompile(`(?ims)<!--.*?-->`),
		"script":   regexp.MustCompile(`(?ims)<script.*?>.*?</script>`),
		"noscript": regexp.MustCompile(`(?ims)<noscript.*?>.*?</noscript>`),
		"style":    regexp.MustCompile(`(?ims)<style.*?>.*?</style>`),
		"link":     regexp.MustCompile(`(?ims)<link.*?>`),
	}

	// ReNewLineBlock maps a tag label to a pattern matching the opening form
	// of that tag (div, p, br, hr, li).
	// NOTE(review): presumably each match is turned into a line break —
	// confirm against the parser.
	ReNewLineBlock = map[string]*regexp.Regexp{
		"<div>": regexp.MustCompile(`(?ims)<div.*?>`),
		"<p>":   regexp.MustCompile(`(?ims)<p.*?>`),
		"<br>":  regexp.MustCompile(`(?ims)<br.*?>`),
		"<hr>":  regexp.MustCompile(`(?ims)<hr.*?>`),
		"<li>":  regexp.MustCompile(`(?ims)<li.*?>`),
	}

	// ReMultiNewLine matches a run of one or more newline characters.
	ReMultiNewLine = regexp.MustCompile(`(?m)\n+`)

	// ReSpaces matches a run of one or more whitespace characters.
	ReSpaces = regexp.MustCompile(`(?m)\s+`)

	// ReTag matches any single <...> tag, non-greedily.
	ReTag = regexp.MustCompile(`(?ims)<.*?>`)

	// ReImg matches a single <img ...> tag.
	ReImg = regexp.MustCompile(`(?ims)<img.*?>`)

	// Earlier variants of ReImgSrc, kept for reference:
	//ReImgSrc       = regexp.MustCompile(`(?ims)<img.+?src=\s*?"(.+?)"|'(.+?)'.*?>`)
	//ReImgSrc = regexp.MustCompile(`(?ims).+?src=\s*?"(.+?)"|'(.+?)'`)

	// ReImgSrc extracts the value of a src or data-original attribute.
	// Group 1 holds a double-quoted value, group 2 a single-quoted value.
	// The two quote styles are wrapped in one non-capturing group so that
	// the single-quoted alternative also requires the preceding
	// "src=" / "data-original="; without it, any single-quoted attribute
	// (e.g. alt='...') would match on its own.
	ReImgSrc = regexp.MustCompile(`(?ims)(?:.+?src|data-original)=\s*?(?:"(.+?)"|'(.+?)')`)

	// ReTitle captures the non-empty content of a <title> element in group 1.
	ReTitle = regexp.MustCompile(`(?ims)<title.*?>(.+?)</title>`)

	// ReH captures the content of a heading element (<h1>, <h2>, ...) in group 1.
	ReH = regexp.MustCompile(`(?ims)<h\d+.*?>(.*?)</h\d+>`)

	// ReHead captures the content of the <head> element in group 1.
	ReHead = regexp.MustCompile(`(?ims)<head.*?>(.*?)<\/head>`)

	// MonthStr is a regexp fragment matching an English month name by its
	// three-letter prefix followed by any further letters.
	MonthStr = `(?:(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*)`

	// ReDate matches several date spellings: "month day year",
	// "day month year", "month day", numeric year-first and year-last forms
	// with arbitrary non-digit separators, and the CJK 年/月/日 form.
	ReDate = regexp.MustCompile(`(?is)((?:` + MonthStr + `[\.,\-\s]*\d{1,2}(?:st|nd|rd|th)*[\.,\-\s]*(\d{4}))|` +
		`(?:\d{1,2}(?:st|nd|rd|th)*[\.,\-\s]*` + MonthStr + `[\.,\-\s]*(\d{4}))|` +
		MonthStr + `.\d{1,2}|` +
		`(?:(19|20)\d{2}[^0-9]\d{1,2}[^0-9]\d{1,2})|` +
		`(?:\d{1,2}[^0-9]\d{1,2}[^0-9](19|20)\d{2})|` +
		`(?:(\d{4}年){0,1}\d{1,2}月\d{1,2}日))`)

	// ReTime matches clock times: colon-separated hours and minutes with
	// optional seconds and optional am/pm, or dot/space-separated hours and
	// minutes that must be followed by am/pm.
	ReTime = regexp.MustCompile(`(?is)((?:0?|[12])\d\s*:+\s*[0-5]\d(?:\s*:+\s*[0-5]\d)?(?:\s*[,:.]*\s*(?:am|pm))?|` +
		`(?:0?|[12])\d\s*[.\s]+\s*[0-5]\d(?:\s*[,:.]*\s*(?:am|pm))+)`)

	// ReFavicon captures the href of a <link rel="shortcut icon" ...> tag in
	// group 1. The attribute order and double quotes are matched literally.
	ReFavicon = regexp.MustCompile(`(?ims)<link rel="shortcut icon" href="(.+?)".*?/>`)

	// Earlier variants of ReTitleNoNoisy, kept for reference:
	//ReTitleNoNoisy = regexp.MustCompile(`(?ims)^[^|\-/•—_]+`)
	//ReTitleNoNoisy = regexp.MustCompile(`(?ims).*?——+.*|^[^|\-/•—_]+`)

	// ReTitleNoNoisy matches an optional prefix ending in a doubled dash
	// ("——" or "--"), followed by a run of characters containing none of the
	// separators | - / • — _.
	// NOTE(review): presumably used to cut site-name suffixes off page
	// titles — confirm against the caller.
	ReTitleNoNoisy = regexp.MustCompile(`(?ims)(.*?(——+|--+))?[^|\-/•—_]+`)

	// IgnoreImgs is a set of image sources to skip; both entries are inline
	// base64 GIF data URIs.
	IgnoreImgs = map[string]bool{
		"data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7": true,
		"data:image/gif;base64,R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs=":         true,
	}
)
View Source
// Patterns for picking <meta> tags apart into attribute key/value pairs.
var (
	// ReMeta matches a single <meta ...> tag.
	ReMeta = regexp.MustCompile(`(?ims)<meta.*?>`)

	// ReKV matches one key=value attribute. Group 1 is the key, group 2 a
	// double-quoted value, group 3 a single-quoted value. Both quote styles
	// sit inside one non-capturing group so that the key is captured for
	// single-quoted values too; without the grouping, the single-quoted
	// alternative matched a bare '...' and left group 1 empty.
	ReKV = regexp.MustCompile(`(?ims)([^\s]+?)\s*?=\s*?(?:"(.+?)"|'(.+?)')`)
)

Functions

func FilterControlChar

func FilterControlChar(in string) string

func GBK2UTF8

func GBK2UTF8(gbkStr string) (string, error)

func GetCommentInHtml

func GetCommentInHtml(html string) []string

func GetFileContent

func GetFileContent(filePath string) ([]byte, error)

func InfoFromMeta

func InfoFromMeta(meta []map[string]string) (string, string, []string)

func Obj2Json

func Obj2Json(obj interface{}, indent bool) ([]byte, error)

func ParseTime

func ParseTime(tz string, s string) time.Time

func RawMeta

func RawMeta(raw string) []map[string]string

func SleepRandMS

func SleepRandMS(m int, n int)

func TimeOfDay

func TimeOfDay(day string) (int64, int64)

TimeOfDay expects day in the layout "2006-01-02" (YYYY-MM-DD).

Types

type Doc

// Doc is the record type returned by Parse and ParsePro. Every field is
// serialized to JSON under the key given in its struct tag.
// NOTE(review): the group headings below are inferred from field names only —
// confirm the exact semantics against the parser.
type Doc struct {
	// Page addressing.
	Url          string `json:"url"`
	From         string `json:"from"`
	CanonicalUrl string `json:"canonical_url"`

	// Page content.
	Title string `json:"title"`
	Text  string `json:"text"`
	Html  string `json:"html"`

	// Descriptive attributes.
	Language string   `json:"language"`
	Location string   `json:"location"`
	Favicon  string   `json:"favicon"`
	Images   []string `json:"images"`
	Tags     string   `json:"tags"`
	Author   string   `json:"author"`

	// Publication time, as a string and as a time.Time.
	Published       string    `json:"published"`
	PublishedParsed time.Time `json:"published_parsed"`

	// Debug is left out of the JSON output when empty (omitempty).
	Debug map[string]interface{} `json:"debug,omitempty"`
}

func Parse

func Parse(rawurl, rawHtml string) *Doc

func ParsePro

func ParsePro(rawurl, rawHtml, ip string, debug bool) *Doc

type Meta

// Meta is the record type returned by GetMeta. All four fields are plain
// strings serialized to JSON under the key given in the struct tag.
// NOTE(review): presumably populated from the page's <meta> tags — confirm
// against GetMeta.
type Meta struct {
	// Keywords is emitted under the "keywords" JSON key.
	Keywords string `json:"keywords"`

	// Tags is emitted under the "tags" JSON key.
	Tags string `json:"tags"`

	// Description is emitted under the "description" JSON key.
	Description string `json:"description"`

	// Author is emitted under the "author" JSON key.
	Author string `json:"author"`
}

func GetMeta

func GetMeta(meta []map[string]string) *Meta

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL