Documentation ¶
Index ¶
- Constants
- Variables
- func AndSimHash(a, b string) float32
- func DecodeHTMLBody(body io.Reader, charset string) (io.Reader, error)
- func DefaultProxyMode(mode int)
- func Failed(args ...interface{})
- func FindMostMayDate(title, raw string) (t time.Time)
- func GetMD5(c []byte) string
- func Info(args ...interface{})
- func L(args ...interface{})
- func ProgressLog(now int, all int, msg string)
- func SetProxyGenerater(proxyCreator NewDialler)
- func ShowDemo()
- func Skip(u string, us ...string) bool
- func Socks5Dialer(addr string) proxy.Dialer
- func Success(args ...interface{})
- func UrlJoin(f ...string) string
- type ArrayFilter
- func (array ArrayFilter) Add(o string) ArrayFilter
- func (array ArrayFilter) Every(handler func(no int, every string) string) ArrayFilter
- func (array ArrayFilter) Filter(reStrOrFunc_str_bool interface{}) (newArray ArrayFilter)
- func (array ArrayFilter) FilterFunc(reStrOrFunc_str_bool func(no int, every string) bool) (newArray ArrayFilter)
- func (array ArrayFilter) In(o string) int
- func (array ArrayFilter) Sort() ArrayFilter
- type Article
- type Async
- type AsyncOut
- type ConsoleBar
- func (pro *ConsoleBar) Add(i int) int
- func (pro *ConsoleBar) Error(err error)
- func (pro *ConsoleBar) Finished()
- func (pro *ConsoleBar) GetPercent() float32
- func (pro *ConsoleBar) Increment() int
- func (pro *ConsoleBar) Println(args ...interface{})
- func (pro *ConsoleBar) Reset()
- func (pro *ConsoleBar) SetAll(all int)
- func (pro *ConsoleBar) SetMsg(msg string)
- func (pro *ConsoleBar) Update()
- func (pro *ConsoleBar) Write(args ...interface{})
- type Dict
- type DictBool
- type EnumeConfig
- type FilterOption
- type G
- type Gfunc
- type Links
- type Loger
- type NewDialler
- type NextValue
- type Payloader
- func (pay Payloader) AsFile(howHandle func(f *os.File, err error))
- func (pay Payloader) Format(args ...interface{}) Payloader
- func (pay Payloader) FormatMap(args map[string]interface{}) Payloader
- func (pay Payloader) Lines() (a ArrayFilter)
- func (pay Payloader) Render(name string, v Value) string
- func (pay Payloader) String() string
- type ProxyDiallerPool
- type Result
- type Resulter
- type RunnerPool
- type Selection
- type Session
- func (session *Session) Asyncs(work int, loadCache bool, showState bool, do func(each *AsyncOut), ...) *Session
- func (sess *Session) CheckAlive(urls []string, showBar bool, after func(res *SmartResponse) bool, ...) (alived []string)
- func (session *Session) Copy() *Session
- func (session *Session) Get(url string, proxy ...interface{}) (resp *SmartResponse, err error)
- func (sess *Session) GetsWith(urltemp string, mapFuncs Gfunc, ...)
- func (session *Session) Json(url string, data map[string]interface{}, proxy ...interface{}) (resp *SmartResponse, err error)
- func (sess *Session) MultiGet(urls []string, handleRes func(loger Loger, res *SmartResponse, err error), ...)
- func (session *Session) Post(httpurl string, data map[string]string, proxy ...interface{}) (resp *SmartResponse, err error)
- func (session *Session) Send(raw string, proxy ...interface{}) (resp *SmartResponse, err error)
- func (session *Session) SetHeader(key string, value string)
- func (session *Session) SetProxy(proxy interface{})
- func (session *Session) SetProxyDialer(dialer proxy.Dialer)
- func (session *Session) SetSocks5Proxy(proxyAddr string) (err error)
- func (session *Session) SetTimeout(t int)
- func (session *Session) StartAsync(i int) *Async
- func (session *Session) TestErrorPage(url string, proxy ...interface{}) (string, string, string)
- func (session *Session) Upload(url string, filePath string, fileKey string, data map[string]string, ...) (resp *SmartResponse, err error)
- func (session *Session) UrlJoin(f ...string) string
- func (session *Session) With(urlstr string, proxy ...interface{}) (with *WithOper)
- type SmartResponse
- func (res *SmartResponse) Base64() string
- func (res *SmartResponse) Base64Mime() []byte
- func (res *SmartResponse) CssExtract(cssSelctors Dict) (out G)
- func (res *SmartResponse) CssSelect(css string, each func(i int, s *Selection))
- func (res *SmartResponse) FastCheckLineByLine(found func(line string) bool) (string, bool)
- func (res *SmartResponse) FileLinks(includeouter ...bool) (s []string)
- func (res *SmartResponse) HashMMH3() int32
- func (res *SmartResponse) HashMMH3Base64() int32
- func (smartres *SmartResponse) HeaderJson() string
- func (smartres *SmartResponse) HeaderString() (d string)
- func (smartres *SmartResponse) Html() []byte
- func (smartres *SmartResponse) Json(obj ...interface{}) (jdata map[string]interface{})
- func (res *SmartResponse) Links(includeouter ...bool) (s []string)
- func (smartres *SmartResponse) Md5() string
- func (res *SmartResponse) PageTextHash() string
- func (smartres *SmartResponse) ReExtractString(re string) []string
- func (smartres *SmartResponse) RequestURL() *url.URL
- func (smartres *SmartResponse) Search(key string, toLower bool) bool
- func (smartres *SmartResponse) Soup() (m *goquery.Document)
- func (smartres *SmartResponse) String() string
- func (resp *SmartResponse) Text() string
- func (smartres *SmartResponse) Title() string
- type UrlSim
- type Value
- type WithOper
- func (with *WithOper) AsArticle() *WithOper
- func (with *WithOper) AsSiteMap(do func(out *AsyncOut), breakpointContinue bool, showState bool, ...) *WithOper
- func (with *WithOper) Each(css string, do ...func(i int, s *Selection)) *WithOper
- func (with *WithOper) EndCache() *WithOper
- func (with *WithOper) Entry(url string) *WithOper
- func (with *WithOper) For(do func(i int, s *Selection)) *WithOper
- func (with *WithOper) News(filters ...FilterOption) *WithOper
- func (with *WithOper) PreTestSkip(name string, urls ...string) (o []string)
- func (with *WithOper) SimpleNews() *WithOper
- func (with *WithOper) StartCache(name string) *WithOper
Constants ¶
View Source
const (
LANGEMPTY = "" /* 391-byte string literal not displayed */
)
Variables ¶
View Source
var ( B2S = regexp.MustCompile("\\<[\\S\\s]+?\\>") DelStyle = regexp.MustCompile("\\<style[\\S\\s]+?\\</style\\>") DelScript = regexp.MustCompile("\\<script[\\S\\s]+?\\</script\\>") DelHtmlTag = regexp.MustCompile("\\<[\\S\\s]+?\\>") DelSpaceContinuesly = regexp.MustCompile("\\s{2,}") FileTp = regexp.MustCompile(`\.[a-zA-Z0-9]+`) )
View Source
var ( UA = []string{ "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0", "Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0", "Mozilla/5.0 (Windows; U; Windows NT 5.1 ; x64; en-US; rv:1.9.1b2pre) Gecko/20081026 Firefox/3.1b2pre", "Opera/10.60 (Windows NT 5.1; U; zh-cn) Presto/2.6.30 Version/10.60", "Opera/8.01 (J2ME/MIDP; Opera Mini/2.0.4062; en; U; ssr)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; ; rv:1.9.0.14) Gecko/2009082707 Firefox/3.0.14", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36", "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr; rv:1.9.2.4) Gecko/20100523 Firefox/3.6.4 ( .NET CLR 3.5.30729)", "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/528.16 (KHTML, like Gecko) Version/4.0 Safari/528.16", "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", } // UA = random.choice(user_agent) DeafultHeaders = map[string]string{ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "User-Agent": UA[0], "Upgrade-Insecure-Requests": "1", "Connection": "keep-alive", "Cache-Control": "max-age=0", "Accept-Encoding": "gzip, deflate, sdch", "Accept-Language": "zh-CN,zh;q=0.8", "Referer": "http://www.baidu.com/link?url=www.so.com&url=www.soso.com&&url=www.sogou.com", "Cookie": "PHPSESSID=gljsd5c3ei5n813roo4878q203", } )
View Source
var ( MAX_POOL = 100 RandomMode = 1 FlowMode = 0 HostParse = regexp.MustCompile(`Host: .+`) LogLevl = 0 )
View Source
var ( Red = color.New(color.FgRed).SprintFunc() Green = color.New(color.FgGreen).SprintFunc() GreenBack = color.New(color.BgGreen, color.Bold).SprintFunc() BlueBack = color.New(color.BgBlue, color.FgHiWhite, color.Bold).SprintFunc() Yello = color.New(color.FgYellow).SprintFunc() Blue = color.New(color.FgBlue).SprintFunc() Magenta = color.New(color.FgMagenta).SprintFunc() Bold = color.New(color.Bold).SprintFunc() Underline = color.New(color.Underline).SprintFunc() Hblue = color.New(color.FgHiBlue).SprintFunc() Hgreen = color.New(color.FgHiGreen).SprintFunc() Hyello = color.New(color.FgHiYellow).SprintFunc() )
View Source
var ( NW = regexp.MustCompile(`\W`) DateMatcher = map[*regexp.Regexp]string{ regexp.MustCompile(`[1-2]\d{3}年\d{1,2}月\d{1,2}日 \d{1,2}\:\d{1,2}\:\d{1,2}`): "2006年1月2日 15:04:05", regexp.MustCompile(`[1-2]\d{3}年[0-1]\d月[0-3]\d日 [0-2]\d\:[0-5]\d\:\d{2}`): "2006年1月2日 15:04:05", regexp.MustCompile(`[1-2]\d{3}-\d{1,2}-\d{1,2} \d{1,2}\:\d{1,2}\:\d{1,2}`): "2006-1-2 15:04:05", regexp.MustCompile(`[1-2]\d{3}/\d{1,2}/\d{1,2} \d{1,2}\:\d{1,2}\:\d{1,2}`): "2006/1/2 15:04:05", regexp.MustCompile(`[1-2]\d{3}/\d{1,2}/\d{1,2} \d{1,2}\:\d{1,2}`): "2006/1/2 15:04", regexp.MustCompile(`\d{2}\/\d{2}\/[1-2]\d{3} \- \d{2}\:\d{2}`): "02/01/2006 - 15:04", regexp.MustCompile(`[1-2]\d{3}年\d{1,2}月\d{1,2}日 \d{1,2}\:\d{1,2}`): "2006年1月2日 15:04", regexp.MustCompile(`[1-2]\d{3}-\d{1,2}-\d{1,2} \d{1,2}\:\d{1,2}`): "2006-1-2 15:04", regexp.MustCompile(`[1-2]\d{3}年\d{1,2}月\d{1,2}日`): "2006年1月2日", regexp.MustCompile(`[1-2]\d{3}-\d{1,2}-\d{1,2}`): "2006-1-2", regexp.MustCompile(`[1-2]\d{3}/\d{1,2}/\d{1,2}`): "2006/1/2", regexp.MustCompile(`\w{1,15}, \d{1,2} \w{1,15} [1-2]\d{3}`): "Mon, 02 Jan 2006", regexp.MustCompile(`\d{1,2} \w{1,15} [1-2]\d{3}`): "02 Jan 2006", regexp.MustCompile(`[1-2]\d{3}-[0-1]\d-\d{2}T\d{2}\:\d{2}\:\d{2}Z`): "2006-01-02T10:27:21Z", regexp.MustCompile(`\d{2}\.[0-1]\d\.[1-2]\d{3}`): "02.01.2006", } )
View Source
var ( MsgCacheChan = make(chan string) CacheStatus = false )
View Source
var ( DefaultProxyDialer NewDialler DefaultProxyPool ProxyDiallerPool )
View Source
var ( STOP = "[STOP]" DefaultLoger = &defaultloger{} )
View Source
var (
ConfigDocument = `` /* 798-byte string literal not displayed */
)
View Source
var (
DEFAULT_BREAKPOINT_FILE = "default-skip.txt"
)
View Source
var (
KeyEx = regexp.MustCompile(`\{\w+\}`)
)
Functions ¶
func AndSimHash ¶
func DecodeHTMLBody ¶
DecodeHTMLBody returns a decoding reader of the HTML body for the specified `charset`. If `charset` is empty, DecodeHTMLBody tries to guess the encoding from the content.
func FindMostMayDate ¶
func ProgressLog ¶
func SetProxyGenerater ¶
func SetProxyGenerater(proxyCreator NewDialler)
func Socks5Dialer ¶
Types ¶
type ArrayFilter ¶
type ArrayFilter []string
func (ArrayFilter) Add ¶
func (array ArrayFilter) Add(o string) ArrayFilter
func (ArrayFilter) Every ¶
func (array ArrayFilter) Every(handler func(no int, every string) string) ArrayFilter
func (ArrayFilter) Filter ¶
func (array ArrayFilter) Filter(reStrOrFunc_str_bool interface{}) (newArray ArrayFilter)
func (ArrayFilter) FilterFunc ¶
func (array ArrayFilter) FilterFunc(reStrOrFunc_str_bool func(no int, every string) bool) (newArray ArrayFilter)
func (ArrayFilter) In ¶
func (array ArrayFilter) In(o string) int
func (ArrayFilter) Sort ¶
func (array ArrayFilter) Sort() ArrayFilter
type Article ¶
type Article struct { Text string `json:"text"` Title string `json:"title"` Date time.Time `json:"date"` Author string `json:"author"` Link string `json:"link"` }
func NewArticle ¶
func (*Article) WaitToFile ¶
func (article *Article) WaitToFile()
type ConsoleBar ¶
type ConsoleBar struct { All int64 Now int64 Width int NowWidth int Interval int Last time.Time LastMsg string LastWrite string LastBar string }
func NewConsoleBar ¶
func NewConsoleBar(all int64) (bar *ConsoleBar, err error)
func (*ConsoleBar) Add ¶
func (pro *ConsoleBar) Add(i int) int
func (*ConsoleBar) Error ¶
func (pro *ConsoleBar) Error(err error)
func (*ConsoleBar) Finished ¶
func (pro *ConsoleBar) Finished()
func (*ConsoleBar) GetPercent ¶
func (pro *ConsoleBar) GetPercent() float32
func (*ConsoleBar) Increment ¶
func (pro *ConsoleBar) Increment() int
func (*ConsoleBar) Println ¶
func (pro *ConsoleBar) Println(args ...interface{})
func (*ConsoleBar) Reset ¶
func (pro *ConsoleBar) Reset()
func (*ConsoleBar) SetAll ¶
func (pro *ConsoleBar) SetAll(all int)
func (*ConsoleBar) SetMsg ¶
func (pro *ConsoleBar) SetMsg(msg string)
func (*ConsoleBar) Update ¶
func (pro *ConsoleBar) Update()
func (*ConsoleBar) Write ¶
func (pro *ConsoleBar) Write(args ...interface{})
type EnumeConfig ¶
type EnumeConfig struct { Domain string Proxy string Proxy2 string Proxy3 string Output string Names []string IdFile string StartId int EndId int Template map[string]string }
func ReadConf ¶
func ReadConf(f string) (config *EnumeConfig)
func (*EnumeConfig) Marshal ¶
func (conf *EnumeConfig) Marshal() string
type FilterOption ¶
type NewDialler ¶
type Payloader ¶
type Payloader string
func (Payloader) Lines ¶
func (pay Payloader) Lines() (a ArrayFilter)
type ProxyDiallerPool ¶
type RunnerPool ¶
type RunnerPool struct { Thread int Handle func(arg string, tryTime int) interface{} After func(res Result, loger Loger) ErrDo func(error, int, Result, Loger) // Loger Loger RetryTime int LogLevl int Bar *ConsoleBar // contains filtered or unexported fields }
func NewAwaitPool ¶
func NewAwaitPool(thread int) (pool *RunnerPool)
func (*RunnerPool) Loop ¶
func (pool *RunnerPool) Loop(args []string, showBar bool)
func (*RunnerPool) LoopByFunc ¶
func (pool *RunnerPool) LoopByFunc(generate func() (string, bool))
func (*RunnerPool) Tick ¶
func (pool *RunnerPool) Tick(sec int)
type Session ¶
type Session struct { Header map[string]string Transprot httplib.Transport Timeout int RandomeUA bool MultiGetRetryTime int Proxy string Document *goquery.Document }
func NewSession ¶
func NewSession() (sess *Session)
func (*Session) CheckAlive ¶
func (*Session) Get ¶
func (session *Session) Get(url string, proxy ...interface{}) (resp *SmartResponse, err error)
Get accepts an optional proxy argument. Example forms: socks5://xxx.x.x.x.x:port, ss://xxasfsfs, ssr://xasfsaf, or General.Config{...}
func (*Session) GetsWith ¶
func (sess *Session) GetsWith(urltemp string, mapFuncs Gfunc, handleRes func(loger Loger, res *SmartResponse, err error), thread int, proxy ...interface{})
HttpByCustom
Example urltemp: "https://www.baidu.com/?uid={id}"
GetsWith("https://www.baidu.com/?uid={id}", func(p Payloader) (string, bool){ defaultvalue := 0 p.SetValue("id", func(v Value)Value{ return v.Add(2) }, defaultvalue) })
func (*Session) Json ¶
func (session *Session) Json(url string, data map[string]interface{}, proxy ...interface{}) (resp *SmartResponse, err error)
func (*Session) Send ¶
func (session *Session) Send(raw string, proxy ...interface{}) (resp *SmartResponse, err error)
func (*Session) SetProxyDialer ¶
func (*Session) SetSocks5Proxy ¶
func (*Session) SetTimeout ¶
func (*Session) StartAsync ¶
func (*Session) TestErrorPage ¶
type SmartResponse ¶
func ParseRawData ¶
func ParseRawData(buf []byte, url string) (r *SmartResponse, err error)
func (*SmartResponse) Base64 ¶
func (res *SmartResponse) Base64() string
func (*SmartResponse) Base64Mime ¶
func (res *SmartResponse) Base64Mime() []byte
func (*SmartResponse) CssExtract ¶
func (res *SmartResponse) CssExtract(cssSelctors Dict) (out G)
CssExtract raw | href | id | class
example: CssExtract(Dict{ "name": "div.names#one " , // will return *goquery.Selection "nameText": "div.names#one | raw " , // will return node's string "imgLink" : "img#head | href ", })
func (*SmartResponse) CssSelect ¶
func (res *SmartResponse) CssSelect(css string, each func(i int, s *Selection))
func (*SmartResponse) FastCheckLineByLine ¶
func (res *SmartResponse) FastCheckLineByLine(found func(line string) bool) (string, bool)
func (*SmartResponse) FileLinks ¶
func (res *SmartResponse) FileLinks(includeouter ...bool) (s []string)
func (*SmartResponse) HashMMH3 ¶
func (res *SmartResponse) HashMMH3() int32
func (*SmartResponse) HashMMH3Base64 ¶
func (res *SmartResponse) HashMMH3Base64() int32
func (*SmartResponse) HeaderJson ¶
func (smartres *SmartResponse) HeaderJson() string
func (*SmartResponse) HeaderString ¶
func (smartres *SmartResponse) HeaderString() (d string)
func (*SmartResponse) Html ¶
func (smartres *SmartResponse) Html() []byte
func (*SmartResponse) Json ¶
func (smartres *SmartResponse) Json(obj ...interface{}) (jdata map[string]interface{})
func (*SmartResponse) Links ¶
func (res *SmartResponse) Links(includeouter ...bool) (s []string)
func (*SmartResponse) PageTextHash ¶
func (res *SmartResponse) PageTextHash() string
func (*SmartResponse) ReExtractString ¶
func (smartres *SmartResponse) ReExtractString(re string) []string
Get regex group
func (*SmartResponse) RequestURL ¶
func (smartres *SmartResponse) RequestURL() *url.URL
func (*SmartResponse) String ¶
func (smartres *SmartResponse) String() string
func (*SmartResponse) Text ¶
func (resp *SmartResponse) Text() string
type WithOper ¶
type WithOper struct { URL *url.URL Document *goquery.Document LastSelections []*goquery.Selection Links Links Article *Article Err error // contains filtered or unexported fields }
func (*WithOper) AsSiteMap ¶
func (with *WithOper) AsSiteMap(do func(out *AsyncOut), breakpointContinue bool, showState bool, filter func(chanelUrl string) bool) *WithOper
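AsSiteMap crawls a site map; it handles most of the standard XML sitemap format.
>@breakpointContinue enables resuming from a breakpoint: already-crawled pages are automatically read from and stored to /tmp/default-skip.txt and /tmp/skip-site.txt
>@showState enables the status display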
>@filter filters each channel by its URL: return true to enter the channel, false to skip it
example: func(u string) bool { return strings.Contains(u, "/zh/") }
func (*WithOper) News ¶
func (with *WithOper) News(filters ...FilterOption) *WithOper
func (*WithOper) PreTestSkip ¶
func (*WithOper) SimpleNews ¶
func (*WithOper) StartCache ¶
Click to show internal directories.
Click to hide internal directories.