Documentation
¶
Overview ¶
Copyright 2015 andeya Author. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Package surfer is a high-concurrency HTTP client.
It has `surf` and `phantom` download engines, closely simulates browser behavior, supports simulated login, and so on.
Features: - Supports both the surf and phantomjs engines - Supports random User-Agent - Supports cookie caching - Supports http/https
Usage: package main
import (
"github.com/andeya/surfer" "io/ioutil" "log"
)
func main() { // Use surf engine resp, err := surfer.Download(&surfer.Request{ Url: "http://github.com/andeya/surfer", }) if err != nil { log.Fatal(err) } b, err := ioutil.ReadAll(resp.Body) log.Println(string(b), err) // Use phantomjs engine resp, err = surfer.Download(&surfer.Request{ Url: "http://github.com/andeya", DownloaderID: 1, }) if err != nil { log.Fatal(err) } b, err = ioutil.ReadAll(resp.Body) log.Println(string(b), err) resp.Body.Close() surfer.DestroyJsFiles() }
Index ¶
- Constants
- Variables
- func AutoToUTF8(resp *http.Response) error
- func BodyBytes(resp *http.Response) ([]byte, error)
- func CreateDefault(browser string) string
- func CreateReal() string
- func CreateVersion(browser, version string) string
- func DestroyJsFiles()
- func Download(req *Request) (resp *http.Response, err error)
- func Format(bname, bver string) string
- func GetWDPath() string
- func IsDirExists(path string) bool
- func IsFileExists(path string) bool
- func SetPhantomJsFilePath(filePath string)
- func TopVersion(bname string) string
- func UrlEncode(urlStr string) (*url.URL, error)
- func WalkDir(targpath string, suffixes ...string) (dirlist []string)
- type Bytes
- type Content
- type Cookie
- type DnsCache
- type File
- type Form
- type Formats
- type JSONObj
- type OSAttributes
- type Phantom
- type Request
- type RespBody
- type Response
- type Surf
- type Surfer
- type TemplateData
- type UAData
- type UATable
- type XMLObj
Constants ¶
const ( // Windows operating system. Windows int = iota // Linux based operating system. Linux // Macintosh /OS X operating system. Macintosh )
const ( SurfID = 0 // Surf下载器标识符 PhomtomJsID = 1 // PhomtomJs下载器标识符 DefaultMethod = "GET" // 默认请求方法 DefaultDialTimeout = 2 * time.Minute // 默认请求服务器超时 DefaultConnTimeout = 2 * time.Minute // 默认下载超时 DefaultTryTimes = 3 // 默认最大下载次数 DefaultRetryPause = 2 * time.Second // 默认重新下载前停顿时长 )
constant
Variables ¶
var Database = UATable{ "chrome": { "37.0.2049.0", Windows, Formats{ "37": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "36": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "35": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "34": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "33": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "32": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", }, }, "firefox": { "31.0", Windows, Formats{ "31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:31.0) Gecko/20100101 Firefox/{{.Ver}}", "30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:30.0) Gecko/20120101 Firefox/{{.Ver}}", "29": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:29.0) Gecko/20120101 Firefox/{{.Ver}}", "28": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:28.0) Gecko/20100101 Firefox/{{.Ver}}", "27": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:27.0) Gecko/20130101 Firefox/{{.Ver}}", "26": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:26.0) Gecko/20121011 Firefox/{{.Ver}}", "25": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:25.0) Gecko/20100101 Firefox/{{.Ver}}", }, }, "msie": { "10.0", Windows, Formats{ "10": "Mozilla/5.0 (compatible; MSIE 10.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.5.30729)", "9": "Mozilla/5.0 (compatible; MSIE 9.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.0.30729)", "8": "Mozilla/5.0 (compatible; MSIE 8.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/4.0; .NET CLR 3.0.04320)", "7": "Mozilla/4.0 (compatible; MSIE 7.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}.NET CLR 2.0.50727)", }, }, "opera": { "12.14", Windows, Formats{ "12": "Opera/9.80 ({{.OSN}} {{.OSV}}; 
U{{.Coms}}) Presto/2.9.181 Version/{{.Ver}}", "11": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.7.62 Version/{{.Ver}}", "10": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.2.15 Version/{{.Ver}}", "9": "Opera/9.00 ({{.OSN}} {{.OSV}}; U{{.Coms}})", }, }, "safari": { "6.0", Macintosh, Formats{ "6": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/536.26 (KHTML, like Gecko) Version/{{.Ver}} Safari/8536.25", "5": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/531.2+ (KHTML, like Gecko) Version/{{.Ver}} Safari/531.2+", "4": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/528.16 (KHTML, like Gecko) Version/{{.Ver}} Safari/528.16", }, }, "itunes": { "9.1.1", Macintosh, Formats{ "9": "iTunes/{{.Ver}}", "8": "iTunes/{{.Ver}}", "7": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.7{{.Coms}})", "6": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.5{{.Coms}})", }, }, "aol": { "9.7", Windows, Formats{ "9": "Mozilla/5.0 (compatible; MSIE 9.0; AOL {{.Ver}}; AOLBuild 4343.19; {{.OSN}} {{.OSV}}; WOW64; Trident/5.0; FunWebProducts{{.Coms}})", "8": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0.50727{{.Coms}})", "7": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; FunWebProducts{{.Coms}})", "6": "Mozilla/4.0 (compatible; MSIE 6.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}{{.Coms}})", }, }, "konqueror": { "4.9", Linux, Formats{ "4": "Mozilla/5.0 (compatible; Konqueror/4.0; {{.OSN}}{{.Coms}}) KHTML/4.0.3 (like Gecko)", "3": "Mozilla/5.0 (compatible; Konqueror/3.0-rc6; i686 {{.OSN}}; 20021127{{.Coms}})", "2": "Mozilla/5.0 (compatible; Konqueror/2.1.1; {{.OSN}}{{.Coms}})", }, }, "netscape": { "9.1.0285", Windows, Formats{ "9": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.9.2.4{{.Coms}}) Gecko/20070321 Netscape/{{.Ver}}", "8": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.7.5{{.Coms}}) Gecko/20050519 Netscape/{{.Ver}}", "7": "Mozilla/5.0 
({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.0.1{{.Coms}}) Gecko/20020921 Netscape/{{.Ver}}", }, }, "lynx": { "2.8.8dev.3", Linux, Formats{ "2": "Lynx/{{.Ver}} libwww-FM/2.14 SSL-MM/1.4.1", "1": "Lynx (textmode)", }, }, "googlebot": { "2.1", Linux, Formats{ "2": "Mozilla/5.0 (compatible; Googlebot/{{.Ver}}; +http://www.google.com/bot.html{{.Coms}})", "1": "Googlebot/{{.Ver}} (+http://www.google.com/bot.html{{.Coms}})", }, }, "bingbot": { "2.0", Windows, Formats{ "2": "Mozilla/5.0 (compatible; bingbot/{{.Ver}}; +http://www.bing.com/bingbot.htm{{.Coms}})", }, }, "yahoobot": { "2.0", Linux, Formats{ "2": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp{{.Coms}})", }, }, "default": { "1.0", Linux, Formats{ "1": "{{.Name}}/{{.Ver}} ({{.OSN}} {{.OSV}}{{.Coms}})", }, }, }
Database is the "database" of user agents.
var DefaultOSAttributes = map[int]OSAttributes{ Windows: {"Windows NT", "6.3", []string{"x64"}}, Linux: {"Linux", "3.16.1", []string{"x64"}}, Macintosh: {"Intel Mac OS X", "10_6_8", []string{}}, }
DefaultOSAttributes stores default OS attributes.
var UserAgents = map[string][]string{}
UserAgents holds all User-Agent strings.
Functions ¶
func AutoToUTF8 ¶
AutoToUTF8 采用surf内核下载时,可以尝试自动转码为utf8 采用phantomjs内核时,无需转码(已是utf8)
func CreateDefault ¶
CreateDefault returns a user agent string using default values.
func CreateReal ¶
func CreateReal() string
CreateReal generates and returns a complete user agent string.
func CreateVersion ¶
CreateVersion generates and returns a complete user agent string for a specific browser version.
func Format ¶
Format returns the format string for the given browser name and version.
When a format can't be found for a version, the first format string for the browser is returned. When a format can't be found for the browser the default format is returned.
func TopVersion ¶
TopVersion returns the most recent version for the given browser name.
Types ¶
type Cookie ¶
type Cookie struct { Name string `json:"name"` Value string `json:"value"` Domain string `json:"domain"` Path string `json:"path"` }
给phantomjs传输cookie用
type DnsCache ¶
type DnsCache struct {
// contains filtered or unexported fields
}
DnsCache is a DNS cache.
type Form ¶
type Form struct { // Values [field name]-[]value Values map[string][]string // Files [field name]-[]File Files map[string][]File }
Form implements the body interface.
type Formats ¶
Formats is a collection of UA format strings. The key is the browser version; the value is the browser info.
type OSAttributes ¶
type OSAttributes struct { // OSName is the operating system name. OSName string // OSVersion is the operating system version. OSVersion string // Comments are additional comments to add to a user agent string. Comments []string }
OSAttributes stores OS attributes.
type Phantom ¶
type Phantom struct { PhantomjsFile string // Phantomjs完整文件名 TempJsDir string // 临时js存放目录 CookieJar *cookiejar.Jar // contains filtered or unexported fields }
Phantom 基于Phantomjs的下载器实现,作为surfer的补充 效率较surfer会慢很多,但是因为模拟浏览器,破防性更好 支持UserAgent/TryTimes/RetryPause/自定义js
type Request ¶
type Request struct { // url (必须填写) Url string // GET POST HEAD (默认为GET) Method string // http header Header http.Header // 是否使用cookies,在Spider的EnableCookie设置 EnableCookie bool // request body interface Body body // dial tcp: i/o timeout DialTimeout time.Duration // WSARecv tcp: i/o timeout ConnTimeout time.Duration // the max times of download TryTimes int // how long pause when retry RetryPause time.Duration // max redirect times // when RedirectTimes equal 0, redirect times is ∞ // when RedirectTimes less than 0, redirect times is 0 RedirectTimes int // the download ProxyHost Proxy string // 指定下载器ID // 0为Surf高并发下载器,各种控制功能齐全 // 1为PhantomJS下载器,特点破防力强,速度慢,低并发 DownloaderID int // contains filtered or unexported fields }
Request contains the necessary prerequisite information.
type Response ¶
type Response struct { Cookies []string Body string Error string Header []struct { Name string Value string } }
Response 用于解析Phantomjs的响应内容
type Surfer ¶
type Surfer interface { // GET @param url string, header http.Header, cookies []*http.Cookie // HEAD @param url string, header http.Header, cookies []*http.Cookie // POST PostForm @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie // POST-M PostMultipart @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie Download(*Request) (resp *http.Response, err error) }
Surfer represents the core of an HTTP web browser for crawlers.
type TemplateData ¶
TemplateData structure for template data.