Documentation ¶
Overview ¶
import (
"github.com/henrylee2cn/surfer" "io/ioutil" "log"
)
func main() { // Use surf engine resp, err := surfer.Download(&surfer.Request{ Url: "http://github.com/henrylee2cn/surfer", }) if err != nil { log.Fatal(err) } b, err := ioutil.ReadAll(resp.Body) log.Println(string(b), err) // Use phantomjs engine resp, err = surfer.Download(&surfer.Request{ Url: "http://github.com/henrylee2cn", DownloaderID: 1, }) if err != nil { log.Fatal(err) } b, err = ioutil.ReadAll(resp.Body) log.Println(string(b), err) resp.Body.Close() surfer.DestroyJsFiles() }
Index ¶
- Constants
- Variables
- func AutoToUTF8(resp *http.Response) error
- func BodyBytes(resp *http.Response) ([]byte, error)
- func CreateDefault(browser string) string
- func CreateReal() string
- func CreateVersion(browser, version string) string
- func DestroyJsFiles()
- func Download(req *Request) (resp *http.Response, err error)
- func Format(bname, bver string) string
- func GetWDPath() string
- func IsDirExists(path string) bool
- func IsFileExists(path string) bool
- func TopVersion(bname string) string
- func UrlEncode(urlStr string) (*url.URL, error)
- func WalkDir(targpath string, suffixes ...string) (dirlist []string)
- type Bytes
- type Content
- type File
- type Form
- type Formats
- type JSONObj
- type OSAttributes
- type Phantom
- type Request
- type RespBody
- type Response
- type Surf
- type Surfer
- type TemplateData
- type UAData
- type UATable
- type XMLObj
Constants ¶
const ( // Windows operating system. Windows int = iota // Linux based operating system. Linux // Macintosh /OS X operating system. Macintosh )
const ( SurfID = 0 // Surf下载器标识符 PhomtomJsID = 1 // PhomtomJs下载器标识符 DefaultMethod = "GET" // 默认请求方法 DefaultDialTimeout = 2 * time.Minute // 默认请求服务器超时 DefaultConnTimeout = 2 * time.Minute // 默认下载超时 DefaultTryTimes = 3 // 默认最大下载次数 DefaultRetryPause = 2 * time.Second // 默认重新下载前停顿时长 )
constant
Variables ¶
var Database = UATable{ "chrome": { "37.0.2049.0", Windows, Formats{ "37": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "36": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "35": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "34": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "33": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "32": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", "30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36", }, }, "firefox": { "31.0", Windows, Formats{ "31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:31.0) Gecko/20100101 Firefox/{{.Ver}}", "30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:30.0) Gecko/20120101 Firefox/{{.Ver}}", "29": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:29.0) Gecko/20120101 Firefox/{{.Ver}}", "28": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:28.0) Gecko/20100101 Firefox/{{.Ver}}", "27": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:27.0) Gecko/20130101 Firefox/{{.Ver}}", "26": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:26.0) Gecko/20121011 Firefox/{{.Ver}}", "25": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:25.0) Gecko/20100101 Firefox/{{.Ver}}", }, }, "msie": { "10.0", Windows, Formats{ "10": "Mozilla/5.0 (compatible; MSIE 10.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.5.30729)", "9": "Mozilla/5.0 (compatible; MSIE 9.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.0.30729)", "8": "Mozilla/5.0 (compatible; MSIE 8.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/4.0; .NET CLR 3.0.04320)", "7": "Mozilla/4.0 (compatible; MSIE 7.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}.NET CLR 2.0.50727)", }, }, "opera": { "12.14", Windows, Formats{ "12": "Opera/9.80 ({{.OSN}} {{.OSV}}; 
U{{.Coms}}) Presto/2.9.181 Version/{{.Ver}}", "11": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.7.62 Version/{{.Ver}}", "10": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.2.15 Version/{{.Ver}}", "9": "Opera/9.00 ({{.OSN}} {{.OSV}}; U{{.Coms}})", }, }, "safari": { "6.0", Macintosh, Formats{ "6": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/536.26 (KHTML, like Gecko) Version/{{.Ver}} Safari/8536.25", "5": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/531.2+ (KHTML, like Gecko) Version/{{.Ver}} Safari/531.2+", "4": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/528.16 (KHTML, like Gecko) Version/{{.Ver}} Safari/528.16", }, }, "itunes": { "9.1.1", Macintosh, Formats{ "9": "iTunes/{{.Ver}}", "8": "iTunes/{{.Ver}}", "7": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.7{{.Coms}})", "6": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.5{{.Coms}})", }, }, "aol": { "9.7", Windows, Formats{ "9": "Mozilla/5.0 (compatible; MSIE 9.0; AOL {{.Ver}}; AOLBuild 4343.19; {{.OSN}} {{.OSV}}; WOW64; Trident/5.0; FunWebProducts{{.Coms}})", "8": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0.50727{{.Coms}})", "7": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; FunWebProducts{{.Coms}})", "6": "Mozilla/4.0 (compatible; MSIE 6.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}{{.Coms}})", }, }, "konqueror": { "4.9", Linux, Formats{ "4": "Mozilla/5.0 (compatible; Konqueror/4.0; {{.OSN}}{{.Coms}}) KHTML/4.0.3 (like Gecko)", "3": "Mozilla/5.0 (compatible; Konqueror/3.0-rc6; i686 {{.OSN}}; 20021127{{.Coms}})", "2": "Mozilla/5.0 (compatible; Konqueror/2.1.1; {{.OSN}}{{.Coms}})", }, }, "netscape": { "9.1.0285", Windows, Formats{ "9": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.9.2.4{{.Coms}}) Gecko/20070321 Netscape/{{.Ver}}", "8": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.7.5{{.Coms}}) Gecko/20050519 Netscape/{{.Ver}}", "7": "Mozilla/5.0 
({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.0.1{{.Coms}}) Gecko/20020921 Netscape/{{.Ver}}", }, }, "lynx": { "2.8.8dev.3", Linux, Formats{ "2": "Lynx/{{.Ver}} libwww-FM/2.14 SSL-MM/1.4.1", "1": "Lynx (textmode)", }, }, "googlebot": { "2.1", Linux, Formats{ "2": "Mozilla/5.0 (compatible; Googlebot/{{.Ver}}; +http://www.google.com/bot.html{{.Coms}})", "1": "Googlebot/{{.Ver}} (+http://www.google.com/bot.html{{.Coms}})", }, }, "bingbot": { "2.0", Windows, Formats{ "2": "Mozilla/5.0 (compatible; bingbot/{{.Ver}}; +http://www.bing.com/bingbot.htm{{.Coms}})", }, }, "yahoobot": { "2.0", Linux, Formats{ "2": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp{{.Coms}})", }, }, "default": { "1.0", Linux, Formats{ "1": "{{.Name}}/{{.Ver}} ({{.OSN}} {{.OSV}}{{.Coms}})", }, }, }
Database is the "database" of user agents.
var DefaultOSAttributes = map[int]OSAttributes{ Windows: {"Windows NT", "6.3", []string{"x64"}}, Linux: {"Linux", "3.16.1", []string{"x64"}}, Macintosh: {"Intel Mac OS X", "10_6_8", []string{}}, }
DefaultOSAttributes stores default OS attributes.
var UserAgents = map[string][]string{}
UserAgents holds all User-Agent strings.
Functions ¶
func AutoToUTF8 ¶
AutoToUTF8 采用surf内核下载时,可以尝试自动转码为utf8;采用phantomjs内核时,无需转码(已是utf8)。
func CreateDefault ¶
CreateDefault returns a user agent string using default values.
func CreateReal ¶
func CreateReal() string
CreateReal generates and returns a complete user agent string.
func CreateVersion ¶
CreateVersion generates and returns a complete user agent string for a specific browser version.
func Format ¶
Format returns the format string for the given browser name and version.
When a format can't be found for a version, the first format string for the browser is returned. When a format can't be found for the browser the default format is returned.
func TopVersion ¶
TopVersion returns the most recent version for the given browser name.
Types ¶
type Form ¶
type Form struct { // Values [field name]-[]value Values map[string][]string // Files [field name]-[]File Files map[string][]File }
Form implements the body interface.
type Formats ¶
Formats is a collection of UA format strings. The key is the browser version; the value is the browser info.
type OSAttributes ¶
type OSAttributes struct { // OSName is the operating system name. OSName string // OSVersion is the operating system version. OSVersion string // Comments are additional comments to add to a user agent string. Comments []string }
OSAttributes stores OS attributes.
type Phantom ¶
type Phantom struct { PhantomjsFile string //Phantomjs完整文件名 TempJsDir string //临时js存放目录 // contains filtered or unexported fields }
Phantom 基于Phantomjs的下载器实现,作为surfer的补充。效率较surfer会慢很多,但是因为模拟浏览器,破防性更好。支持UserAgent/TryTimes/RetryPause/自定义js。
type Request ¶
type Request struct { // url (必须填写) Url string // GET POST HEAD (默认为GET) Method string // http header Header http.Header // 是否使用cookies,在Spider的EnableCookie设置 EnableCookie bool // request body interface Body body // dial tcp: i/o timeout DialTimeout time.Duration // WSARecv tcp: i/o timeout ConnTimeout time.Duration // the max times of download TryTimes int // how long pause when retry RetryPause time.Duration // max redirect times // when RedirectTimes equal 0, redirect times is ∞ // when RedirectTimes less than 0, redirect times is 0 RedirectTimes int // the download ProxyHost Proxy string // 指定下载器ID // 0为Surf高并发下载器,各种控制功能齐全 // 1为PhantomJS下载器,特点破防力强,速度慢,低并发 DownloaderID int // contains filtered or unexported fields }
Request contains the necessary prerequisite information.
type Surf ¶
type Surf struct {
// contains filtered or unexported fields
}
Surf is the default Download implementation.
type Surfer ¶
type Surfer interface { // GET @param url string, header http.Header, cookies []*http.Cookie // HEAD @param url string, header http.Header, cookies []*http.Cookie // POST PostForm @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie // POST-M PostMultipart @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie Download(*Request) (resp *http.Response, err error) }
Surfer represents the core of an HTTP web browser for crawlers.
func NewPhantom ¶
NewPhantom 创建一个Phantomjs下载器
type TemplateData ¶
TemplateData structure for template data.