Crawlab Go SDK
Crawlab Go SDK supports integrating Golang-based spiders with Crawlab. It provides a number of APIs, including one for saving crawled items to different data sources such as MongoDB, MySQL, Postgres, Elasticsearch and Kafka.
Basic Usage
package main
import (
"github.com/xulei324/spiderlab-go-sdk"
"github.com/xulei324/spiderlab-go-sdk/entity"
)
// main builds a sample item with a URL and a title and passes it to
// crawlab.SaveItem.
func main() {
	// Populate both fields in a single literal rather than key by key.
	record := entity.Item{
		"url":   "http://example.com",
		"title": "hello world",
	}

	// The returned error is explicitly discarded to keep the example minimal.
	_ = crawlab.SaveItem(record)
}
Example Using Colly
package main
import (
"fmt"
"github.com/apex/log"
"github.com/xulei324/spiderlab-go-sdk"
"github.com/xulei324/spiderlab-go-sdk/entity"
"github.com/gocolly/colly/v2"
"runtime/debug"
)
// main visits a Baidu search results page with Colly and saves the title and
// link of every result container through crawlab.SaveItem.
//
// It panics if the initial visit cannot be started.
func main() {
	startURL := "https://www.baidu.com/s?wd=crawlab"

	// Collector limited to www.baidu.com, running requests asynchronously
	// and sending a desktop Chrome user agent.
	c := colly.NewCollector(
		colly.AllowedDomains("www.baidu.com"),
		colly.Async(true),
		colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"),
	)

	// Each matched result container becomes one saved item.
	c.OnHTML("#content_left > .c-container", func(e *colly.HTMLElement) {
		item := entity.Item{}
		item["title"] = e.ChildText("h3.t > a")
		item["url"] = e.ChildAttr("h3.t > a", "href")
		if err := crawlab.SaveItem(item); err != nil {
			// Pass the error as an argument, never as the format string:
			// a '%' in the error text would otherwise be parsed as a verb.
			log.Errorf("save item error: %v", err)
			debug.PrintStack()
		}
	})

	c.OnRequest(func(r *colly.Request) {
		// URLs are frequently percent-encoded, so they must be formatted
		// through %s rather than used as the format string themselves.
		log.Debugf("Visiting %s", r.URL.String())
	})

	if err := c.Visit(startURL); err != nil {
		log.Errorf("visit error: %v", err)
		debug.PrintStack()
		panic(fmt.Sprintf("Unable to visit %s", startURL))
	}

	// The collector is asynchronous, so wait for it before main returns.
	c.Wait()
}