crawlab

package module
v1.0.4 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 1, 2021 License: BSD-3-Clause Imports: 4 Imported by: 0

README

Crawlab Go SDK

Crawlab Go SDK integrates Golang-based spiders with Crawlab. It provides a number of APIs, including ones for saving crawled items to different data sources such as MongoDB, MySQL, Postgres, ElasticSearch and Kafka.

Basic Usage

package main

import (
	"github.com/crawlab-team/crawlab-go-sdk"
	"github.com/crawlab-team/crawlab-go-sdk/entity"
)

// main builds a single item with a URL and a title and hands it to
// crawlab.SaveItem. The returned error is intentionally discarded to keep
// the example minimal.
func main() {
	item := entity.Item{
		"url":   "http://example.com",
		"title": "hello world",
	}
	_ = crawlab.SaveItem(item)
}

Example Using Colly

package main

import (
	"fmt"
	"github.com/apex/log"
	"github.com/crawlab-team/crawlab-go-sdk"
	"github.com/crawlab-team/crawlab-go-sdk/entity"
	"github.com/gocolly/colly/v2"
	"runtime/debug"
)

// main crawls the Baidu search results page for "crawlab" with Colly and
// saves the title and link of every result container through
// crawlab.SaveItem.
//
// It panics if the initial visit to the start URL cannot be issued.
func main() {
	startURL := "https://www.baidu.com/s?wd=crawlab"

	// The collector is restricted to www.baidu.com and runs asynchronously,
	// which is why c.Wait() is called at the end of main.
	c := colly.NewCollector(
		colly.AllowedDomains("www.baidu.com"),
		colly.Async(true),
		colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"),
	)

	// Each matched result container becomes one saved item.
	c.OnHTML("#content_left > .c-container", func(e *colly.HTMLElement) {
		item := entity.Item{}
		item["title"] = e.ChildText("h3.t > a")
		item["url"] = e.ChildAttr("h3.t > a", "href")
		if err := crawlab.SaveItem(item); err != nil {
			// Pass the error as an argument instead of concatenating it into
			// the format string, so a '%' in the message is not parsed as a
			// formatting verb.
			log.Errorf("save item error: %v", err)
			debug.PrintStack()
			return
		}
	})

	c.OnRequest(func(r *colly.Request) {
		// The URL is supplied as an argument for the same reason: URLs
		// routinely contain percent-escapes such as "%20".
		log.Debugf("Visiting %s", r.URL.String())
	})

	if err := c.Visit(startURL); err != nil {
		log.Errorf("visit error: %v", err)
		debug.PrintStack()
		panic(fmt.Sprintf("Unable to visit %s", startURL))
	}

	// Block until the asynchronous collector has finished its work.
	c.Wait()
}

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func IsExistItem

func IsExistItem(item entity.Item) (bool, error)

func NotifyTarget

func NotifyTarget(item entity.Item, files []string) error

Interface (original comment: 接口)

func SaveFile

func SaveFile(name string, data []byte) error

func SaveFileFrom

func SaveFileFrom(name string, reader io.Reader, timer func()) error

func SaveItem

func SaveItem(item entity.Item) (err error)

Types

This section is empty.

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL