Crawlab Go SDK
The Crawlab Go SDK lets Golang-based spiders integrate with Crawlab. It provides APIs for tasks such as saving crawled items to a range of data sources, including MongoDB, MySQL, Postgres, ElasticSearch, and Kafka.
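Assuming the module path used in the examples below, the SDK can typically be installed with go get:

go get github.com/crawlab-team/crawlab-go-sdk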
Basic Usage
package main

import (
	"github.com/crawlab-team/crawlab-go-sdk"
	"github.com/crawlab-team/crawlab-go-sdk/entity"
)

func main() {
	// Build an item as a set of key-value fields.
	item := entity.Item{}
	item["url"] = "http://example.com"
	item["title"] = "hello world"

	// Save the item to the data source configured in Crawlab.
	_ = crawlab.SaveItem(item)
}
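SaveItem returns an error, which real spiders should check rather than discard. A minimal sketch, assuming the same package and using hypothetical URLs, that saves several items and logs failures with the standard library logger:

package main

import (
	"log"

	"github.com/crawlab-team/crawlab-go-sdk"
	"github.com/crawlab-team/crawlab-go-sdk/entity"
)

func main() {
	// Hypothetical URLs, for illustration only.
	urls := []string{"http://example.com/a", "http://example.com/b"}
	for _, u := range urls {
		item := entity.Item{}
		item["url"] = u
		if err := crawlab.SaveItem(item); err != nil {
			// Log the failure and continue with the remaining items.
			log.Printf("save item failed: %v", err)
		}
	}
}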
Example Using Colly
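The example below uses the Colly scraping framework to crawl Baidu search results for the keyword "crawlab" and saves each extracted result through the SDK.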
package main

import (
	"fmt"
	"runtime/debug"

	"github.com/apex/log"
	"github.com/crawlab-team/crawlab-go-sdk"
	"github.com/crawlab-team/crawlab-go-sdk/entity"
	"github.com/gocolly/colly/v2"
)

func main() {
	startUrl := "https://www.baidu.com/s?wd=crawlab"

	c := colly.NewCollector(
		colly.AllowedDomains("www.baidu.com"),
		colly.Async(true),
		colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"),
	)

	// Extract each search result block and save it as a Crawlab item.
	c.OnHTML("#content_left > .c-container", func(e *colly.HTMLElement) {
		item := entity.Item{}
		item["title"] = e.ChildText("h3.t > a")
		item["url"] = e.ChildAttr("h3.t > a", "href")
		if err := crawlab.SaveItem(item); err != nil {
			log.Errorf("save item error: %v", err)
			debug.PrintStack()
			return
		}
	})

	c.OnRequest(func(r *colly.Request) {
		log.Debugf("Visiting %s", r.URL.String())
	})

	if err := c.Visit(startUrl); err != nil {
		log.Errorf("visit error: %v", err)
		debug.PrintStack()
		panic(fmt.Sprintf("Unable to visit %s", startUrl))
	}

	// The collector runs asynchronously, so block until all requests finish.
	c.Wait()
}
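Because the collector is created with colly.Async(true), Visit returns as soon as the request is queued and pages are fetched concurrently; c.Wait() blocks until all outstanding requests and their OnHTML callbacks complete, so every extracted item is saved before main returns.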