goscraper

package module
v0.0.0-...-a33c43a Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 6, 2018 License: MIT Imports: 18 Imported by: 0

README

goscraper Build Status

A simple Go scraper built on colly, the scraping framework.

install

go get -u github.com/ynishi/goscraper
goscraper

contribute

  • Contributions are welcome: open an issue or a pull request!

LICENSE

  • MIT, see LICENSE

Documentation

Index

Constants

View Source
const (
	OptSCRP          = "scrp"
	OptDOMAIN        = "domain"
	OptUA            = "ua"
	OptENTRY         = "entry"
	OptLOGINURL      = "loginURL"
	OptFORM_USERNAME = "form_username"
	OptUSERNAME      = "username"
	OptFORM_PASSWORD = "form_password"
	OptPASSWORD      = "password"
	OptMAXDEPTH      = "maxdepth"
	OptCONFIG        = "config"
	OptUSECONFIG     = "useConfig"
	OptOUTTYPE       = "outtype"
	OptOUTPUTCSV     = "csv"
	OptOUTPUTJSON    = "json"
	OptOUTFILE       = "outfile"
	OptDISURLFILTER  = "disurlfilter"
	OptURLFILTER     = "urlfilter"
	OptDBUSERNAME    = "dbusername"
	OptDBPASSWORD    = "dbpassword"
	OptDBDATABASE    = "dbdatabase"
	OptDBHOST        = "dbhost"
	OptDBPORT        = "dbport"
	OptLINKSELECTOR  = "linkselector"
	OptISDOPOST      = "isdopost"
	OptCHECKLOGIN    = "checklogin"
)

Variables

View Source
var FormTypeBtn = map[string]bool{
	"submit": true,
	"image":  true,
	"reset":  true,
	"button": true,
}

Functions

func BrowseLink(link Link, driver *agouti.WebDriver, db *sql.DB) (id *string, err error)

func Link2Click

func Link2Click(link Link, selection *agouti.Selection) *agouti.Selection

func Links2Json

func Links2Json(links Links) (b []byte, err error)
func LogLink(logger log.Logger, msg string, link *Link)

func MakeOutFilename

func MakeOutFilename(outfile, outtype string) (filename string)

func NewDriver

func NewDriver() (*agouti.WebDriver, error)

func Str2filters

func Str2filters(str, sep string) (filters []*regexp.Regexp)

func SummaryURL

func SummaryURL(u1, u2 *url.URL) (map[url.URL]bool, error)

func UniqURL

func UniqURL(links Links) (urls []*url.URL)

func WriteLinks2Csv

func WriteLinks2Csv(links Links, w io.Writer) (err error)

Types

type Browser

type Browser struct {
	Driver *agouti.WebDriver
	Db     *sql.DB
	Logger log.Logger
	Links  Links
}

func NewBrowser

func NewBrowser(config *BrowserConfig) (*Browser, error)

func (*Browser) Browse

func (b *Browser) Browse() error
func (b *Browser) BrowseLinks(links Links, driver *agouti.WebDriver, db *sql.DB) (err error)

type BrowserConfig

type BrowserConfig struct {
	Driver *agouti.WebDriver
	Db     *sql.DB
	Logger log.Logger
	Links  Links
}

type Config

type Config struct {
	Links        Links
	Logger       log.Logger
	Collector    *colly.Collector
	LoginURL     string
	LoginData    map[string]string
	Entry        string
	OutFile      string
	OutType      string
	LinkSelector string
	IsDoPost     bool
	CheckLogin   string
}

type GeneralLog

type GeneralLog struct {
	Event_time string
	User_host  string
	Argument   string
}
type Link struct {
	From        url.URL `json:"from"`
	To          url.URL `json:"to"`
	AttrId      string  `json:"attr_id"`
	AttrOnClick string  `json:"attr_onclick"`
	Text        string  `json:"text"`
	Tag         string  `json:"tag"`
	Method      string  `json:"method"`
	Selector    string  `json:"selector"`
}
func E2Link(e *colly.HTMLElement) (link *Link, err error)

type LinkScraper

type LinkScraper struct {
	Collector    *colly.Collector
	Links        Links
	Logger       log.Logger
	LoginURL     string
	LoginData    map[string]string
	Entry        string
	OutFile      string
	OutType      string
	LinkSelector string
	IsDoPost     bool
	CheckLogin   string
	URLs         []*url.URL
}

func DefaultLinkScraper

func DefaultLinkScraper() *LinkScraper

func NewLinkScraper

func NewLinkScraper(config *Config) (*LinkScraper, error)

func (*LinkScraper) FlushURLs

func (ls *LinkScraper) FlushURLs() []*url.URL

func (*LinkScraper) IsLogin

func (ls *LinkScraper) IsLogin(e *colly.HTMLElement) bool

func (*LinkScraper) Login

func (ls *LinkScraper) Login() (err error)

func (*LinkScraper) LoginE

func (ls *LinkScraper) LoginE(e *colly.HTMLElement) (err error)

func (*LinkScraper) Output

func (ls *LinkScraper) Output() (err error)

func (*LinkScraper) Scrape

func (ls *LinkScraper) Scrape() (err error)
type Links map[Link]bool

func Add

func Add(links Links, link *Link) (res Links, ok bool)
func SummaryLink(links Links) (res Links, err error)

Directories

Path Synopsis
cmd

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL