Documentation ¶
Overview ¶
Example ¶
package main

import (
	"fmt"

	"goscrapper"
)

func main() {
	web := goscrapper.NewScrapper("https://www.domain.com")

	// scrape headers info
	fmt.Println(web.Title())
	fmt.Println(web.CSRFToken())
	fmt.Println(web.ContentType())

	// scrape all headers
	fmt.Println(web.Headers())

	// scrape paragraphs
	fmt.Println(web.Paragraphs())
	fmt.Println(web.CleanParagraphs())

	// scrape images and links and commonly interesting details
	fmt.Println(web.Links())
	fmt.Println(web.InternalLinks())
	fmt.Println(web.ExternalLinks())
	fmt.Println(web.LinksWithDetails())
	fmt.Println(web.Images())
	fmt.Println(web.ImagesWithDetails())

	// scrape emails
	fmt.Println(web.Emails())

	// scrape using custom query
	quotes := web.Query(goscrapper.Query{Name: "Quotes", Selector: "quotes p"})
	for _, q := range quotes {
		fmt.Printf("Attributes: %v, Value: %v\n", q.Attr, q.Text)
	}
}
Output:
Index ¶
- type HeadingLevel
- type Query
- type QueryResult
- type Viewport
- type Web
- func (w *Web) CSRFToken() string
- func (w *Web) Canonical() string
- func (w *Web) Charset() string
- func (w *Web) CleanParagraphs() []string
- func (w *Web) ContentType() string
- func (w *Web) Emails() ([]string, error)
- func (w *Web) ExternalLinks() []string
- func (w *Web) Fetch() error
- func (w *Web) Headers() map[string]string
- func (w *Web) Heading(opt ...HeadingLevel) [][]string
- func (w *Web) Headings() [][]string
- func (w *Web) Images() []string
- func (w *Web) ImagesWithDetails() []map[string]interface{}
- func (w *Web) InternalLinks() []string
- func (w *Web) Links() []string
- func (w *Web) LinksWithDetails() []map[string]interface{}
- func (w *Web) Paragraphs() []string
- func (w *Web) Query(query Query) []QueryResult
- func (w *Web) Title() string
- func (w *Web) Viewport() *Viewport
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type HeadingLevel ¶
type HeadingLevel string
const (
	H1 HeadingLevel = "h1"
	H2 HeadingLevel = "h2"
	H3 HeadingLevel = "h3"
	H4 HeadingLevel = "h4"
	H5 HeadingLevel = "h5"
	H6 HeadingLevel = "h6"
)
type QueryResult ¶
type Viewport ¶
func NewViewport ¶
type Web ¶
func NewContextScrapper ¶ added in v1.1.0
Initialize a new scrapper instance with a context.
func (*Web) CSRFToken ¶
Fetch meta info of csrf token from head
Example:
html: <meta name="csrf-token" content="token" />
Result: token
func (*Web) Canonical ¶
Fetch canonical meta url from head
Example:
html: <link rel="canonical" href="https://test-page.goscrapper.com/page.html" />
Result: https://test-page.goscrapper.com/page.html
func (*Web) Charset ¶
Fetch the charset meta info from head, if a tag wasn't found because it's missing in the source HTML, empty string will be returned.
Example:
html: <meta charset="utf-8" />
Result: utf-8
func (*Web) CleanParagraphs ¶
Empty p-tags would lead to empty strings in the returned array. To avoid this you can call w.CleanParagraphs() instead. This will filter empty paragraphs and only return those with content.
func (*Web) ContentType ¶
Fetch content type meta info from head
Example:
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
Result: [text/html, utf-8]
func (*Web) ExternalLinks ¶
get all external links on the page as absolute URLs
func (*Web) Heading ¶
func (w *Web) Heading(opt ...HeadingLevel) [][]string
Fetch a slice of heading text. The default level is h1; to use a different level, pass it explicitly -> w.Heading(H2, H3)
Example:
html: <h1>Heading 1</h1>
Result: Heading 1
func (*Web) Headings ¶
Fetch slice of all the heading tags text (h1, h2, h3, h4, h5, h6)
Example:
html: <h1>Heading 1</h1>
<h1>Heading 1</h1> <h2>Heading 2</h2> <h2>Heading 2</h2>
Result: [[Heading 1, Heading 1], [Heading 2, Heading 2]]
func (*Web) Images ¶
get slice of all images on the page with absolute URLs
Example:
html: <img src="https://test-pages.de/assets/cat.jpg" alt="absolute path">
Result: [https://test-pages.de/assets/cat.jpg]
func (*Web) ImagesWithDetails ¶
get all images on the page with commonly interesting details
Example:
html: <img src="https://test-pages.de/assets/cat.jpg" alt="absolute path">
Result: [
	url: "https://test-pages.de/assets/cat.jpg", alt: "absolute path", width: nil, height: nil,
]
func (*Web) InternalLinks ¶
get all internal links (same root or sub-domain) on the page as absolute URLs
func (*Web) LinksWithDetails ¶
get all links on the page with commonly interesting details
Example:
html: <a href="https://placekitten.com/432/287" rel="nofollow">external kitten</a>
Result: [
	url: "https://placekitten.com/432/287", text: "external kitten", title: nil, target: nil, rel: "nofollow", isNofollow: true, isUGC: false, isNoopener: false, isNoreferrer: false,
]
func (*Web) Paragraphs ¶
Fetch all the paragraphs (<p>) on a website
func (*Web) Query ¶
func (w *Web) Query(query Query) []QueryResult
Get the attributes and value of the given query selector. Returns a slice of results, one per match, in case the element exists multiple times.
Example ¶
package main

import (
	"fmt"

	"goscrapper"
)

func main() {
	web := goscrapper.NewScrapper("https://www.metalsucks.net/")
	metaResult := web.Query(goscrapper.Query{Name: "Meta Info", Selector: "meta[property='og:locale']"})
	fmt.Println(metaResult[0].Attr)
}
Output: map[content:en_US property:og:locale]
func (*Web) Title ¶
Fetch the title from head, if a tag wasn't found because it's missing in the source HTML, empty string will be returned.
Example:
html: <title>Lorem Ipsum</title>
Result: Lorem Ipsum
Example ¶
package main

import (
	"fmt"

	"goscrapper"
)

func main() {
	web := goscrapper.NewScrapper("https://www.metalsucks.net/")
	fmt.Println(web.Title())
}
Output: MetalSucks | Metal News, Tour Dates, Reviews and Videos
func (*Web) Viewport ¶
Fetch viewport meta info from head
Examples:
html: <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no" />
Results:
w.Viewport().Val -> [width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no]
w.Viewport().String() -> "width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no"