Documentation ¶
Index ¶
- Constants
- Variables
- func Abs(v int) int
- func AsciiLower(s string) string
- func ElementHasLinkType(element *HTMLNode, linkType string) bool
- func FindBaseUrl(htmlDocument *html.Node, fallbackBaseUrl string) string
- func GetLinkAttribute(element *HTMLNode, attrName string, baseUrl string) ([2]string, bool)
- func Hash(s string) int
- func IsIn(l []string, s string) bool
- func MaxInt(x, y int) int
- func MinInt(x, y int) int
- func PathToURL(file string) (out string, err error)
- func SafeUrljoin(baseUrl, urls string, allowRelative bool) (string, error)
- func Unquote(s string) string
- func UrlJoin(baseUrl, urlS string, allowRelative bool, context string) string
- type Attachment
- type ContentInput
- type DocumentMetadata
- type ElementKey
- type Fl
- type HTMLIterator
- type HTMLNode
- func (h *HTMLNode) AsHtmlNode() *html.Node
- func (h HTMLNode) Get(name string) string
- func (element HTMLNode) GetChildrenText() (content []byte)
- func (element HTMLNode) GetText() string
- func (element HTMLNode) GetUrlAttribute(attrName, baseUrl string, allowRelative bool) string
- func (h HTMLNode) HasAttr(name string) bool
- func (element HTMLNode) HasLinkType(linkType string) bool
- func (element HTMLNode) IsText() (bool, string)
- func (h *HTMLNode) Iter(tags ...atom.Atom) HTMLIterator
- func (element HTMLNode) NodeChildren(skipBlank bool) (children []*HTMLNode)
- func (h *HTMLNode) ToKey(pseudoType string) ElementKey
- type InputFilename
- type InputReader
- type InputString
- type InputUrl
- type PageElement
- type RemoteRessource
- type Set
- type Source
- type Url
- type UrlFetcher
Constants ¶
const (
Version = "0.62"
)
Variables ¶
var Has = struct{}{}
var VersionString = fmt.Sprintf("Go-WebRender %s", Version)
Used for "User-Agent" in HTTP
var (
W3CDateReGroupsIndexes = map[string]int{}
)
- YYYY (e.g. 1997)
- YYYY-MM (e.g. 1997-07)
- YYYY-MM-DD (e.g. 1997-07-16)
- YYYY-MM-DDThh:mmTZD (e.g. 1997-07-16T19:20+01:00)
- YYYY-MM-DDThh:mm:ssTZD (e.g. 1997-07-16T19:20:30+01:00)
- YYYY-MM-DDThh:mm:ss.sTZD (e.g. 1997-07-16T19:20:30.45+01:00)
Functions ¶
func AsciiLower ¶
Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
This is used for ASCII case-insensitive matching (see http://whatwg.org/C#ascii-case-insensitive). This is different from the strings.ToLower function, which also affects non-ASCII characters, sometimes mapping them into the ASCII range:
keyword = u"Bac\u212Aground"
assert strings.ToLower(keyword) == u"background"
assert asciiLower(keyword) != strings.ToLower(keyword)
assert asciiLower(keyword) == u"bac\u212Aground"
func ElementHasLinkType ¶
Return whether the given element has a `rel` attribute with the given link type (must be a lower-case string).
func FindBaseUrl ¶
Return the base URL for the document. See http://www.w3.org/TR/html5/urls.html#document-base-url
func GetLinkAttribute ¶
Return ('external', absolute_uri) or ('internal', unquoted_fragment_id) or false
func SafeUrljoin ¶
default: allowRelative = false
Types ¶
type Attachment ¶
type Attachment struct {
URL, Title string
}
type ContentInput ¶
type ContentInput interface { String() string // contains filtered or unexported methods }
type DocumentMetadata ¶
type DocumentMetadata struct { // The title of the document, as a string. // Extracted from the `<title>` element in HTML // and written to the `/Title` info field in PDF. Title string // The description of the document, as a string. // Extracted from the `<meta name=description>` element in HTML // and written to the `/Subject` info field in PDF. Description string // The name of one of the software packages // used to generate the document, as a string. // Extracted from the `<meta name=generator>` element in HTML // and written to the `/Creator` info field in PDF. Generator string // Keywords associated with the document, as a list of strings. // (Defaults to the empty list.) // Extracted from `<meta name=keywords>` elements in HTML // and written to the `/Keywords` info field in PDF. Keywords []string // The authors of the document, as a list of strings. // (Defaults to the empty list.) // Extracted from the `<meta name=author>` elements in HTML // and written to the `/Author` info field in PDF. Authors []string // The creation date of the document, as a string. // Dates are in one of the six formats specified in // `W3C’s profile of ISO 8601 <http://www.w3.org/TR/NOTE-datetime>`. // Extracted from the `<meta name=dcterms.created>` element in HTML // and written to the `/CreationDate` info field in PDF. Created time.Time // The modification date of the document, as a string. // Dates are in one of the six formats specified in // `W3C’s profile of ISO 8601 <http://www.w3.org/TR/NOTE-datetime>`. // Extracted from the `<meta name=dcterms.modified>` element in HTML // and written to the `/ModDate` info field in PDF. Modified time.Time // File attachments, as a list of tuples of URL and a description. // (Defaults to the empty list.) // Extracted from the `<link rel=attachment>` elements in HTML // and written to the `/EmbeddedFiles` dictionary in PDF. Attachments []Attachment }
Meta-information belonging to a whole `Document`.
func GetHtmlMetadata ¶
func GetHtmlMetadata(wrapperElement *HTMLNode, baseUrl string) DocumentMetadata
Relevant specs:
http://www.whatwg.org/html#the-title-element http://www.whatwg.org/html#standard-metadata-names http://wiki.whatwg.org/wiki/MetaExtensions http://microformats.org/wiki/existing-rel-values#HTML5LinkExtensionsT
type ElementKey ¶
type ElementKey struct { Element *HTMLNode PseudoType string PageType PageElement }
func (ElementKey) IsPageType ¶
func (e ElementKey) IsPageType() bool
type Fl ¶
type Fl = float32
type HTMLIterator ¶
type HTMLIterator struct {
// contains filtered or unexported fields
}
HTMLIterator simplifies the (depth-first) walk on an HTML tree.
func NewHtmlIterator ¶
func NewHtmlIterator(root *html.Node, tags ...atom.Atom) HTMLIterator
NewHtmlIterator uses `root` as the starting point. If `tags` are given, only nodes matching one of them are returned.
func (*HTMLIterator) HasNext ¶
func (h *HTMLIterator) HasNext() bool
HasNext returns true if a node still has to be visited.
func (*HTMLIterator) Next ¶
func (h *HTMLIterator) Next() *HTMLNode
type HTMLNode ¶
func (*HTMLNode) AsHtmlNode ¶
func (HTMLNode) Get ¶
Get returns the attribute `name`, or "" if it is not set. See HasAttr if you need to distinguish between no attribute and an attribute with an empty string value.
func (HTMLNode) GetChildrenText ¶
GetChildrenText returns the text directly in the element, but not descendants. It's the concatenation of all children's TextNodes.
func (HTMLNode) GetText ¶
GetText returns the content of the first text node child. Due to the behavior of Go's html.Parse(), this method mimics the `text` attribute of Python's xml.etree elements.
func (HTMLNode) GetUrlAttribute ¶
Get the URI corresponding to the `attrName` attribute. Return "" if:
- the attribute is empty or missing or,
- the value is a relative URI but the document has no base URI and `allowRelative` is `false`.
Otherwise return a URI, absolute if possible.
func (HTMLNode) HasLinkType ¶
Return whether the given element has a `rel` attribute with the given link type. `linkType` must be a lower-case string.
func (*HTMLNode) Iter ¶
func (h *HTMLNode) Iter(tags ...atom.Atom) HTMLIterator
Iter returns an iterator over the HTML tree. If tags are given, only the nodes matching them will be returned by the iterator.
func (HTMLNode) NodeChildren ¶
NodeChildren returns the direct children of `element`. Empty text nodes are skipped.
func (*HTMLNode) ToKey ¶
func (h *HTMLNode) ToKey(pseudoType string) ElementKey
type InputFilename ¶
type InputFilename string
func (InputFilename) String ¶
func (c InputFilename) String() string
type InputReader ¶
type InputReader struct {
io.ReadCloser
}
func (InputReader) String ¶
func (c InputReader) String() string
type InputString ¶
type InputString string
func (InputString) String ¶
func (c InputString) String() string
type PageElement ¶
func (PageElement) ToKey ¶
func (p PageElement) ToKey(pseudoType string) ElementKey
type RemoteRessource ¶
type RemoteRessource struct { Content *bytes.Reader // MIME type extracted e.g. from a *Content-Type* header. If not provided, the type is guessed from the // file extension in the URL. MimeType string // actual URL of the resource // if there were e.g. HTTP redirects. RedirectedUrl string // filename of the resource. Usually // derived from the *filename* parameter in a *Content-Disposition* // header Filename string ProtocolEncoding string }
func DefaultUrlFetcher ¶
func DefaultUrlFetcher(urlTarget string) (RemoteRessource, error)
Fetch an external resource such as an image or stylesheet.
type Source ¶
func FetchSource ¶
func FetchSource(input ContentInput, baseUrl string, urlFetcher UrlFetcher, checkCssMimeType bool, ) (out Source, err error)
FetchSource fetches the HTML input and returns it with the normalized `BaseUrl` (checkCssMimeType=false).
type UrlFetcher ¶
type UrlFetcher = func(url string) (RemoteRessource, error)