html

package
v2.4.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 3, 2019 License: MIT Imports: 4 Imported by: 13

README

HTML GoDoc

This package is an HTML5 lexer written in Go. It follows the specification at The HTML syntax. The lexer takes an io.Reader and converts it into tokens until the EOF.

Installation

Run the following command

go get -u github.com/tdewolff/parse/v2/html

or add the following import and run the project with go get

import "github.com/tdewolff/parse/v2/html"

Lexer

Usage

The following initializes a new Lexer with io.Reader r:

l := html.NewLexer(r)

To tokenize until EOF or an error, use:

// Pull tokens one at a time and stop when ErrorToken is returned.
for {
	tt, data := l.Next()
	switch tt {
	case html.ErrorToken:
		// error or EOF set in l.Err()
		return
	case html.StartTagToken:
		// ...
		// Keep reading tokens until the first one that is not an AttributeToken.
		for {
			ttAttr, dataAttr := l.Next()
			if ttAttr != html.AttributeToken {
				break
			}
			// ...
		}
	// ...
	}
}

All tokens:

ErrorToken TokenType = iota // extra token when errors occur
CommentToken
DoctypeToken
StartTagToken
StartTagCloseToken
StartTagVoidToken
EndTagToken
AttributeToken
TextToken
Examples
package main

import (
	"fmt"
	"io"
	"os"

	"github.com/tdewolff/parse/v2/html"
)

// main tokenizes HTML read from stdin and prints every start tag together
// with its attributes. Lexing stops at the first ErrorToken; if the
// underlying error is anything other than io.EOF it is reported along with
// the lexer's current offset in the input stream.
func main() {
	l := html.NewLexer(os.Stdin)
	for {
		tt, data := l.Next()
		switch tt {
		case html.ErrorToken:
			if l.Err() != io.EOF {
				// Lexer exposes Offset(), not Line(), so report the
				// position as an offset into the input.
				fmt.Println("Error at offset", l.Offset(), ":", l.Err())
			}
			return
		case html.StartTagToken:
			fmt.Println("Tag", string(data))
			// Attribute tokens directly follow the start tag; the first
			// token of any other type ends the attribute list.
			for {
				ttAttr, dataAttr := l.Next()
				if ttAttr != html.AttributeToken {
					break
				}

				key := dataAttr
				val := l.AttrVal()
				fmt.Println("Attribute", string(key), "=", string(val))
			}
		// ...
		}
	}
}

License

Released under the MIT license.

Documentation

Overview

Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.

Index

Examples

Constants

This section is empty.

Variables

View Source
var EntitiesMap = map[string][]byte{}/* 1092 elements not displayed */

Entities are all named character entities.

View Source
var TextRevEntitiesMap = map[byte][]byte{
	'<': []byte("&lt;"),
}

Functions

func EscapeAttrVal

func EscapeAttrVal(buf *[]byte, orig, b []byte, isXML bool) []byte

EscapeAttrVal returns the escaped attribute value bytes without quotes.

Types

type Hash

type Hash uint32

Hash defines perfect hashes for a predefined list of strings.

const (
	A               Hash = 0x1     // a
	Abbr            Hash = 0x36e04 // abbr
	About           Hash = 0x5     // about
	Accept          Hash = 0x1106  // accept
	Accept_Charset  Hash = 0x110e  // accept-charset
	Action          Hash = 0x25206 // action
	Address         Hash = 0x5a07  // address
	Align           Hash = 0x32d05 // align
	Alink           Hash = 0x8f05  // alink
	Allowfullscreen Hash = 0x2180f // allowfullscreen
	Area            Hash = 0x12c04 // area
	Article         Hash = 0x2707  // article
	Aside           Hash = 0x7205  // aside
	Async           Hash = 0xcb05  // async
	Audio           Hash = 0xdc05  // audio
	Autofocus       Hash = 0xfc09  // autofocus
	Autoplay        Hash = 0x11508 // autoplay
	Axis            Hash = 0x11d04 // axis
	B               Hash = 0x101   // b
	Background      Hash = 0x300a  // background
	Base            Hash = 0x17804 // base
	Bb              Hash = 0x36f02 // bb
	Bdi             Hash = 0x9403  // bdi
	Bdo             Hash = 0x32503 // bdo
	Bgcolor         Hash = 0x13507 // bgcolor
	Blockquote      Hash = 0x13f0a // blockquote
	Body            Hash = 0xd04   // body
	Br              Hash = 0x37002 // br
	Button          Hash = 0x14906 // button
	Canvas          Hash = 0x6e06  // canvas
	Caption         Hash = 0x23207 // caption
	Charset         Hash = 0x1807  // charset
	Checked         Hash = 0x1bd07 // checked
	Cite            Hash = 0xcf04  // cite
	Class           Hash = 0x15605 // class
	Classid         Hash = 0x15607 // classid
	Clear           Hash = 0x2b05  // clear
	Code            Hash = 0x17404 // code
	Codebase        Hash = 0x17408 // codebase
	Codetype        Hash = 0x1ae08 // codetype
	Col             Hash = 0x13703 // col
	Colgroup        Hash = 0x1c508 // colgroup
	Color           Hash = 0x13705 // color
	Cols            Hash = 0x1da04 // cols
	Colspan         Hash = 0x1da07 // colspan
	Compact         Hash = 0x20007 // compact
	Content         Hash = 0x29807 // content
	Controls        Hash = 0x20f08 // controls
	Data            Hash = 0x1f04  // data
	Datalist        Hash = 0x1f08  // datalist
	Datatype        Hash = 0x4d08  // datatype
	Dd              Hash = 0x5b02  // dd
	Declare         Hash = 0x7507  // declare
	Default         Hash = 0x9e07  // default
	DefaultChecked  Hash = 0x18f0e // defaultChecked
	DefaultMuted    Hash = 0x9e0c  // defaultMuted
	DefaultSelected Hash = 0xa90f  // defaultSelected
	Defer           Hash = 0xb705  // defer
	Del             Hash = 0xd903  // del
	Details         Hash = 0x15c07 // details
	Dfn             Hash = 0x18803 // dfn
	Dialog          Hash = 0xc506  // dialog
	Dir             Hash = 0x9503  // dir
	Disabled        Hash = 0x19c08 // disabled
	Div             Hash = 0x1a303 // div
	Dl              Hash = 0x1c302 // dl
	Dt              Hash = 0x24402 // dt
	Em              Hash = 0x4302  // em
	Embed           Hash = 0x4905  // embed
	Enabled         Hash = 0x28007 // enabled
	Enctype         Hash = 0x1f907 // enctype
	Face            Hash = 0x5604  // face
	Fieldset        Hash = 0x22708 // fieldset
	Figcaption      Hash = 0x22f0a // figcaption
	Figure          Hash = 0x23906 // figure
	Footer          Hash = 0xe606  // footer
	For             Hash = 0x24e03 // for
	Form            Hash = 0x24e04 // form
	Formaction      Hash = 0x24e0a // formaction
	Formnovalidate  Hash = 0x2580e // formnovalidate
	Frame           Hash = 0x2a005 // frame
	Frameborder     Hash = 0x2a00b // frameborder
	H1              Hash = 0x2eb02 // h1
	H2              Hash = 0x26602 // h2
	H3              Hash = 0x26802 // h3
	H4              Hash = 0x26a02 // h4
	H5              Hash = 0x26c02 // h5
	H6              Hash = 0x26e02 // h6
	Head            Hash = 0x2dd04 // head
	Header          Hash = 0x2dd06 // header
	Hgroup          Hash = 0x27006 // hgroup
	Hidden          Hash = 0x27c06 // hidden
	Hr              Hash = 0x33302 // hr
	Href            Hash = 0x33304 // href
	Hreflang        Hash = 0x33308 // hreflang
	Html            Hash = 0x28704 // html
	Http_Equiv      Hash = 0x28b0a // http-equiv
	I               Hash = 0x2401  // i
	Icon            Hash = 0x29704 // icon
	Id              Hash = 0x7402  // id
	Iframe          Hash = 0x29f06 // iframe
	Img             Hash = 0x2ab03 // img
	Inert           Hash = 0x6905  // inert
	Inlist          Hash = 0x2ae06 // inlist
	Input           Hash = 0x2b805 // input
	Ins             Hash = 0x2bd03 // ins
	Ismap           Hash = 0x11f05 // ismap
	Itemscope       Hash = 0xd009  // itemscope
	Kbd             Hash = 0x9303  // kbd
	Keygen          Hash = 0x1f506 // keygen
	Label           Hash = 0x7c05  // label
	Lang            Hash = 0x33704 // lang
	Language        Hash = 0x33708 // language
	Legend          Hash = 0x2d006 // legend
	Li              Hash = 0x2302  // li
	Link            Hash = 0x9004  // link
	Longdesc        Hash = 0x8008  // longdesc
	Main            Hash = 0x6704  // main
	Manifest        Hash = 0x2c708 // manifest
	Map             Hash = 0x10603 // map
	Mark            Hash = 0x2d604 // mark
	Math            Hash = 0x2da04 // math
	Max             Hash = 0x2e303 // max
	Maxlength       Hash = 0x2e309 // maxlength
	Media           Hash = 0xc305  // media
	Menu            Hash = 0xf804  // menu
	Meta            Hash = 0x2ed04 // meta
	Meter           Hash = 0x30505 // meter
	Method          Hash = 0x30a06 // method
	Multiple        Hash = 0x31008 // multiple
	Muted           Hash = 0x31805 // muted
	Name            Hash = 0xc104  // name
	Nav             Hash = 0x32a03 // nav
	Nohref          Hash = 0x33106 // nohref
	Noresize        Hash = 0x14e08 // noresize
	Noscript        Hash = 0x16808 // noscript
	Noshade         Hash = 0x18a07 // noshade
	Novalidate      Hash = 0x25c0a // novalidate
	Nowrap          Hash = 0x1e006 // nowrap
	Object          Hash = 0xe006  // object
	Ol              Hash = 0x8902  // ol
	Open            Hash = 0x32704 // open
	Optgroup        Hash = 0x35008 // optgroup
	Option          Hash = 0x31d06 // option
	Output          Hash = 0x206   // output
	P               Hash = 0x501   // p
	Param           Hash = 0x6305  // param
	Pauseonexit     Hash = 0xec0b  // pauseonexit
	Picture         Hash = 0x10807 // picture
	Plaintext       Hash = 0x12309 // plaintext
	Poster          Hash = 0x1cc06 // poster
	Pre             Hash = 0x1e503 // pre
	Prefix          Hash = 0x1e506 // prefix
	Profile         Hash = 0x27507 // profile
	Progress        Hash = 0x2f808 // progress
	Property        Hash = 0x35708 // property
	Q               Hash = 0x14401 // q
	Rb              Hash = 0x2f02  // rb
	Readonly        Hash = 0x12d08 // readonly
	Rel             Hash = 0x7a03  // rel
	Required        Hash = 0x23d08 // required
	Resource        Hash = 0x10d08 // resource
	Rev             Hash = 0x9703  // rev
	Reversed        Hash = 0x9708  // reversed
	Rows            Hash = 0xbb04  // rows
	Rowspan         Hash = 0xbb07  // rowspan
	Rp              Hash = 0xeb02  // rp
	Rt              Hash = 0x2802  // rt
	Rtc             Hash = 0x6c03  // rtc
	Ruby            Hash = 0x13b04 // ruby
	Rules           Hash = 0x1d105 // rules
	S               Hash = 0x1c01  // s
	Samp            Hash = 0x6004  // samp
	Scope           Hash = 0xd405  // scope
	Scoped          Hash = 0xd406  // scoped
	Script          Hash = 0x16a06 // script
	Scrolling       Hash = 0x8609  // scrolling
	Seamless        Hash = 0x17a08 // seamless
	Section         Hash = 0x16207 // section
	Select          Hash = 0x18106 // select
	Selected        Hash = 0x18108 // selected
	Shape           Hash = 0x1d505 // shape
	Size            Hash = 0x15204 // size
	Slot            Hash = 0x1ee04 // slot
	Small           Hash = 0x21605 // small
	Sortable        Hash = 0x2bf08 // sortable
	Source          Hash = 0x10f06 // source
	Span            Hash = 0xbe04  // span
	Src             Hash = 0x2ff03 // src
	Srcset          Hash = 0x2ff06 // srcset
	Start           Hash = 0x2505  // start
	Strong          Hash = 0x2b206 // strong
	Style           Hash = 0x2cd05 // style
	Sub             Hash = 0x32303 // sub
	Summary         Hash = 0x33f07 // summary
	Sup             Hash = 0x34603 // sup
	Svg             Hash = 0x34903 // svg
	Tabindex        Hash = 0x2ef08 // tabindex
	Table           Hash = 0x2c205 // table
	Target          Hash = 0x706   // target
	Tbody           Hash = 0xc05   // tbody
	Td              Hash = 0x1e02  // td
	Template        Hash = 0x4208  // template
	Text            Hash = 0x12804 // text
	Textarea        Hash = 0x12808 // textarea
	Tfoot           Hash = 0xe505  // tfoot
	Th              Hash = 0x2dc02 // th
	Thead           Hash = 0x2dc05 // thead
	Time            Hash = 0xf604  // time
	Title           Hash = 0x16f05 // title
	Tr              Hash = 0x1f102 // tr
	Track           Hash = 0x1f105 // track
	Translate       Hash = 0x20609 // translate
	Truespeed       Hash = 0x24509 // truespeed
	Type            Hash = 0x5104  // type
	Typemustmatch   Hash = 0x1b20d // typemustmatch
	Typeof          Hash = 0x5106  // typeof
	U               Hash = 0x301   // u
	Ul              Hash = 0xa202  // ul
	Undeterminate   Hash = 0x370d  // undeterminate
	Usemap          Hash = 0x10306 // usemap
	Valign          Hash = 0x32c06 // valign
	Value           Hash = 0x1a505 // value
	Valuetype       Hash = 0x1a509 // valuetype
	Var             Hash = 0x29403 // var
	Video           Hash = 0x34c05 // video
	Visible         Hash = 0x35f07 // visible
	Vlink           Hash = 0x36605 // vlink
	Vocab           Hash = 0x36b05 // vocab
	Wbr             Hash = 0x37203 // wbr
	Xmlns           Hash = 0x1ea05 // xmlns
	Xmp             Hash = 0x2f603 // xmp
)

Unique hash definitions to be used instead of strings

func ToHash

func ToHash(s []byte) Hash

ToHash returns the hash whose name is s. It returns zero if there is no such hash. It is case sensitive.

func (Hash) String

func (i Hash) String() string

String returns the hash's name.

type Lexer

type Lexer struct {
	// contains filtered or unexported fields
}

Lexer is the state for the lexer.

func NewLexer

func NewLexer(r io.Reader) *Lexer

NewLexer returns a new Lexer for a given io.Reader.

Example
// Lex a small document and concatenate the data of every token returned.
l := NewLexer(bytes.NewBufferString("<span class='user'>John Doe</span>"))
out := ""
for {
	tt, data := l.Next()
	if tt == ErrorToken {
		// Stop at the first ErrorToken.
		break
	}
	out += string(data)
}
fmt.Println(out)
Output:

<span class='user'>John Doe</span>

func (*Lexer) AttrVal

func (l *Lexer) AttrVal() []byte

AttrVal returns the attribute value when an AttributeToken was returned from Next.

func (*Lexer) Err

func (l *Lexer) Err() error

Err returns the error encountered during lexing; this is often io.EOF, but other errors can also be returned.

func (*Lexer) Next

func (l *Lexer) Next() (TokenType, []byte)

Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.

func (*Lexer) Offset added in v2.4.0

func (l *Lexer) Offset() int

Offset returns the current position in the input stream.

func (*Lexer) Restore

func (l *Lexer) Restore()

Restore restores the NULL byte at the end of the buffer.

func (*Lexer) Text

func (l *Lexer) Text() []byte

Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.

type TokenType

type TokenType uint32

TokenType determines the type of token, e.g. a start tag or a comment.

const (
	ErrorToken TokenType = iota // extra token when errors occur
	CommentToken
	DoctypeToken
	StartTagToken
	StartTagCloseToken
	StartTagVoidToken
	EndTagToken
	AttributeToken
	TextToken
	SvgToken
	MathToken
)

TokenType values.

func (TokenType) String

func (tt TokenType) String() string

String returns the string representation of a TokenType.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL