html

package
v1.1.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 2, 2015 License: MIT Imports: 4 Imported by: 0

README

HTML GoDoc GoCover

This package is an HTML5 lexer written in Go. It follows the specification at The HTML syntax. The lexer takes an io.Reader and converts it into tokens until the EOF.

Installation

Run the following command

go get github.com/tdewolff/parse/html

or add the following import and run the project with go get

import "github.com/tdewolff/parse/html"

Lexer

Usage

The following initializes a new Lexer with io.Reader r:

l := html.NewLexer(r)

To tokenize until EOF or an error, use:

for {
	tt, data := l.Next()
	switch tt {
	case html.ErrorToken:
		// error or EOF set in l.Err()
		return
	case html.StartTagToken:
		// ...
		for {
			ttAttr, dataAttr := l.Next()
			if ttAttr != html.AttributeToken {
				break
			}
			// ...
		}
	// ...
	}
}

All tokens:

ErrorToken TokenType = iota // extra token when errors occur
CommentToken
DoctypeToken
StartTagToken
StartTagCloseToken
StartTagVoidToken
EndTagToken
AttributeToken
TextToken
Examples
package main

import (
	"os"

	"github.com/tdewolff/parse/html"
)

// Tokenize HTML from stdin.
func main() {
	l := html.NewLexer(os.Stdin)
	for {
		tt, data := l.Next()
		switch tt {
		case html.ErrorToken:
			if l.Err() != io.EOF {
				fmt.Println("Error on line", l.Line(), ":", l.Err())
			}
			return
		case html.StartTagToken:
			fmt.Println("Tag", string(data))
			for {
				ttAttr, dataAttr := l.Next()
				if ttAttr != html.AttributeToken {
					break
				}

				key := dataAttr
				val := l.AttrVal()
				fmt.Println("Attribute", string(key), "=", string(val))
			}
		// ...
		}
	}
}

License

Released under the MIT license.

Documentation

Overview

Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func EscapeAttrVal

func EscapeAttrVal(buf *[]byte, orig, b []byte) []byte

EscapeAttrVal returns the escaped attribute value bytes without quotes.

Types

type Hash

type Hash uint32

uses github.com/tdewolff/hasher

const (
	A                Hash = 0x1
	Abbr             Hash = 0x4
	Accept           Hash = 0x3206
	Accept_Charset   Hash = 0x320e
	Accesskey        Hash = 0x4409
	Acronym          Hash = 0xbb07
	Action           Hash = 0x2b906
	Address          Hash = 0x67607
	Align            Hash = 0x1605
	Alink            Hash = 0xd205
	Allowfullscreen  Hash = 0x23c0f
	Alt              Hash = 0xeb03
	Annotation       Hash = 0x2060a
	AnnotationXml    Hash = 0x2060d
	Applet           Hash = 0x16106
	Area             Hash = 0x38604
	Article          Hash = 0x40707
	Aside            Hash = 0x8305
	Async            Hash = 0xf705
	Audio            Hash = 0x11305
	Autocomplete     Hash = 0x14a0c
	Autofocus        Hash = 0x15609
	Autoplay         Hash = 0x16b08
	Axis             Hash = 0x17304
	B                Hash = 0x101
	Background       Hash = 0x1e0a
	Base             Hash = 0x44d04
	Basefont         Hash = 0x44d08
	Bdi              Hash = 0xcb03
	Bdo              Hash = 0x18a03
	Bgcolor          Hash = 0x19d07
	Bgsound          Hash = 0x1a407
	Big              Hash = 0x1ac03
	Blink            Hash = 0x1af05
	Blockquote       Hash = 0x1b40a
	Body             Hash = 0x4004
	Border           Hash = 0x33806
	Br               Hash = 0x202
	Button           Hash = 0x1be06
	Canvas           Hash = 0x7f06
	Caption          Hash = 0x27e07
	Center           Hash = 0x62306
	Challenge        Hash = 0x1eb09
	Charset          Hash = 0x3907
	Checked          Hash = 0x3ad07
	Cite             Hash = 0xfb04
	Class            Hash = 0x1c905
	Classid          Hash = 0x1c907
	Clear            Hash = 0x40b05
	Code             Hash = 0x1dc04
	Codebase         Hash = 0x44908
	Codetype         Hash = 0x1dc08
	Col              Hash = 0x19f03
	Colgroup         Hash = 0x1f408
	Color            Hash = 0x19f05
	Cols             Hash = 0x20104
	Colspan          Hash = 0x20107
	Command          Hash = 0x21307
	Compact          Hash = 0x21a07
	Content          Hash = 0x58107
	Contenteditable  Hash = 0x5810f
	Contextmenu      Hash = 0x3b60b
	Controls         Hash = 0x22908
	Coords           Hash = 0x23506
	Crossorigin      Hash = 0x25a0b
	Data             Hash = 0x4a604
	Datalist         Hash = 0x4a608
	Datetime         Hash = 0x2e908
	Dd               Hash = 0x31602
	Declare          Hash = 0x8607
	Default          Hash = 0x5407
	DefaultChecked   Hash = 0x4ea0e
	DefaultMuted     Hash = 0x54b0c
	DefaultSelected  Hash = 0x540f
	Defer            Hash = 0x6205
	Del              Hash = 0x7203
	Desc             Hash = 0x7c04
	Details          Hash = 0x9207
	Dfn              Hash = 0xab03
	Dialog           Hash = 0xcc06
	Dir              Hash = 0xd903
	Dirname          Hash = 0xd907
	Disabled         Hash = 0x10108
	Div              Hash = 0x10803
	Dl               Hash = 0x1aa02
	Download         Hash = 0x47f08
	Draggable        Hash = 0x1cf09
	Dropzone         Hash = 0x41208
	Dt               Hash = 0x5ff02
	Em               Hash = 0x6e02
	Embed            Hash = 0x6e05
	Enabled          Hash = 0x4e07
	Enctype          Hash = 0x2ce07
	Face             Hash = 0x62104
	Fieldset         Hash = 0x26b08
	Figcaption       Hash = 0x27b0a
	Figure           Hash = 0x28f06
	Font             Hash = 0x45104
	Footer           Hash = 0xee06
	For              Hash = 0x29b03
	ForeignObject    Hash = 0x29b0d
	Foreignobject    Hash = 0x2a80d
	Form             Hash = 0x2b504
	Formaction       Hash = 0x2b50a
	Formenctype      Hash = 0x2ca0b
	Formmethod       Hash = 0x2d50a
	Formnovalidate   Hash = 0x2df0e
	Formtarget       Hash = 0x2f40a
	Frame            Hash = 0xa305
	Frameborder      Hash = 0x3330b
	Frameset         Hash = 0xa308
	H1               Hash = 0x19b02
	H2               Hash = 0x32402
	H3               Hash = 0x34902
	H4               Hash = 0x37f02
	H5               Hash = 0x60102
	H6               Hash = 0x2fe02
	Head             Hash = 0x36b04
	Header           Hash = 0x36b06
	Headers          Hash = 0x36b07
	Height           Hash = 0x30006
	Hgroup           Hash = 0x30806
	Hidden           Hash = 0x31406
	High             Hash = 0x32104
	Hr               Hash = 0xaf02
	Href             Hash = 0xaf04
	Hreflang         Hash = 0xaf08
	Html             Hash = 0x30404
	Http_Equiv       Hash = 0x3260a
	I                Hash = 0x601
	Icon             Hash = 0x58004
	Id               Hash = 0x8502
	Iframe           Hash = 0x33206
	Image            Hash = 0x33e05
	Img              Hash = 0x34303
	Inert            Hash = 0x53605
	Input            Hash = 0x46c05
	Ins              Hash = 0x26303
	Isindex          Hash = 0x17507
	Ismap            Hash = 0x34b05
	Itemid           Hash = 0xfc06
	Itemprop         Hash = 0x56e08
	Itemref          Hash = 0x61b07
	Itemscope        Hash = 0x35609
	Itemtype         Hash = 0x36008
	Kbd              Hash = 0xca03
	Keygen           Hash = 0x4a06
	Keytype          Hash = 0x5b007
	Kind             Hash = 0xd604
	Label            Hash = 0x7405
	Lang             Hash = 0xb304
	Language         Hash = 0xb308
	Legend           Hash = 0x1d606
	Li               Hash = 0x1702
	Link             Hash = 0xd304
	List             Hash = 0x4aa04
	Listing          Hash = 0x4aa07
	Longdesc         Hash = 0x7808
	Loop             Hash = 0x11e04
	Low              Hash = 0x23e03
	Main             Hash = 0x1004
	Malignmark       Hash = 0xc10a
	Manifest         Hash = 0x65708
	Map              Hash = 0x16003
	Mark             Hash = 0xc704
	Marquee          Hash = 0x69907
	Math             Hash = 0x36804
	Max              Hash = 0x37703
	Maxlength        Hash = 0x37709
	Media            Hash = 0xde05
	Mediagroup       Hash = 0xde0a
	Menu             Hash = 0x3bd04
	Meta             Hash = 0x4b904
	Meter            Hash = 0x2ef05
	Method           Hash = 0x2d906
	Mglyph           Hash = 0x34406
	Mi               Hash = 0x2c02
	Min              Hash = 0x2c03
	Mn               Hash = 0x2e202
	Mo               Hash = 0x4dd02
	Ms               Hash = 0x35902
	Mtext            Hash = 0x38105
	Multiple         Hash = 0x38f08
	Muted            Hash = 0x39705
	Name             Hash = 0xdc04
	Nav              Hash = 0x1303
	Nobr             Hash = 0x1a04
	Noembed          Hash = 0x6c07
	Noframes         Hash = 0xa108
	Nohref           Hash = 0xad06
	Noresize         Hash = 0x24a08
	Noscript         Hash = 0x31908
	Noshade          Hash = 0x4e507
	Novalidate       Hash = 0x2e30a
	Nowrap           Hash = 0x57706
	Object           Hash = 0x2af06
	Ol               Hash = 0x12d02
	Onabort          Hash = 0x1c207
	Onafterprint     Hash = 0x2830c
	Onbeforeprint    Hash = 0x2bd0d
	Onbeforeunload   Hash = 0x66a0e
	Onblur           Hash = 0x14406
	Oncancel         Hash = 0x11708
	Oncanplay        Hash = 0x18c09
	Oncanplaythrough Hash = 0x18c10
	Onchange         Hash = 0x42808
	Onclick          Hash = 0x6a407
	Onclose          Hash = 0x39c07
	Oncontextmenu    Hash = 0x3b40d
	Oncuechange      Hash = 0x3c10b
	Ondblclick       Hash = 0x3cc0a
	Ondrag           Hash = 0x3d606
	Ondragend        Hash = 0x3d609
	Ondragenter      Hash = 0x3df0b
	Ondragleave      Hash = 0x3ea0b
	Ondragover       Hash = 0x3f50a
	Ondragstart      Hash = 0x3ff0b
	Ondrop           Hash = 0x41006
	Ondurationchange Hash = 0x42010
	Onemptied        Hash = 0x41709
	Onended          Hash = 0x43007
	Onerror          Hash = 0x43707
	Onfocus          Hash = 0x43e07
	Onhashchange     Hash = 0x45e0c
	Oninput          Hash = 0x46a07
	Oninvalid        Hash = 0x47109
	Onkeydown        Hash = 0x47a09
	Onkeypress       Hash = 0x4870a
	Onkeyup          Hash = 0x49707
	Onload           Hash = 0x49e06
	Onloadeddata     Hash = 0x49e0c
	Onloadedmetadata Hash = 0x4b110
	Onloadstart      Hash = 0x4c70b
	Onmessage        Hash = 0x4d209
	Onmousedown      Hash = 0x4db0b
	Onmousemove      Hash = 0x4f80b
	Onmouseout       Hash = 0x5030a
	Onmouseover      Hash = 0x5100b
	Onmouseup        Hash = 0x51b09
	Onmousewheel     Hash = 0x5240c
	Onoffline        Hash = 0x53009
	Ononline         Hash = 0x53b08
	Onpagehide       Hash = 0x5430a
	Onpageshow       Hash = 0x5570a
	Onpause          Hash = 0x56307
	Onplay           Hash = 0x59006
	Onplaying        Hash = 0x59009
	Onpopstate       Hash = 0x5990a
	Onprogress       Hash = 0x5a30a
	Onratechange     Hash = 0x5b70c
	Onreset          Hash = 0x5c307
	Onresize         Hash = 0x5ca08
	Onscroll         Hash = 0x5d208
	Onseeked         Hash = 0x5dd08
	Onseeking        Hash = 0x5e509
	Onselect         Hash = 0x5ee08
	Onshow           Hash = 0x5f806
	Onstalled        Hash = 0x60309
	Onstorage        Hash = 0x60c09
	Onsubmit         Hash = 0x61508
	Onsuspend        Hash = 0x62909
	Ontimeupdate     Hash = 0x4520c
	Onunload         Hash = 0x63208
	Onvolumechange   Hash = 0x63a0e
	Onwaiting        Hash = 0x64809
	Open             Hash = 0x57404
	Optgroup         Hash = 0x12008
	Optimum          Hash = 0x65107
	Option           Hash = 0x66606
	Output           Hash = 0x50a06
	P                Hash = 0xc01
	Param            Hash = 0xc05
	Pattern          Hash = 0x9b07
	Pauseonexit      Hash = 0x5650b
	Ping             Hash = 0xe704
	Placeholder      Hash = 0x1270b
	Plaintext        Hash = 0x17d09
	Poster           Hash = 0x1fb06
	Pre              Hash = 0x30d03
	Preload          Hash = 0x30d07
	Profile          Hash = 0x34f07
	Progress         Hash = 0x5a508
	Prompt           Hash = 0x66006
	Public           Hash = 0x57c06
	Q                Hash = 0x8d01
	Radiogroup       Hash = 0x30a
	Rb               Hash = 0x1d02
	Readonly         Hash = 0x38708
	Rel              Hash = 0x30e03
	Required         Hash = 0x8b08
	Rev              Hash = 0x29303
	Reversed         Hash = 0x29308
	Rows             Hash = 0x6604
	Rowspan          Hash = 0x6607
	Rp               Hash = 0x28902
	Rt               Hash = 0x1c702
	Rtc              Hash = 0x1c703
	Ruby             Hash = 0xf304
	Rules            Hash = 0x13105
	S                Hash = 0x3d01
	Samp             Hash = 0x9804
	Sandbox          Hash = 0x13507
	Scope            Hash = 0x35a05
	Scoped           Hash = 0x35a06
	Script           Hash = 0x31b06
	Scrolling        Hash = 0x5d409
	Seamless         Hash = 0x3a108
	Section          Hash = 0x13f07
	Select           Hash = 0x5f006
	Selected         Hash = 0x5f008
	Shape            Hash = 0x23005
	Size             Hash = 0x24e04
	Sizes            Hash = 0x24e05
	Small            Hash = 0x23a05
	Sortable         Hash = 0x25208
	Source           Hash = 0x26506
	Spacer           Hash = 0x37106
	Span             Hash = 0x6904
	Spellcheck       Hash = 0x3a80a
	Src              Hash = 0x44403
	Srcdoc           Hash = 0x44406
	Srclang          Hash = 0x49007
	Start            Hash = 0x40505
	Step             Hash = 0x65d04
	Strike           Hash = 0x5ac06
	Strong           Hash = 0x67c06
	Style            Hash = 0x68205
	Sub              Hash = 0x61703
	Summary          Hash = 0x68707
	Sup              Hash = 0x68e03
	Svg              Hash = 0x69103
	System           Hash = 0x69406
	Tabindex         Hash = 0x4bf08
	Table            Hash = 0x25505
	Target           Hash = 0x2f806
	Tbody            Hash = 0x3f05
	Td               Hash = 0xaa02
	Text             Hash = 0x18204
	Textarea         Hash = 0x38208
	Tfoot            Hash = 0xed05
	Th               Hash = 0x19502
	Thead            Hash = 0x36a05
	Time             Hash = 0x2ed04
	Title            Hash = 0x16605
	Tr               Hash = 0x18502
	Track            Hash = 0x18505
	Translate        Hash = 0x22009
	Truespeed        Hash = 0x27209
	Tt               Hash = 0x9d02
	Type             Hash = 0x10f04
	Typemustmatch    Hash = 0x1e00d
	U                Hash = 0xb01
	Ul               Hash = 0x5802
	Undeterminate    Hash = 0x250d
	Usemap           Hash = 0x15d06
	Valign           Hash = 0x1506
	Value            Hash = 0x10a05
	Valuetype        Hash = 0x10a09
	Var              Hash = 0x32f03
	Video            Hash = 0x6a005
	Visible          Hash = 0x6ab07
	Vlink            Hash = 0x6b205
	Wbr              Hash = 0x56003
	Width            Hash = 0x5fd05
	Wrap             Hash = 0x57904
	Xmlns            Hash = 0x13b05
	Xmp              Hash = 0x17b03
)

func ToHash

func ToHash(s []byte) Hash

ToHash returns the hash whose name is s. It returns zero if there is no such hash. It is case sensitive.

func (Hash) String

func (i Hash) String() string

String returns the hash's name.

type Lexer

type Lexer struct {
	// contains filtered or unexported fields
}

Lexer is the state for the lexer.

func NewLexer

func NewLexer(r io.Reader) *Lexer

NewLexer returns a new Lexer for a given io.Reader.

Example
l := NewLexer(bytes.NewBufferString("<span class='user'>John Doe</span>"))
out := ""
for {
	tt, data, n := l.Next()
	if tt == ErrorToken {
		break
	}
	if tt == StartTagToken {
		out += "<"
	} else if tt == EndTagToken {
		out += "</"
	}
	out += string(data)
	if tt == StartTagToken {
		out += " "
	} else if tt == EndTagToken {
		out += ">"
	} else if tt == AttributeToken {
		out += "=" + string(l.AttrVal())
	}
	l.Free(n)
}
fmt.Println(out)
Output:

<span class='user'>John Doe</span>

func (*Lexer) AttrVal

func (l *Lexer) AttrVal() []byte

AttrVal returns the attribute value when an AttributeToken was returned from Next.

func (*Lexer) Err

func (l *Lexer) Err() error

Err returns the error encountered during lexing. This is often io.EOF, but other errors can be returned as well.

func (*Lexer) Free added in v1.1.0

func (l *Lexer) Free(n int)

Free frees up bytes of length n from previously shifted tokens.

func (*Lexer) Next

func (l *Lexer) Next() (TokenType, []byte, int)

Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.

type TokenType

type TokenType uint32

TokenType determines the type of token, e.g. a start tag or a comment.

const (
	ErrorToken TokenType = iota // extra token when errors occur
	CommentToken
	DoctypeToken
	StartTagToken
	StartTagCloseToken
	StartTagVoidToken
	EndTagToken
	AttributeToken
	TextToken
)

TokenType values.

func (TokenType) String

func (tt TokenType) String() string

String returns the string representation of a TokenType.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL