syntax

package
v0.1.8 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 5, 2022 License: MIT Imports: 11 Imported by: 0

Documentation

Index

Constants

View Source
// Lowercase* are small integer operation codes; the comment on each constant
// states the operation it selects.
// NOTE(review): presumably consumed by a case-mapping table — the consumer is
// not visible in this listing, confirm before relying on it.
const (
	LowercaseSet = 0 // Set to arg.
	LowercaseAdd = 1 // Add arg.
	LowercaseBor = 2 // Bitwise or with 1.
	LowercaseBad = 3 // Bitwise and with 1 and add original.
)
View Source
// Instruction opcodes and modifier bits.
//
// Values 0-42 are the opcodes themselves. Rtl, Back, Back2 and Ci are single
// bits above that range, and Mask (63, i.e. the low six bits) strips them off
// again to recover the plain opcode.
//
// Each opcode comment appears to have three columns: operand kinds, operands,
// and an example pattern or short description.
// NOTE(review): "lef" and "back" in the first column are not explained in this
// listing — presumably "leaf" and "backtracking"; confirm.
// NOTE(review): End was listed with the same example (\Z) as EndZ; it is shown
// here as \z, following the usual end-of-string anchor notation — confirm.
// NOTE(review): only Onerep is declared with the InstOp type; every other
// constant has its own initializer and no type, so it is an untyped constant.
const (
	Onerep    InstOp = 0 // lef,back char,min,max    a {n}
	Notonerep        = 1 // lef,back char,min,max    .{n}
	Setrep           = 2 // lef,back set,min,max     [\d]{n}

	Oneloop    = 3 // lef,back char,min,max    a {,n}
	Notoneloop = 4 // lef,back char,min,max    .{,n}
	Setloop    = 5 // lef,back set,min,max     [\d]{,n}

	Onelazy    = 6 // lef,back char,min,max    a {,n}?
	Notonelazy = 7 // lef,back char,min,max    .{,n}?
	Setlazy    = 8 // lef,back set,min,max     [\d]{,n}?

	One    = 9  // lef      char            a
	Notone = 10 // lef      char            [^a]
	Set    = 11 // lef      set             [a-z\s]  \w \s \d

	Multi = 12 // lef      string          abcd
	Ref   = 13 // lef      group           \#

	Bol         = 14 //                          ^
	Eol         = 15 //                          $
	Boundary    = 16 //                          \b
	Nonboundary = 17 //                          \B
	Beginning   = 18 //                          \A
	Start       = 19 //                          \G
	EndZ        = 20 //                          \Z
	End         = 21 //                          \z

	Nothing = 22 //                          Reject!

	Lazybranch      = 23 // back     jump            straight first
	Branchmark      = 24 // back     jump            branch first for loop
	Lazybranchmark  = 25 // back     jump            straight first for loop
	Nullcount       = 26 // back     val             set counter, null mark
	Setcount        = 27 // back     val             set counter, make mark
	Branchcount     = 28 // back     jump,limit      branch++ if zero<=c<limit
	Lazybranchcount = 29 // back     jump,limit      same, but straight first
	Nullmark        = 30 // back                     save position
	Setmark         = 31 // back                     save position
	Capturemark     = 32 // back     group           define group
	Getmark         = 33 // back                     recall position
	Setjump         = 34 // back                     save backtrack state
	Backjump        = 35 //                          zap back to saved state
	Forejump        = 36 //                          zap backtracking state
	Testref         = 37 //                          backtrack if ref undefined
	Goto            = 38 //          jump            just go

	Prune = 39 //                          prune it baby
	Stop  = 40 //                          done!

	ECMABoundary    = 41 //                          \b
	NonECMABoundary = 42 //                          \B

	Mask  = 63  // Mask to get unmodified ordinary operator
	Rtl   = 64  // bit to indicate that we're reverse scanning.
	Back  = 128 // bit to indicate that we're backtracking.
	Back2 = 256 // bit to indicate that we're backtracking on a second branch.
	Ci    = 512 // bit to indicate that we're case-insensitive.
)
View Source
// Regex option flags. Each value is a distinct single bit, so several options
// can be combined with bitwise OR into one value.
// NOTE(review): the quoted letter in each comment is presumably the option's
// short code — confirm where that letter is accepted.
// NOTE(review): only IgnoreCase is declared with the RegexOptions type; the
// remaining constants have no type and are untyped constants.
const (
	IgnoreCase              RegexOptions = 0x0001 // "i"
	Multiline                            = 0x0002 // "m"
	ExplicitCapture                      = 0x0004 // "n"
	Compiled                             = 0x0008 // "c"
	Singleline                           = 0x0010 // "s"
	IgnorePatternWhitespace              = 0x0020 // "x"
	RightToLeft                          = 0x0040 // "r"
	Debug                                = 0x0080 // "d"
	ECMAScript                           = 0x0100 // "e"
	RE2                                  = 0x0200 // RE2 compat mode
)
View Source
// Error codes (message texts) describing failures to parse a regular
// expression. Messages that contain %v are format strings.
// NOTE(review): presumably the %v verbs are filled from Error.Args — confirm in
// the Error.Error implementation, which is not visible here.
// NOTE(review): only ErrInternalError is declared with the ErrorCode type; the
// remaining constants have no type and are untyped string constants.
const (
	// internal issue
	ErrInternalError ErrorCode = "regexp/syntax: internal error"
	// Parser errors
	ErrUnterminatedComment        = "unterminated comment"
	ErrInvalidCharRange           = "invalid character class range"
	ErrInvalidRepeatSize          = "invalid repeat count"
	ErrInvalidUTF8                = "invalid UTF-8"
	ErrCaptureGroupOutOfRange     = "capture group number out of range"
	ErrUnexpectedParen            = "unexpected )"
	ErrMissingParen               = "missing closing )"
	ErrMissingBrace               = "missing closing }"
	ErrInvalidRepeatOp            = "invalid nested repetition operator"
	ErrMissingRepeatArgument      = "missing argument to repetition operator"
	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
	ErrTooManyAlternates          = "too many | in (?()|)"
	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
	ErrCapNumNotZero              = "capture number cannot be zero"
	ErrUndefinedBackRef           = "reference to undefined group number %v"
	ErrUndefinedNameRef           = "reference to undefined group name %v"
	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
	ErrMalformedReference         = "(?(%v) ) malformed"
	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
	ErrMalformedSlashP            = "malformed \\p{X} character escape"
	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
	ErrMissingControl             = "missing control character"
	ErrUnrecognizedControl        = "unrecognized control character"
	ErrTooFewHex                  = "insufficient hexadecimal digits"
	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
	ErrUnterminatedBracket        = "unterminated [] set"
	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
	ErrReversedCharRange          = "[x-y] range in reverse order"
)
View Source
// Single-letter category codes; the comment on each constant names the
// category it stands for.
// NOTE(review): presumably these classify pattern characters during parsing
// and escaping — the table that uses them is not visible here, confirm.
// NOTE(review): only Q is declared as byte; S, Z, X and E have no type and are
// untyped constants.
const (
	Q byte = 5 // quantifier
	S      = 4 // ordinary stopper
	Z      = 3 // ScanBlank stopper
	X      = 2 // whitespace
	E      = 1 // should be escaped
)
View Source
// Anchor location flags. Each value is a distinct single bit, so several
// anchors can be held together in one value.
// NOTE(review): only AnchorBeginning is declared with the AnchorLoc type; the
// remaining constants have no type and are untyped constants.
const (
	AnchorBeginning    AnchorLoc = 0x0001
	AnchorBol                    = 0x0002
	AnchorStart                  = 0x0004
	AnchorEol                    = 0x0008
	AnchorEndZ                   = 0x0010
	AnchorEnd                    = 0x0020
	AnchorBoundary               = 0x0040
	AnchorECMABoundary           = 0x0080
)

where the regex can be pegged

View Source
const (

	// MaxPrefixSize is the largest number of runes we'll use for a Boyer-Moore prefix.
	MaxPrefixSize = 50
)

Variables

View Source
// Predefined character classes. Each variable holds whatever
// getCharSetFromOldString or getCharSetFromCategoryString returns; neither
// helper's signature is visible in this listing.
//
// The first group is built from explicit rune lists (ecmaWord, ecmaSpace,
// ecmaDigit, or a literal slice); each Not* variant passes true as the second
// argument where its positive counterpart passes false.
//
// The second group is built from category names (wordCategoryText,
// spaceCategoryText, "Nd").
// NOTE(review): NotWordClass and NotSpaceClass pass (true, false) as the two
// leading booleans, but NotDigitClass passes (false, true) — confirm that the
// difference is intentional.
var (
	AnyClass          = getCharSetFromOldString([]rune{0}, false)
	ECMAAnyClass      = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
	NoneClass         = getCharSetFromOldString(nil, false)
	ECMAWordClass     = getCharSetFromOldString(ecmaWord, false)
	NotECMAWordClass  = getCharSetFromOldString(ecmaWord, true)
	ECMASpaceClass    = getCharSetFromOldString(ecmaSpace, false)
	NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
	ECMADigitClass    = getCharSetFromOldString(ecmaDigit, false)
	NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)

	WordClass     = getCharSetFromCategoryString(false, false, wordCategoryText)
	NotWordClass  = getCharSetFromCategoryString(true, false, wordCategoryText)
	SpaceClass    = getCharSetFromCategoryString(false, false, spaceCategoryText)
	NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
	DigitClass    = getCharSetFromCategoryString(false, false, "Nd")
	NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
)
View Source
// NOTE(review): the message is capitalized and ends with a period, which goes
// against the usual Go error-string style. It is left untouched here because
// callers may print or compare the exact text.
var ErrReplacementError = errors.New("Replacement pattern error.")

ErrReplacementError is a general error encountered while parsing the replacement text.

Functions

func CharDescription

func CharDescription(ch rune) string

CharDescription produces a human-readable description for a single character.

func Escape

func Escape(input string) string

func IsECMAWordChar

func IsECMAWordChar(r rune) bool

func IsWordChar

func IsWordChar(r rune) bool

According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/), RL 1.4 Simple Word Boundaries: the class of <word_character> includes all Alphabetic values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.

func Unescape

func Unescape(input string) (string, error)

Types

type AnchorLoc

type AnchorLoc int16

func (AnchorLoc) String

func (anchors AnchorLoc) String() string

String returns a human-readable description of the anchors.

type BmPrefix

type BmPrefix struct {
	// contains filtered or unexported fields
}

BmPrefix precomputes the Boyer-Moore tables for fast string scanning. These tables allow you to scan for the first occurrence of a string within a large body of text without examining every character. The performance of the heuristic depends on the actual string and the text being searched, but usually, the longer the string that is being searched for, the fewer characters need to be examined.

func (*BmPrefix) Dump

func (b *BmPrefix) Dump(indent string) string

Dump returns the contents of the filter as a human readable string

func (*BmPrefix) IsMatch

func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool

When a regex is anchored, we can do a quick IsMatch test instead of a Scan

func (*BmPrefix) Scan

func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int

Scan uses the Boyer-Moore algorithm to find the first occurrence of the specified string within text, beginning at index, and constrained within beglimit and endlimit.

The direction and case-sensitivity of the match are determined by the arguments to the RegexBoyerMoore constructor.

func (*BmPrefix) String

func (b *BmPrefix) String() string

type CharSet

type CharSet struct {
	// contains filtered or unexported fields
}

CharSet combines start-end rune ranges and unicode categories representing a set of characters

func (CharSet) CharIn

func (c CharSet) CharIn(ch rune) bool

CharIn returns true if the rune is in our character set (either ranges or categories). It handles negations and subtracted sub-charsets.

func (CharSet) Copy

func (c CharSet) Copy() CharSet

Copy makes a deep copy to prevent accidental mutation of a set

func (CharSet) HasSubtraction

func (c CharSet) HasSubtraction() bool

func (CharSet) IsEmpty

func (c CharSet) IsEmpty() bool

func (CharSet) IsMergeable

func (c CharSet) IsMergeable() bool

func (CharSet) IsNegated

func (c CharSet) IsNegated() bool

func (CharSet) IsSingleton

func (c CharSet) IsSingleton() bool

func (CharSet) IsSingletonInverse

func (c CharSet) IsSingletonInverse() bool

func (CharSet) SingletonChar

func (c CharSet) SingletonChar() rune

SingletonChar will return the char from the first range without validation. It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input

func (CharSet) String

func (c CharSet) String() string

String returns a human-readable description of the character set.

type Code

// Code is the result type of Write.
// NOTE(review): presumably the compiled instruction program for one regex,
// together with its lookup tables and search hints — confirm.
type Code struct {
	Codes       []int       // the code
	Strings     [][]rune    // string table
	Sets        []*CharSet  // character set table
	TrackCount  int         // how many instructions use backtracking
	Caps        map[int]int // mapping of user group numbers -> impl group slots
	Capsize     int         // number of impl group slots
	FcPrefix    *Prefix     // the set of candidate first characters (may be nil)
	BmPrefix    *BmPrefix   // the fixed prefix string as a Boyer-Moore machine (may be nil)
	Anchors     AnchorLoc   // the set of zero-length start anchors (AnchorBol, etc.)
	RightToLeft bool        // true if right to left
}

func Write

func Write(tree *RegexTree) (*Code, error)

func (*Code) Dump

func (c *Code) Dump() string

func (*Code) OpcodeDescription

func (c *Code) OpcodeDescription(offset int) string

OpcodeDescription is a human-readable string for the opcode at the specified offset.

type Error

type Error struct {
	// Code identifies which parse failure occurred.
	Code ErrorCode
	// Expr is the offending expression.
	Expr string
	// NOTE(review): presumably the values substituted for the %v verbs that
	// some ErrorCode messages contain — confirm in Error.Error.
	Args []interface{}
}

An Error describes a failure to parse a regular expression and gives the offending expression.

func (*Error) Error

func (e *Error) Error() string

type ErrorCode

type ErrorCode string

An ErrorCode describes a failure to parse a regular expression.

func (ErrorCode) String

func (e ErrorCode) String() string

type InstOp

type InstOp int

type Prefix

// Prefix is the type of Code.FcPrefix, which that field describes as the set
// of candidate first characters.
type Prefix struct {
	// NOTE(review): presumably a literal prefix, as runes — confirm.
	PrefixStr       []rune
	// NOTE(review): presumably the set of characters a match may start with — confirm.
	PrefixSet       CharSet
	// NOTE(review): presumably true when the prefix must be compared ignoring case — confirm.
	CaseInsensitive bool
}

type RegexOptions

type RegexOptions int32

type RegexTree

// RegexTree is the parse tree that Parse returns for a regex string.
type RegexTree struct {
	// NOTE(review): presumably maps each capture group name to its group number — confirm.
	Capnames map[string]int
	// NOTE(review): presumably the capture group names in group order — confirm.
	Caplist  []string
	// contains filtered or unexported fields
}

func Parse

func Parse(re string, op RegexOptions) (*RegexTree, error)

Parse converts a regex string into a parse tree

func (*RegexTree) Dump

func (t *RegexTree) Dump() string

type ReplacerData

// ReplacerData is the reusable replacer description that NewReplacerData
// populates from a replacement string and a regexp's capture group data.
type ReplacerData struct {
	// NOTE(review): presumably the original replacement string — confirm.
	Rep     string
	// NOTE(review): presumably the literal text segments of the replacement — confirm.
	Strings []string
	// NOTE(review): presumably encodes the order of literals and group
	// substitutions; the encoding is not visible here — confirm.
	Rules   []int
}

func NewReplacerData

func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error)

NewReplacerData will populate a reusable replacer data struct based on the given replacement string and the capture group data from a regexp

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL