ucd

package

Version: v0.10.0 (latest) · Published: Jun 12, 2023 · License: BSD-3-Clause · Imports: 8 · Imported by: 0

Documentation ¶

Overview ¶

Package ucd provides a parser for Unicode Character Database files, the format of which is defined in https://www.unicode.org/reports/tr44/. See https://www.unicode.org/Public/UCD/latest/ucd/ for example files.

It currently does not support substitutions of missing fields.

Example ¶

package main

import (
	"fmt"
	"strings"

	"golang.org/x/text/internal/ucd"
)

// main demonstrates the two ways of reading a UCD file with the ucd package:
// visiting entries one rune at a time, and reading ranges unexpanded.
func main() {
	// Pass 1: walk UnicodeData rune by rune and report every rune that has
	// a simple lowercase mapping.
	visited := 0
	parser := ucd.New(strings.NewReader(unicodeData))
	for parser.Next() {
		visited++
		lower := parser.Runes(ucd.SimpleLowercaseMapping)
		if lower == nil {
			// No lowercase mapping on this entry; nothing to print.
			continue
		}
		fmt.Printf("lower(%U) -> %U\n", parser.Rune(0), lower[0])
	}
	if err := parser.Err(); err != nil {
		fmt.Println(err)
	}
	fmt.Println("Number of runes visited:", visited)

	// Pass 2: read Scripts with KeepRanges so that each entry is reported as
	// the range written in the file instead of being expanded.
	rangeParser := ucd.New(strings.NewReader(scripts), ucd.KeepRanges)
	for rangeParser.Next() {
		first, last := rangeParser.Range(0)
		fmt.Printf("%04X..%04X: %s\n", first, last, rangeParser.String(1))
	}
	if err := rangeParser.Err(); err != nil {
		fmt.Println(err)
	}
}

// unicodeData is an excerpt from UnicodeData.txt. Each line is one entry made
// of semicolon-separated fields. The final two entries use the legacy
// <..., First> / <..., Last> notation to describe a rune range.
const unicodeData = `
00B9;SUPERSCRIPT ONE;No;0;EN;<super> 0031;;1;1;N;SUPERSCRIPT DIGIT ONE;;;;
00BA;MASCULINE ORDINAL INDICATOR;Lo;0;L;<super> 006F;;;;N;;;;;
00BB;RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK;Pf;0;ON;;;;;Y;RIGHT POINTING GUILLEMET;;;;
00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;
00BD;VULGAR FRACTION ONE HALF;No;0;ON;<fraction> 0031 2044 0032;;;1/2;N;FRACTION ONE HALF;;;;
00BE;VULGAR FRACTION THREE QUARTERS;No;0;ON;<fraction> 0033 2044 0034;;;3/4;N;FRACTION THREE QUARTERS;;;;
00BF;INVERTED QUESTION MARK;Po;0;ON;;;;;N;;;;;
00C0;LATIN CAPITAL LETTER A WITH GRAVE;Lu;0;L;0041 0300;;;;N;LATIN CAPITAL LETTER A GRAVE;;;00E0;
00C1;LATIN CAPITAL LETTER A WITH ACUTE;Lu;0;L;0041 0301;;;;N;LATIN CAPITAL LETTER A ACUTE;;;00E1;
00C2;LATIN CAPITAL LETTER A WITH CIRCUMFLEX;Lu;0;L;0041 0302;;;;N;LATIN CAPITAL LETTER A CIRCUMFLEX;;;00E2;
00C3;LATIN CAPITAL LETTER A WITH TILDE;Lu;0;L;0041 0303;;;;N;LATIN CAPITAL LETTER A TILDE;;;00E3;
00C4;LATIN CAPITAL LETTER A WITH DIAERESIS;Lu;0;L;0041 0308;;;;N;LATIN CAPITAL LETTER A DIAERESIS;;;00E4;

# A legacy rune range.
3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
`

// scripts is an excerpt from Scripts.txt. Each entry is a single rune or a
// first..last rune range, followed by ';' and the script name. Text after a
// '#' is a comment.
const scripts = `
# Property:	Script
# ================================================

0000..001F    ; Common # Cc  [32] <control-0000>..<control-001F>
0020          ; Common # Zs       SPACE
0021..0023    ; Common # Po   [3] EXCLAMATION MARK..NUMBER SIGN
0024          ; Common # Sc       DOLLAR SIGN
`

Output:

lower(U+00C0) -> U+00E0
lower(U+00C1) -> U+00E1
lower(U+00C2) -> U+00E2
lower(U+00C3) -> U+00E3
lower(U+00C4) -> U+00E4
Number of runes visited: 6594
0000..001F: Common
0020..0020: Common
0021..0023: Common
0024..0024: Common

Index ¶

Constants
func Parse(r io.ReadCloser, f func(p *Parser))
type Option
- func CommentHandler(f func(s string)) Option
- func Part(f func(p *Parser)) Option
type Parser
- func New(r io.Reader, o ...Option) *Parser

Examples ¶

Package

Constants ¶

View Source

// Indices of the semicolon-separated fields of a UnicodeData.txt entry,
// counted from 0. They are meant to be passed as the field argument i of the
// Parser accessors, for example p.Runes(SimpleLowercaseMapping).
const (
	CodePoint               = iota // field 0
	Name                           // field 1
	GeneralCategory                // field 2
	CanonicalCombiningClass        // field 3
	BidiClass                      // field 4
	DecompMapping                  // field 5
	DecimalValue                   // field 6
	DigitValue                     // field 7
	NumericValue                   // field 8
	BidiMirrored                   // field 9
	Unicode1Name                   // field 10
	ISOComment                     // field 11
	SimpleUppercaseMapping         // field 12
	SimpleLowercaseMapping         // field 13
	SimpleTitlecaseMapping         // field 14
)

UnicodeData.txt fields.

Variables ¶

This section is empty.

Functions ¶

func Parse ¶

func Parse(r io.ReadCloser, f func(p *Parser))

Parse calls f for each entry in the given reader of a UCD file. It closes the reader upon return and calls log.Fatal if any error occurs.

This implements the most common usage pattern of using Parser.

Types ¶

type Option ¶

type Option func(p *Parser)

An Option is used to configure a Parser.

var (
	// KeepRanges prevents the expansion of ranges. The raw ranges can be
	// obtained by calling Range(0) on the parser.
	KeepRanges Option = keepRanges
)

func CommentHandler ¶

func CommentHandler(f func(s string)) Option

The CommentHandler option passes comments that are on a line by themselves to the given handler.

func Part ¶

func Part(f func(p *Parser)) Option

The Part option registers a handler for lines starting with an '@'. The text after the '@' is available as the first field. Comments are handled as usual.

type Parser ¶

type Parser struct {
	// contains filtered or unexported fields
}

A Parser parses Unicode Character Database (UCD) files.

func New ¶

func New(r io.Reader, o ...Option) *Parser

New returns a Parser for the given Reader.

func (*Parser) Bool ¶

func (p *Parser) Bool(i int) bool

Bool parses and returns field i as a boolean value.

func (*Parser) Comment ¶

func (p *Parser) Comment() string

Comment returns the comments for the current line.

func (*Parser) Enum ¶

func (p *Parser) Enum(i int, enum ...string) string

Enum interprets and returns field i as a value that must be one of the values in enum.

func (*Parser) Err ¶

func (p *Parser) Err() error

Err returns a non-nil error if any error occurred during parsing.

func (*Parser) Float ¶

func (p *Parser) Float(i int) float64

Float parses and returns field i as a decimal value.

func (*Parser) Int ¶

func (p *Parser) Int(i int) int

Int parses and returns field i as an integer value.

func (*Parser) Next ¶

func (p *Parser) Next() bool

Next parses the next line in the file. It returns true if a line was parsed and false if it reached the end of the file.

func (*Parser) Range ¶

func (p *Parser) Range(i int) (first, last rune)

Range parses and returns field i as a rune range. A range is inclusive at both ends. If the field only has one rune, first and last will be identical. It supports the legacy format for ranges used in UnicodeData.txt.

func (*Parser) Rune ¶

func (p *Parser) Rune(i int) rune

Rune parses and returns field i as a rune.

func (*Parser) Runes ¶

func (p *Parser) Runes(i int) (runes []rune)

Runes interprets and returns field i as a sequence of runes.

func (*Parser) String ¶

func (p *Parser) String(i int) string

String parses and returns field i as a string value.

func (*Parser) Strings ¶

func (p *Parser) Strings(i int) []string

Strings parses and returns field i as a space-separated list of strings.

func (*Parser) Uint ¶

func (p *Parser) Uint(i int) uint

Uint parses and returns field i as an unsigned integer value.

Source Files ¶

View all Source files

ucd.go

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL