Documentation
¶
Overview ¶
Package unicode provides data and functions to test some properties of Unicode code points.
Example (Is) ¶
Functions starting with "Is" can be used to inspect which range tables a rune belongs to. Note that a rune may fit into more than one range table.
package main import ( "fmt" "unicode" ) func main() { // constant with mixed type runes const mixed = "\b5Ὂg̀9! ℃ᾭG" for _, c := range mixed { fmt.Printf("For %q:\n", c) if unicode.IsControl(c) { fmt.Println("\tis control rune") } if unicode.IsDigit(c) { fmt.Println("\tis digit rune") } if unicode.IsGraphic(c) { fmt.Println("\tis graphic rune") } if unicode.IsLetter(c) { fmt.Println("\tis letter rune") } if unicode.IsLower(c) { fmt.Println("\tis lower case rune") } if unicode.IsMark(c) { fmt.Println("\tis mark rune") } if unicode.IsNumber(c) { fmt.Println("\tis number rune") } if unicode.IsPrint(c) { fmt.Println("\tis printable rune") } if !unicode.IsPrint(c) { fmt.Println("\tis not printable rune") } if unicode.IsPunct(c) { fmt.Println("\tis punct rune") } if unicode.IsSpace(c) { fmt.Println("\tis space rune") } if unicode.IsSymbol(c) { fmt.Println("\tis symbol rune") } if unicode.IsTitle(c) { fmt.Println("\tis title case rune") } if unicode.IsUpper(c) { fmt.Println("\tis upper case rune") } } }
Output: For '\b': is control rune is not printable rune For '5': is digit rune is graphic rune is number rune is printable rune For 'Ὂ': is graphic rune is letter rune is printable rune is upper case rune For 'g': is graphic rune is letter rune is lower case rune is printable rune For '̀': is graphic rune is mark rune is printable rune For '9': is digit rune is graphic rune is number rune is printable rune For '!': is graphic rune is printable rune is punct rune For ' ': is graphic rune is printable rune is space rune For '℃': is graphic rune is printable rune is symbol rune For 'ᾭ': is graphic rune is letter rune is printable rune is title case rune For 'G': is graphic rune is letter rune is printable rune is upper case rune
Index ¶
- Constants
- Variables
- func In(r rune, ranges ...*RangeTable) bool
- func Is(rangeTab *RangeTable, r rune) bool
- func IsControl(r rune) bool
- func IsDigit(r rune) bool
- func IsGraphic(r rune) bool
- func IsLetter(r rune) bool
- func IsLower(r rune) bool
- func IsMark(r rune) bool
- func IsNumber(r rune) bool
- func IsOneOf(ranges []*RangeTable, r rune) bool
- func IsPrint(r rune) bool
- func IsPunct(r rune) bool
- func IsSpace(r rune) bool
- func IsSymbol(r rune) bool
- func IsTitle(r rune) bool
- func IsUpper(r rune) bool
- func SimpleFold(r rune) rune
- func To(_case int, r rune) rune
- func ToLower(r rune) rune
- func ToTitle(r rune) rune
- func ToUpper(r rune) rune
- type CaseRange
- type Range16
- type Range32
- type RangeTable
- type SpecialCase
Examples ¶
Constants ¶
const ( MaxRune = '\U0010FFFF' ReplacementChar = '\uFFFD' MaxASCII = '\u007F' MaxLatin1 = '\u00FF' )
const ( UpperCase = iota LowerCase TitleCase MaxCase )
Indices into the Delta arrays inside CaseRanges for case mapping.
const (
UpperLower = MaxRune + 1
)
If the Delta field of a CaseRange is UpperLower, it means this CaseRange represents a sequence of the form (say) Upper Lower Upper Lower.
const Version = "15.0.0"
Version is the Unicode edition from which the tables are derived.
Variables ¶
var ( Cc = _Cc Cf = _Cf Co = _Co Cs = _Cs Digit = _Nd Nd = _Nd Letter = _L L = _L Lm = _Lm Lo = _Lo Lower = _Ll Ll = _Ll Mark = _M M = _M Mc = _Mc Me = _Me Mn = _Mn Nl = _Nl No = _No Number = _N N = _N Other = _C C = _C Pc = _Pc Pd = _Pd Pe = _Pe Pf = _Pf Pi = _Pi Po = _Po Ps = _Ps Punct = _P P = _P Sc = _Sc Sk = _Sk Sm = _Sm So = _So Space = _Z Z = _Z Symbol = _S S = _S Title = _Lt Lt = _Lt Upper = _Lu Lu = _Lu Zl = _Zl Zp = _Zp Zs = _Zs )
These variables have type *RangeTable.
var ( Adlam = _Adlam Ahom = _Ahom Anatolian_Hieroglyphs = _Anatolian_Hieroglyphs Arabic = _Arabic Armenian = _Armenian Avestan = _Avestan Balinese = _Balinese Bamum = _Bamum Bassa_Vah = _Bassa_Vah Batak = _Batak Bengali = _Bengali Bhaiksuki = _Bhaiksuki Bopomofo = _Bopomofo Brahmi = _Brahmi Braille = _Braille Buginese = _Buginese Buhid = _Buhid Canadian_Aboriginal = _Canadian_Aboriginal Carian = _Carian Caucasian_Albanian = _Caucasian_Albanian Chakma = _Chakma Cham = _Cham Cherokee = _Cherokee Chorasmian = _Chorasmian Common = _Common Coptic = _Coptic Cuneiform = _Cuneiform Cypriot = _Cypriot Cypro_Minoan = _Cypro_Minoan Cyrillic = _Cyrillic Deseret = _Deseret Devanagari = _Devanagari Dives_Akuru = _Dives_Akuru Dogra = _Dogra Duployan = _Duployan Egyptian_Hieroglyphs = _Egyptian_Hieroglyphs Elbasan = _Elbasan Elymaic = _Elymaic Ethiopic = _Ethiopic Georgian = _Georgian Glagolitic = _Glagolitic Gothic = _Gothic Grantha = _Grantha Greek = _Greek Gujarati = _Gujarati Gunjala_Gondi = _Gunjala_Gondi Gurmukhi = _Gurmukhi Han = _Han Hangul = _Hangul Hanifi_Rohingya = _Hanifi_Rohingya Hanunoo = _Hanunoo Hatran = _Hatran Hebrew = _Hebrew Hiragana = _Hiragana Imperial_Aramaic = _Imperial_Aramaic Inherited = _Inherited Inscriptional_Pahlavi = _Inscriptional_Pahlavi Inscriptional_Parthian = _Inscriptional_Parthian Javanese = _Javanese Kaithi = _Kaithi Kannada = _Kannada Katakana = _Katakana Kawi = _Kawi Kayah_Li = _Kayah_Li Kharoshthi = _Kharoshthi Khitan_Small_Script = _Khitan_Small_Script Khmer = _Khmer Khojki = _Khojki Khudawadi = _Khudawadi Lao = _Lao Latin = _Latin Lepcha = _Lepcha Limbu = _Limbu Linear_A = _Linear_A Linear_B = _Linear_B Lisu = _Lisu Lycian = _Lycian Lydian = _Lydian Mahajani = _Mahajani Makasar = _Makasar Malayalam = _Malayalam Mandaic = _Mandaic Manichaean = _Manichaean Marchen = _Marchen Masaram_Gondi = _Masaram_Gondi Medefaidrin = _Medefaidrin Meetei_Mayek = _Meetei_Mayek Mende_Kikakui = _Mende_Kikakui Meroitic_Cursive = _Meroitic_Cursive 
Meroitic_Hieroglyphs = _Meroitic_Hieroglyphs Miao = _Miao Modi = _Modi Mongolian = _Mongolian Mro = _Mro Multani = _Multani Myanmar = _Myanmar Nabataean = _Nabataean Nag_Mundari = _Nag_Mundari Nandinagari = _Nandinagari New_Tai_Lue = _New_Tai_Lue Newa = _Newa Nko = _Nko Nushu = _Nushu Nyiakeng_Puachue_Hmong = _Nyiakeng_Puachue_Hmong Ogham = _Ogham Ol_Chiki = _Ol_Chiki Old_Hungarian = _Old_Hungarian Old_Italic = _Old_Italic Old_North_Arabian = _Old_North_Arabian Old_Permic = _Old_Permic Old_Persian = _Old_Persian Old_Sogdian = _Old_Sogdian Old_South_Arabian = _Old_South_Arabian Old_Turkic = _Old_Turkic Old_Uyghur = _Old_Uyghur Oriya = _Oriya Osage = _Osage Osmanya = _Osmanya Pahawh_Hmong = _Pahawh_Hmong Palmyrene = _Palmyrene Pau_Cin_Hau = _Pau_Cin_Hau Phags_Pa = _Phags_Pa Phoenician = _Phoenician Psalter_Pahlavi = _Psalter_Pahlavi Rejang = _Rejang Runic = _Runic Samaritan = _Samaritan Saurashtra = _Saurashtra Sharada = _Sharada Shavian = _Shavian Siddham = _Siddham SignWriting = _SignWriting Sinhala = _Sinhala Sogdian = _Sogdian Sora_Sompeng = _Sora_Sompeng Soyombo = _Soyombo Sundanese = _Sundanese Syloti_Nagri = _Syloti_Nagri Syriac = _Syriac Tagalog = _Tagalog Tagbanwa = _Tagbanwa Tai_Le = _Tai_Le Tai_Tham = _Tai_Tham Tai_Viet = _Tai_Viet Takri = _Takri Tamil = _Tamil Tangsa = _Tangsa Tangut = _Tangut Telugu = _Telugu Thaana = _Thaana Thai = _Thai Tibetan = _Tibetan Tifinagh = _Tifinagh Tirhuta = _Tirhuta Toto = _Toto Ugaritic = _Ugaritic Vai = _Vai Vithkuqi = _Vithkuqi Wancho = _Wancho Warang_Citi = _Warang_Citi Yezidi = _Yezidi Yi = _Yi Zanabazar_Square = _Zanabazar_Square )
These variables have type *RangeTable.
var ( ASCII_Hex_Digit = _ASCII_Hex_Digit Bidi_Control = _Bidi_Control Dash = _Dash Deprecated = _Deprecated Diacritic = _Diacritic Extender = _Extender Hex_Digit = _Hex_Digit Hyphen = _Hyphen IDS_Binary_Operator = _IDS_Binary_Operator IDS_Trinary_Operator = _IDS_Trinary_Operator Ideographic = _Ideographic Join_Control = _Join_Control Logical_Order_Exception = _Logical_Order_Exception Noncharacter_Code_Point = _Noncharacter_Code_Point Other_Alphabetic = _Other_Alphabetic Other_Default_Ignorable_Code_Point = _Other_Default_Ignorable_Code_Point Other_Grapheme_Extend = _Other_Grapheme_Extend Other_ID_Continue = _Other_ID_Continue Other_ID_Start = _Other_ID_Start Other_Lowercase = _Other_Lowercase Other_Math = _Other_Math Other_Uppercase = _Other_Uppercase Pattern_Syntax = _Pattern_Syntax Pattern_White_Space = _Pattern_White_Space Prepended_Concatenation_Mark = _Prepended_Concatenation_Mark Quotation_Mark = _Quotation_Mark Radical = _Radical Regional_Indicator = _Regional_Indicator STerm = _Sentence_Terminal Sentence_Terminal = _Sentence_Terminal Soft_Dotted = _Soft_Dotted Terminal_Punctuation = _Terminal_Punctuation Unified_Ideograph = _Unified_Ideograph Variation_Selector = _Variation_Selector White_Space = _White_Space )
These variables have type *RangeTable.
var CaseRanges = _CaseRanges
CaseRanges is the table describing case mappings for all letters with non-self mappings.
var Categories = map[string]*RangeTable{ "C": C, "Cc": Cc, "Cf": Cf, "Co": Co, "Cs": Cs, "L": L, "Ll": Ll, "Lm": Lm, "Lo": Lo, "Lt": Lt, "Lu": Lu, "M": M, "Mc": Mc, "Me": Me, "Mn": Mn, "N": N, "Nd": Nd, "Nl": Nl, "No": No, "P": P, "Pc": Pc, "Pd": Pd, "Pe": Pe, "Pf": Pf, "Pi": Pi, "Po": Po, "Ps": Ps, "S": S, "Sc": Sc, "Sk": Sk, "Sm": Sm, "So": So, "Z": Z, "Zl": Zl, "Zp": Zp, "Zs": Zs, }
Categories is the set of Unicode category tables.
var FoldCategory = map[string]*RangeTable{
"L": foldL,
"Ll": foldLl,
"Lt": foldLt,
"Lu": foldLu,
"M": foldM,
"Mn": foldMn,
}
FoldCategory maps a category name to a table of code points outside the category that are equivalent under simple case folding to code points inside the category. If there is no entry for a category name, there are no such points.
var FoldScript = map[string]*RangeTable{
"Common": foldCommon,
"Greek": foldGreek,
"Inherited": foldInherited,
}
FoldScript maps a script name to a table of code points outside the script that are equivalent under simple case folding to code points inside the script. If there is no entry for a script name, there are no such points.
var GraphicRanges = []*RangeTable{ L, M, N, P, S, Zs, }
GraphicRanges defines the set of graphic characters according to Unicode.
var PrintRanges = []*RangeTable{ L, M, N, P, S, }
PrintRanges defines the set of printable characters according to Go. ASCII space, U+0020, is handled separately.
var Properties = map[string]*RangeTable{ "ASCII_Hex_Digit": ASCII_Hex_Digit, "Bidi_Control": Bidi_Control, "Dash": Dash, "Deprecated": Deprecated, "Diacritic": Diacritic, "Extender": Extender, "Hex_Digit": Hex_Digit, "Hyphen": Hyphen, "IDS_Binary_Operator": IDS_Binary_Operator, "IDS_Trinary_Operator": IDS_Trinary_Operator, "Ideographic": Ideographic, "Join_Control": Join_Control, "Logical_Order_Exception": Logical_Order_Exception, "Noncharacter_Code_Point": Noncharacter_Code_Point, "Other_Alphabetic": Other_Alphabetic, "Other_Default_Ignorable_Code_Point": Other_Default_Ignorable_Code_Point, "Other_Grapheme_Extend": Other_Grapheme_Extend, "Other_ID_Continue": Other_ID_Continue, "Other_ID_Start": Other_ID_Start, "Other_Lowercase": Other_Lowercase, "Other_Math": Other_Math, "Other_Uppercase": Other_Uppercase, "Pattern_Syntax": Pattern_Syntax, "Pattern_White_Space": Pattern_White_Space, "Prepended_Concatenation_Mark": Prepended_Concatenation_Mark, "Quotation_Mark": Quotation_Mark, "Radical": Radical, "Regional_Indicator": Regional_Indicator, "Sentence_Terminal": Sentence_Terminal, "STerm": Sentence_Terminal, "Soft_Dotted": Soft_Dotted, "Terminal_Punctuation": Terminal_Punctuation, "Unified_Ideograph": Unified_Ideograph, "Variation_Selector": Variation_Selector, "White_Space": White_Space, }
Properties is the set of Unicode property tables.
var Scripts = map[string]*RangeTable{}/* 163 elements not displayed */
Scripts is the set of Unicode script tables.
Functions ¶
func In ¶ added in v1.2.0
func In(r rune, ranges ...*RangeTable) bool
In reports whether the rune is a member of one of the ranges.
func Is ¶
func Is(rangeTab *RangeTable, r rune) bool
Is reports whether the rune is in the specified table of ranges.
func IsControl ¶
IsControl reports whether the rune is a control character. The C (Other) Unicode category includes more code points such as surrogates; use Is(C, r) to test for them.
func IsDigit ¶
IsDigit reports whether the rune is a decimal digit.
Example ¶
package main import ( "fmt" "unicode" ) func main() { fmt.Printf("%t\n", unicode.IsDigit('৩')) fmt.Printf("%t\n", unicode.IsDigit('A')) }
Output: true false
func IsGraphic ¶
IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such characters include letters, marks, numbers, punctuation, symbols, and spaces, from categories L, M, N, P, S, Zs.
func IsLetter ¶
IsLetter reports whether the rune is a letter (category L).
Example ¶
package main import ( "fmt" "unicode" ) func main() { fmt.Printf("%t\n", unicode.IsLetter('A')) fmt.Printf("%t\n", unicode.IsLetter('7')) }
Output: true false
func IsLower ¶
IsLower reports whether the rune is a lower case letter.
Example ¶
package main import ( "fmt" "unicode" ) func main() { fmt.Printf("%t\n", unicode.IsLower('a')) fmt.Printf("%t\n", unicode.IsLower('A')) }
Output: true false
func IsNumber ¶
IsNumber reports whether the rune is a number (category N).
Example ¶
package main import ( "fmt" "unicode" ) func main() { fmt.Printf("%t\n", unicode.IsNumber('Ⅷ')) fmt.Printf("%t\n", unicode.IsNumber('A')) }
Output: true false
func IsOneOf ¶
func IsOneOf(ranges []*RangeTable, r rune) bool
IsOneOf reports whether the rune is a member of one of the ranges. The function "In" provides a nicer signature and should be used in preference to IsOneOf.
func IsPrint ¶
IsPrint reports whether the rune is defined as printable by Go. Such characters include letters, marks, numbers, punctuation, and symbols, from categories L, M, N, P, S, plus the ASCII space character. This categorization is the same as IsGraphic except that the only spacing character is ASCII space, U+0020.
func IsSpace ¶
IsSpace reports whether the rune is a space character as defined by Unicode's White Space property; in the Latin-1 space this is
'\t', '\n', '\v', '\f', '\r', ' ', U+0085 (NEL), U+00A0 (NBSP).
Other definitions of spacing characters are set by category Z and property Pattern_White_Space.
Example ¶
package main import ( "fmt" "unicode" ) func main() { fmt.Printf("%t\n", unicode.IsSpace(' ')) fmt.Printf("%t\n", unicode.IsSpace('\n')) fmt.Printf("%t\n", unicode.IsSpace('\t')) fmt.Printf("%t\n", unicode.IsSpace('a')) }
Output: true true true false
func IsTitle ¶
IsTitle reports whether the rune is a title case letter.
Example ¶
package main import ( "fmt" "unicode" ) func main() { fmt.Printf("%t\n", unicode.IsTitle('Dž')) fmt.Printf("%t\n", unicode.IsTitle('a')) }
Output: true false
func IsUpper ¶
IsUpper reports whether the rune is an upper case letter.
Example ¶
package main import ( "fmt" "unicode" ) func main() { fmt.Printf("%t\n", unicode.IsUpper('A')) fmt.Printf("%t\n", unicode.IsUpper('a')) }
Output: true false
func SimpleFold ¶
SimpleFold iterates over Unicode code points equivalent under the Unicode-defined simple case folding. Among the code points equivalent to r (including r itself), SimpleFold returns the smallest rune > r if one exists, or else the smallest rune >= 0. If r is not a valid Unicode code point, SimpleFold(r) returns r.
For example:
SimpleFold('A') = 'a' SimpleFold('a') = 'A' SimpleFold('K') = 'k' SimpleFold('k') = '\u212A' (Kelvin symbol, K) SimpleFold('\u212A') = 'K' SimpleFold('1') = '1' SimpleFold(-2) = -2
Example ¶
package main import ( "fmt" "unicode" ) func main() { fmt.Printf("%#U\n", unicode.SimpleFold('A')) // 'a' fmt.Printf("%#U\n", unicode.SimpleFold('a')) // 'A' fmt.Printf("%#U\n", unicode.SimpleFold('K')) // 'k' fmt.Printf("%#U\n", unicode.SimpleFold('k')) // '\u212A' (Kelvin symbol, K) fmt.Printf("%#U\n", unicode.SimpleFold('\u212A')) // 'K' fmt.Printf("%#U\n", unicode.SimpleFold('1')) // '1' }
Output: U+0061 'a' U+0041 'A' U+006B 'k' U+212A 'K' U+004B 'K' U+0031 '1'
func To ¶
To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
Example ¶
package main import ( "fmt" "unicode" ) func main() { const lcG = 'g' fmt.Printf("%#U\n", unicode.To(unicode.UpperCase, lcG)) fmt.Printf("%#U\n", unicode.To(unicode.LowerCase, lcG)) fmt.Printf("%#U\n", unicode.To(unicode.TitleCase, lcG)) const ucG = 'G' fmt.Printf("%#U\n", unicode.To(unicode.UpperCase, ucG)) fmt.Printf("%#U\n", unicode.To(unicode.LowerCase, ucG)) fmt.Printf("%#U\n", unicode.To(unicode.TitleCase, ucG)) }
Output: U+0047 'G' U+0067 'g' U+0047 'G' U+0047 'G' U+0067 'g' U+0047 'G'
func ToLower ¶
ToLower maps the rune to lower case.
Example ¶
package main import ( "fmt" "unicode" ) func main() { const ucG = 'G' fmt.Printf("%#U\n", unicode.ToLower(ucG)) }
Output: U+0067 'g'
Types ¶
type CaseRange ¶
CaseRange represents a range of Unicode code points for simple (one code point to one code point) case conversion. The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas are the number to add to the code point to reach the code point for a different case for that character. They may be negative. If zero, it means the character is in the corresponding case. There is a special case representing sequences of alternating corresponding Upper and Lower pairs. It appears with a fixed Delta of
{UpperLower, UpperLower, UpperLower}
The constant UpperLower has an otherwise impossible delta value.
type Range16 ¶
Range16 represents a range of 16-bit Unicode code points. The range runs from Lo to Hi inclusive and has the specified stride.
type Range32 ¶
Range32 represents a range of Unicode code points and is used when one or more of the values will not fit in 16 bits. The range runs from Lo to Hi inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
type RangeTable ¶
RangeTable defines a set of Unicode code points by listing the ranges of code points within the set. The ranges are listed in two slices to save space: a slice of 16-bit ranges and a slice of 32-bit ranges. The two slices must be in sorted order and non-overlapping. Also, R32 should contain only values >= 0x10000 (1<<16).
type SpecialCase ¶
type SpecialCase []CaseRange
SpecialCase represents language-specific case mappings such as Turkish. Methods of SpecialCase customize (by overriding) the standard mappings.
Example ¶
package main import ( "fmt" "unicode" ) func main() { t := unicode.TurkishCase const lci = 'i' fmt.Printf("%#U\n", t.ToLower(lci)) fmt.Printf("%#U\n", t.ToTitle(lci)) fmt.Printf("%#U\n", t.ToUpper(lci)) const uci = 'İ' fmt.Printf("%#U\n", t.ToLower(uci)) fmt.Printf("%#U\n", t.ToTitle(uci)) fmt.Printf("%#U\n", t.ToUpper(uci)) }
Output: U+0069 'i' U+0130 'İ' U+0130 'İ' U+0069 'i' U+0130 'İ' U+0130 'İ'
var AzeriCase SpecialCase = _TurkishCase
var TurkishCase SpecialCase = _TurkishCase
func (SpecialCase) ToLower ¶
func (special SpecialCase) ToLower(r rune) rune
ToLower maps the rune to lower case giving priority to the special mapping.
func (SpecialCase) ToTitle ¶
func (special SpecialCase) ToTitle(r rune) rune
ToTitle maps the rune to title case giving priority to the special mapping.
func (SpecialCase) ToUpper ¶
func (special SpecialCase) ToUpper(r rune) rune
ToUpper maps the rune to upper case giving priority to the special mapping.
Directories
¶
Path | Synopsis |
---|---|
utf16 | Package utf16 implements encoding and decoding of UTF-16 sequences. |
utf8 | Package utf8 implements functions and constants to support text encoded in UTF-8. |