Documentation ¶
Index ¶
- Constants
- Variables
- func AddCharset(c *Charset)
- func AddCollation(c *Collation)
- func AddSupportedCollation(c *Collation)
- func CountValidBytes(e Encoding, src []byte) int
- func CountValidBytesDecode(e Encoding, src []byte) int
- func GetCharsetInfoByID(coID int) (charsetStr string, collateStr string, err error)
- func GetDefaultCharsetAndCollate() (defaultCharset string, defaultCollationName string)
- func GetDefaultCollation(charset string) (string, error)
- func GetDefaultCollationLegacy(charset string) (string, error)
- func HackSlice(s string) (b []byte)
- func HackString(b []byte) (s string)
- func IsSupportedEncoding(charset string) bool
- func Lookup(label string) (e encoding.Encoding, name string)
- func NewCustomGBKEncoder() *encoding.Encoder
- func RemoveCharset(c string)
- func ValidCharsetAndCollation(cs string, co string) bool
- type Charset
- type Collation
- type Encoding
- type EncodingTp
- type Op
Constants ¶
const ( // CollationBin is the default collation for CharsetBin. CollationBin = "binary" // CollationUTF8 is the default collation for CharsetUTF8. CollationUTF8 = "utf8_bin" // CollationUTF8MB4 is the default collation for CharsetUTF8MB4. CollationUTF8MB4 = "utf8mb4_bin" // CollationASCII is the default collation for CharsetASCII. CollationASCII = "ascii_bin" // CollationLatin1 is the default collation for CharsetLatin1. CollationLatin1 = "latin1_bin" // CollationGBKBin is the default collation for CharsetGBK when new collation is disabled. CollationGBKBin = "gbk_bin" // CollationGBKChineseCI is the default collation for CharsetGBK when new collation is enabled. CollationGBKChineseCI = "gbk_chinese_ci" )
const ( // CharsetASCII is a subset of UTF8. CharsetASCII = "ascii" // CharsetBin is used for marking binary charset. CharsetBin = "binary" // CharsetLatin1 is a single byte charset. CharsetLatin1 = "latin1" // CharsetUTF8 is the default charset for string types. CharsetUTF8 = "utf8" // CharsetUTF8MB3 is 3 bytes utf8, a MySQL legacy encoding. "utf8" and "utf8mb3" are aliases. CharsetUTF8MB3 = "utf8mb3" // CharsetUTF8MB4 represents 4 bytes utf8, which works the same way as utf8 in Go. CharsetUTF8MB4 = "utf8mb4" //revive:disable:exported CharsetARMSCII8 = "armscii8" CharsetBig5 = "big5" CharsetCP1250 = "cp1250" CharsetCP1251 = "cp1251" CharsetCP1256 = "cp1256" CharsetCP1257 = "cp1257" CharsetCP850 = "cp850" CharsetCP852 = "cp852" CharsetCP866 = "cp866" CharsetCP932 = "cp932" CharsetDEC8 = "dec8" CharsetEUCJPMS = "eucjpms" CharsetEUCKR = "euckr" CharsetGB18030 = "gb18030" CharsetGB2312 = "gb2312" CharsetGBK = "gbk" CharsetGEOSTD8 = "geostd8" CharsetGreek = "greek" CharsetHebrew = "hebrew" CharsetHP8 = "hp8" CharsetKEYBCS2 = "keybcs2" CharsetKOI8R = "koi8r" CharsetKOI8U = "koi8u" CharsetLatin2 = "latin2" CharsetLatin5 = "latin5" CharsetLatin7 = "latin7" CharsetMacCE = "macce" CharsetMacRoman = "macroman" CharsetSJIS = "sjis" CharsetSWE7 = "swe7" CharsetTIS620 = "tis620" CharsetUCS2 = "ucs2" CharsetUJIS = "ujis" CharsetUTF16 = "utf16" CharsetUTF16LE = "utf16le" CharsetUTF32 = "utf32" )
const ( // OpReplaceNoErr is used to replace invalid bytes with '?'. OpReplaceNoErr = opFromUTF8 | opTruncateReplace | opCollectFrom | opSkipError OpReplace = opFromUTF8 | opTruncateReplace | opCollectFrom OpEncode = opFromUTF8 | opTruncateTrim | opCollectTo OpEncodeNoErr = OpEncode | opSkipError OpEncodeReplace = opFromUTF8 | opTruncateReplace | opCollectTo OpDecode = opToUTF8 | opTruncateTrim | opCollectTo OpDecodeNoErr = OpDecode | opSkipError OpDecodeReplace = opToUTF8 | opTruncateReplace | opCollectTo )
Variables ¶
var ( // ErrUnknownCollation is unknown collation. ErrUnknownCollation = terror.ClassDDL.NewStd(mysql.ErrUnknownCollation) // ErrCollationCharsetMismatch is collation charset mismatch. ErrCollationCharsetMismatch = terror.ClassDDL.NewStd(mysql.ErrCollationCharsetMismatch) )
var ( // PadSpace is to mark that trailing spaces are insignificant in comparisons PadSpace = "PAD SPACE" // PadNone is to mark that trailing spaces are significant in comparisons PadNone = "NO PAD" )
var CharacterSetInfos = map[string]*Charset{ CharsetUTF8: {CharsetUTF8, CollationUTF8, make(map[string]*Collation), "UTF-8 Unicode", 3}, CharsetUTF8MB4: {CharsetUTF8MB4, CollationUTF8MB4, make(map[string]*Collation), "UTF-8 Unicode", 4}, CharsetASCII: {CharsetASCII, CollationASCII, make(map[string]*Collation), "US ASCII", 1}, CharsetLatin1: {CharsetLatin1, CollationLatin1, make(map[string]*Collation), "Latin1", 1}, CharsetBin: {CharsetBin, CollationBin, make(map[string]*Collation), "binary", 1}, CharsetGBK: {CharsetGBK, CollationGBKBin, make(map[string]*Collation), "Chinese Internal Code Specification", 2}, }
CharacterSetInfos contains all the supported charsets.
var EncodingASCIIImpl = &encodingASCII{encodingBase{enc: encoding.Nop}}
EncodingASCIIImpl is the instance of encodingASCII.
var EncodingBinImpl = &encodingBin{encodingBase{enc: encoding.Nop}}
EncodingBinImpl is the instance of encodingBin.
var EncodingGBKImpl = &encodingGBK{encodingBase{enc: customGBK{}}}
EncodingGBKImpl is the instance of encodingGBK.
var EncodingLatin1Impl = &encodingLatin1{encodingUTF8{encodingBase{enc: encoding.Nop}}}
EncodingLatin1Impl is the instance of encodingLatin1. TiDB uses the utf8 implementation for the latin1 charset for backward compatibility.
var EncodingUTF8Impl = &encodingUTF8{encodingBase{enc: encoding.Nop}}
EncodingUTF8Impl is the instance of encodingUTF8.
var EncodingUTF8MB3StrictImpl = &encodingUTF8MB3Strict{ encodingUTF8{ encodingBase{ enc: encoding.Nop, }, }, }
EncodingUTF8MB3StrictImpl is the instance of encodingUTF8MB3Strict.
var ErrInvalidCharacterString = terror.ClassParser.NewStd(mysql.ErrInvalidCharacterString)
ErrInvalidCharacterString is returned when the string is invalid in the specified charset.
var GBKCase = unicode.SpecialCase{ unicode.CaseRange{Lo: 0x00E0, Hi: 0x00E1, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x00E8, Hi: 0x00EA, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x00EC, Hi: 0x00ED, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x00F2, Hi: 0x00F3, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x00F9, Hi: 0x00FA, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x00FC, Hi: 0x00FC, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x0101, Hi: 0x0101, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x0113, Hi: 0x0113, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x011B, Hi: 0x011B, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x012B, Hi: 0x012B, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x0144, Hi: 0x0144, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x0148, Hi: 0x0148, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x014D, Hi: 0x014D, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x016B, Hi: 0x016B, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x01CE, Hi: 0x01CE, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x01D0, Hi: 0x01D0, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x01D2, Hi: 0x01D2, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x01D4, Hi: 0x01D4, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x01D6, Hi: 0x01D6, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x01D8, Hi: 0x01D8, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x01DA, Hi: 0x01DA, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x01DC, Hi: 0x01DC, Delta: [unicode.MaxCase]rune{0, 0, 0}}, unicode.CaseRange{Lo: 0x216A, Hi: 0x216B, Delta: [unicode.MaxCase]rune{0, 0, 0}}, }
GBKCase follows https://dev.mysql.com/worklog/task/?id=4583.
var TiFlashSupportedCharsets = map[string]struct{}{
CharsetUTF8: {},
CharsetUTF8MB4: {},
CharsetASCII: {},
CharsetLatin1: {},
CharsetBin: {},
}
TiFlashSupportedCharsets is a map that contains the charsets supported by TiFlash.
Functions ¶
func AddCharset ¶
func AddCharset(c *Charset)
AddCharset adds a new charset. Use only when adding a custom charset to the parser.
func AddCollation ¶
func AddCollation(c *Collation)
AddCollation adds a new collation. Use only when adding a custom collation to the parser.
func AddSupportedCollation ¶
func AddSupportedCollation(c *Collation)
AddSupportedCollation adds a new collation into supportedCollations. Use only when adding a custom collation to the parser.
func CountValidBytes ¶
CountValidBytes counts the first valid bytes in src that can be encoded to the current encoding.
func CountValidBytesDecode ¶
CountValidBytesDecode counts the first valid bytes in src that can be decoded to utf-8.
func GetCharsetInfoByID ¶
GetCharsetInfoByID returns charset and collation for id as cs_number.
func GetDefaultCharsetAndCollate ¶
GetDefaultCharsetAndCollate returns the default charset and collation.
func GetDefaultCollation ¶
GetDefaultCollation returns the default collation for charset.
func GetDefaultCollationLegacy ¶
GetDefaultCollationLegacy is compatible with the charset support in old version parser.
func HackString ¶
HackString converts a byte slice to a string without copying. Use it at your own risk.
func IsSupportedEncoding ¶
IsSupportedEncoding checks if the charset is fully supported.
func Lookup ¶
Lookup returns the encoding with the specified label, and its canonical name. It returns nil and the empty string if label is not one of the standard encodings for HTML. Matching is case-insensitive and ignores leading and trailing whitespace.
func NewCustomGBKEncoder ¶
NewCustomGBKEncoder returns a custom GBK encoder.
func RemoveCharset ¶
func RemoveCharset(c string)
RemoveCharset removes a charset. Use only when removing a custom charset from the parser.
func ValidCharsetAndCollation ¶
ValidCharsetAndCollation checks the charset and the collation validity and returns a boolean.
Types ¶
type Charset ¶
type Charset struct { Name string DefaultCollation string Collations map[string]*Collation Desc string Maxlen int }
Charset is a charset. Now we only support MySQL.
func GetCharsetInfo ¶
GetCharsetInfo returns the charset and collation for the charset named cs.
func GetSupportedCharsets ¶
func GetSupportedCharsets() []*Charset
GetSupportedCharsets gets descriptions for all charsets supported so far.
type Collation ¶
type Collation struct { ID int CharsetName string Name string IsDefault bool Sortlen int PadAttribute string }
Collation is a collation. Now we only support MySQL.
func GetCollationByID ¶
GetCollationByID returns the collation with the given id.
func GetCollationByName ¶
GetCollationByName returns the collation by name.
func GetSupportedCollations ¶
func GetSupportedCollations() []*Collation
GetSupportedCollations gets information for all collations supported so far.
type Encoding ¶
type Encoding interface { // Name is the name of the encoding. Name() string // Tp is the type of the encoding. Tp() EncodingTp // Peek returns the next char. Peek(src []byte) []byte // MbLen returns the multi-byte length of the next character; if the next character is a single byte, it returns 0. MbLen(string) int // IsValid checks whether the utf-8 bytes can be converted to a valid string in the current encoding. IsValid(src []byte) bool // Foreach iterates over the characters in the current encoding. Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool) // Transform maps the bytes in src to dest according to Op. // **the caller should initialize the dest if it wants to avoid memory alloc every time, // or else it will always make a new one** // // **the returned array may be an alias of `src`; edit the returned array at your own risk** Transform(dest *bytes.Buffer, src []byte, op Op) ([]byte, error) // ToUpper changes a string to uppercase. ToUpper(src string) string // ToLower changes a string to lowercase. ToLower(src string) string }
Encoding provides encode/decode functions for a string with a specific charset.
func FindEncoding ¶
FindEncoding finds the encoding according to charset.
func FindEncodingTakeUTF8AsNoop ¶
FindEncodingTakeUTF8AsNoop finds the encoding according to the charset except that utf-8 is treated as no-operation encoding. This is used to reduce the overhead of utf-8 validation in some cases.
type EncodingTp ¶
type EncodingTp int8
EncodingTp is the type of the encoding.
const ( EncodingTpNone EncodingTp = iota EncodingTpUTF8 EncodingTpUTF8MB3Strict EncodingTpASCII EncodingTpLatin1 EncodingTpBin EncodingTpGBK )