charset

package

v0.2.13 Latest Latest Go to latest Published: Mar 22, 2023 License: Apache-2.0 Imports: 21 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/arana-db/parser

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func AddCharset(c *Charset)
func AddCollation(c *Collation)
func AddSupportedCollation(c *Collation)
func CountValidBytes(e Encoding, src []byte) int
func CountValidBytesDecode(e Encoding, src []byte) int
func GetCharsetInfoByID(coID int) (string, string, error)
func GetDefaultCharsetAndCollate() (string, string)
func GetDefaultCollation(charset string) (string, error)
func GetDefaultCollationLegacy(charset string) (string, error)
func HackSlice(s string) (b []byte)
func HackString(b []byte) (s string)
func IsSupportedEncoding(charset string) bool
func Lookup(label string) (e encoding.Encoding, name string)
func NewCustomGBKEncoder() *encoding.Encoder
func RemoveCharset(c string)
func ValidCharsetAndCollation(cs string, co string) bool
type Charset
- func GetCharsetInfo(cs string) (*Charset, error)
- func GetSupportedCharsets() []*Charset
type Collation
- func GetCollationByID(id int) (*Collation, error)
- func GetCollationByName(name string) (*Collation, error)
- func GetCollations() []*Collation
- func GetSupportedCollations() []*Collation
type Encoding
- func FindEncoding(charset string) Encoding
- func FindEncodingTakeUTF8AsNoop(charset string) Encoding
type EncodingTp
type Op

Constants ¶

View Source

const (
	// CharsetBin is used for marking binary charset.
	CharsetBin = "binary"
	// CollationBin is the default collation for CharsetBin.
	CollationBin = "binary"
	// CharsetUTF8 is the default charset for string types.
	CharsetUTF8 = "utf8"
	// CollationUTF8 is the default collation for CharsetUTF8.
	CollationUTF8 = "utf8_bin"
	// CharsetUTF8MB4 represents 4 bytes utf8, which works the same way as utf8 in Go.
	CharsetUTF8MB4 = "utf8mb4"
	// CollationUTF8MB4 is the default collation for CharsetUTF8MB4.
	CollationUTF8MB4 = "utf8mb4_bin"
	// CharsetASCII is a subset of UTF8.
	CharsetASCII = "ascii"
	// CollationASCII is the default collation for CharsetASCII.
	CollationASCII = "ascii_bin"
	// CharsetLatin1 is a single byte charset.
	CharsetLatin1 = "latin1"
	// CollationLatin1 is the default collation for CharsetLatin1.
	CollationLatin1 = "latin1_bin"

	// Collation names for the gbk charset. CollationGBKBin is the one used as
	// the default collation of CharsetGBK in CharacterSetInfos.
	CollationGBKBin       = "gbk_bin"
	CollationGBKChineseCI = "gbk_chinese_ci"

	// Names of further character sets. CharsetBinary has the same value as
	// CharsetBin.
	CharsetARMSCII8 = "armscii8"
	CharsetBig5     = "big5"
	CharsetBinary   = "binary"
	CharsetCP1250   = "cp1250"
	CharsetCP1251   = "cp1251"
	CharsetCP1256   = "cp1256"
	CharsetCP1257   = "cp1257"
	CharsetCP850    = "cp850"
	CharsetCP852    = "cp852"
	CharsetCP866    = "cp866"
	CharsetCP932    = "cp932"
	CharsetDEC8     = "dec8"
	CharsetEUCJPMS  = "eucjpms"
	CharsetEUCKR    = "euckr"
	CharsetGB18030  = "gb18030"
	CharsetGB2312   = "gb2312"
	CharsetGBK      = "gbk"
	CharsetGEOSTD8  = "geostd8"
	CharsetGreek    = "greek"
	CharsetHebrew   = "hebrew"
	CharsetHP8      = "hp8"
	CharsetKEYBCS2  = "keybcs2"
	CharsetKOI8R    = "koi8r"
	CharsetKOI8U    = "koi8u"
	CharsetLatin2   = "latin2"
	CharsetLatin5   = "latin5"
	CharsetLatin7   = "latin7"
	CharsetMacCE    = "macce"
	CharsetMacRoman = "macroman"
	CharsetSJIS     = "sjis"
	CharsetSWE7     = "swe7"
	CharsetTIS620   = "tis620"
	CharsetUCS2     = "ucs2"
	CharsetUJIS     = "ujis"
	CharsetUTF16    = "utf16"
	CharsetUTF16LE  = "utf16le"
	CharsetUTF32    = "utf32"
)

View Source

// Predefined Op values, each formed by OR-ing the package's unexported op*
// flags. The *NoErr variants are the corresponding base value with
// opSkipError added; OpEncodeReplace and OpDecodeReplace differ from OpEncode
// and OpDecode by using opTruncateReplace in place of opTruncateTrim.
// NOTE(review): the exact effect of each op* flag is defined outside this
// view — confirm against the flag declarations before relying on it.
const (
	OpReplaceNoErr  = opFromUTF8 | opTruncateReplace | opCollectFrom | opSkipError
	OpReplace       = opFromUTF8 | opTruncateReplace | opCollectFrom
	OpEncode        = opFromUTF8 | opTruncateTrim | opCollectTo
	OpEncodeNoErr   = OpEncode | opSkipError
	OpEncodeReplace = opFromUTF8 | opTruncateReplace | opCollectTo
	OpDecode        = opToUTF8 | opTruncateTrim | opCollectTo
	OpDecodeNoErr   = OpDecode | opSkipError
	OpDecodeReplace = opToUTF8 | opTruncateReplace | opCollectTo
)

Variables ¶

View Source

// Collation-related errors, created in the DDL error class from the
// matching mysql error codes.
var (
	ErrUnknownCollation         = terror.ClassDDL.NewStd(mysql.ErrUnknownCollation)
	ErrCollationCharsetMismatch = terror.ClassDDL.NewStd(mysql.ErrCollationCharsetMismatch)
)

View Source

// CharacterSetInfos maps a charset name to its Charset description.
// Every supported charset should have an entry in this table.
var CharacterSetInfos = map[string]*Charset{
	CharsetUTF8: {
		Name:             CharsetUTF8,
		DefaultCollation: CollationUTF8,
		Collations:       make(map[string]*Collation),
		Desc:             "UTF-8 Unicode",
		Maxlen:           3,
	},
	CharsetUTF8MB4: {
		Name:             CharsetUTF8MB4,
		DefaultCollation: CollationUTF8MB4,
		Collations:       make(map[string]*Collation),
		Desc:             "UTF-8 Unicode",
		Maxlen:           4,
	},
	CharsetASCII: {
		Name:             CharsetASCII,
		DefaultCollation: CollationASCII,
		Collations:       make(map[string]*Collation),
		Desc:             "US ASCII",
		Maxlen:           1,
	},
	CharsetLatin1: {
		Name:             CharsetLatin1,
		DefaultCollation: CollationLatin1,
		Collations:       make(map[string]*Collation),
		Desc:             "Latin1",
		Maxlen:           1,
	},
	CharsetBin: {
		Name:             CharsetBin,
		DefaultCollation: CollationBin,
		Collations:       make(map[string]*Collation),
		Desc:             "binary",
		Maxlen:           1,
	},
	CharsetGBK: {
		Name:             CharsetGBK,
		DefaultCollation: CollationGBKBin,
		Collations:       make(map[string]*Collation),
		Desc:             "Chinese Internal Code Specification",
		Maxlen:           2,
	},
}

CharacterSetInfos is the table of supported charsets; every supported charset should have an entry in it.

View Source

// EncodingASCIIImpl is the package-level instance of encodingASCII; its
// underlying enc is encoding.Nop.
var EncodingASCIIImpl = &encodingASCII{encodingBase{enc: encoding.Nop}}

EncodingASCIIImpl is the instance of encodingASCII.

View Source

// EncodingBinImpl is the package-level instance of encodingBin; its
// underlying enc is encoding.Nop.
var EncodingBinImpl = &encodingBin{encodingBase{enc: encoding.Nop}}

EncodingBinImpl is the instance of encodingBin.

View Source

// EncodingGBKImpl is the package-level instance of encodingGBK; its
// underlying enc is a customGBK value rather than encoding.Nop.
var EncodingGBKImpl = &encodingGBK{encodingBase{enc: customGBK{}}}

EncodingGBKImpl is the instance of encodingGBK.

View Source

// EncodingLatin1Impl is the package-level instance of encodingLatin1. It is
// built on an encodingUTF8 value whose underlying enc is encoding.Nop.
var EncodingLatin1Impl = &encodingLatin1{encodingUTF8{encodingBase{enc: encoding.Nop}}}

EncodingLatin1Impl is the instance of encodingLatin1. TiDB uses the utf8 implementation for the latin1 charset for backward compatibility.

View Source

// EncodingUTF8Impl is the package-level instance of encodingUTF8; its
// underlying enc is encoding.Nop.
var EncodingUTF8Impl = &encodingUTF8{encodingBase{enc: encoding.Nop}}

EncodingUTF8Impl is the instance of encodingUTF8.

View Source

// EncodingUTF8MB3StrictImpl is the package-level instance of
// encodingUTF8MB3Strict. It is built on an encodingUTF8 value whose
// underlying enc is encoding.Nop.
var EncodingUTF8MB3StrictImpl = &encodingUTF8MB3Strict{
	encodingUTF8{
		encodingBase{
			enc: encoding.Nop,
		},
	},
}

EncodingUTF8MB3StrictImpl is the instance of encodingUTF8MB3Strict.

View Source

// ErrInvalidCharacterString is created in the parser error class from
// mysql.ErrInvalidCharacterString.
var ErrInvalidCharacterString = terror.ClassParser.NewStd(mysql.ErrInvalidCharacterString)

ErrInvalidCharacterString is returned when the string is invalid in the specific charset.

View Source

// GBKCase is a unicode.SpecialCase table for GBK. Every range carries an
// all-zero Delta, i.e. the upper, lower and title mappings of the listed code
// points are the code points themselves, so case conversion through this
// table leaves them unchanged.
// NOTE(review): presumably consumed by the GBK encoding's ToUpper/ToLower —
// confirm against the encodingGBK implementation.
var GBKCase = unicode.SpecialCase{
	unicode.CaseRange{Lo: 0x00E0, Hi: 0x00E1, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x00E8, Hi: 0x00EA, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x00EC, Hi: 0x00ED, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x00F2, Hi: 0x00F3, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x00F9, Hi: 0x00FA, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x00FC, Hi: 0x00FC, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x0101, Hi: 0x0101, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x0113, Hi: 0x0113, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x011B, Hi: 0x011B, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x012B, Hi: 0x012B, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x0144, Hi: 0x0144, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x0148, Hi: 0x0148, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x014D, Hi: 0x014D, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x016B, Hi: 0x016B, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x01CE, Hi: 0x01CE, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x01D0, Hi: 0x01D0, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x01D2, Hi: 0x01D2, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x01D4, Hi: 0x01D4, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x01D6, Hi: 0x01D6, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x01D8, Hi: 0x01D8, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x01DA, Hi: 0x01DA, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x01DC, Hi: 0x01DC, Delta: [unicode.MaxCase]rune{0, 0, 0}},
	unicode.CaseRange{Lo: 0x216A, Hi: 0x216B, Delta: [unicode.MaxCase]rune{0, 0, 0}},
}

GBKCase follows https://dev.mysql.com/worklog/task/?id=4583.

View Source

// TiFlashSupportedCharsets is a set (map with empty-struct values) keyed by
// charset name. It lists utf8, utf8mb4, ascii, latin1 and binary; gbk is not
// included.
var TiFlashSupportedCharsets = map[string]struct{}{
	CharsetUTF8:    {},
	CharsetUTF8MB4: {},
	CharsetASCII:   {},
	CharsetLatin1:  {},
	CharsetBin:     {},
}

TiFlashSupportedCharsets is a map which contains the charsets that TiFlash supports.

Functions ¶

func AddCharset ¶ added in v0.2.0

func AddCharset(c *Charset)

AddCharset adds a new charset. Use only when adding a custom charset to the parser.

func AddCollation ¶ added in v0.2.0

func AddCollation(c *Collation)

AddCollation adds a new collation. Use only when adding a custom collation to the parser.

func AddSupportedCollation ¶ added in v0.2.0

func AddSupportedCollation(c *Collation)

AddSupportedCollation adds a new collation into supportedCollations. Use only when adding a custom collation to the parser.

func CountValidBytes ¶ added in v0.2.0

func CountValidBytes(e Encoding, src []byte) int

CountValidBytes counts the first valid bytes in src that can be encoded to the current encoding.

func CountValidBytesDecode ¶ added in v0.2.0

func CountValidBytesDecode(e Encoding, src []byte) int

CountValidBytesDecode counts the first valid bytes in src that can be decoded to utf-8.

func GetCharsetInfoByID ¶

func GetCharsetInfoByID(coID int) (string, string, error)

GetCharsetInfoByID returns charset and collation for id as cs_number.

func GetDefaultCharsetAndCollate ¶

func GetDefaultCharsetAndCollate() (string, string)

GetDefaultCharsetAndCollate returns the default charset and collation.

func GetDefaultCollation ¶

func GetDefaultCollation(charset string) (string, error)

GetDefaultCollation returns the default collation for charset.

func GetDefaultCollationLegacy ¶ added in v0.2.0

func GetDefaultCollationLegacy(charset string) (string, error)

GetDefaultCollationLegacy is compatible with the charset support in old version parser.

func HackSlice ¶ added in v0.2.0

func HackSlice(s string) (b []byte)

HackSlice converts a string to a byte slice without copying. Use it at your own risk.

func HackString ¶ added in v0.2.0

func HackString(b []byte) (s string)

HackString converts slice to string without copy. Use it at your own risk.

func IsSupportedEncoding ¶ added in v0.2.0

func IsSupportedEncoding(charset string) bool

IsSupportedEncoding checks if the charset is fully supported.

func Lookup ¶

func Lookup(label string) (e encoding.Encoding, name string)

Lookup returns the encoding with the specified label, and its canonical name. It returns nil and the empty string if label is not one of the standard encodings for HTML. Matching is case-insensitive and ignores leading and trailing whitespace.

func NewCustomGBKEncoder ¶ added in v0.2.0

func NewCustomGBKEncoder() *encoding.Encoder

NewCustomGBKEncoder returns a custom GBK encoder.

func RemoveCharset ¶ added in v0.2.0

func RemoveCharset(c string)

RemoveCharset removes a charset. Use only when removing a custom charset from the parser.

func ValidCharsetAndCollation ¶

func ValidCharsetAndCollation(cs string, co string) bool

ValidCharsetAndCollation checks the charset and the collation validity and returns a boolean.

Types ¶

type Charset ¶

// Charset describes one character set: its name, default collation, known
// collations, a description and a maximum length value.
type Charset struct {
	Name             string                // charset name, e.g. CharsetUTF8
	DefaultCollation string                // name of the charset's default collation, e.g. CollationUTF8
	Collations       map[string]*Collation // collations of this charset; presumably keyed by collation name — confirm
	Desc             string                // human-readable description, e.g. "UTF-8 Unicode"
	Maxlen           int                   // presumably the maximum bytes per character (3 for utf8, 4 for utf8mb4) — confirm
}

Charset is a charset. Now we only support MySQL.

func GetCharsetInfo ¶

func GetCharsetInfo(cs string) (*Charset, error)

GetCharsetInfo returns charset and collation for cs as name.

func GetSupportedCharsets ¶

func GetSupportedCharsets() []*Charset

GetSupportedCharsets gets descriptions for all charsets supported so far.

type Collation ¶

// Collation describes one collation and names the charset it belongs to.
type Collation struct {
	ID          int    // numeric collation id, the key accepted by GetCollationByID
	CharsetName string // name of the charset this collation belongs to
	Name        string // collation name, the key accepted by GetCollationByName
	IsDefault   bool   // presumably true when this is its charset's default collation — confirm
}

Collation is a collation. Now we only support MySQL.

func GetCollationByID ¶ added in v0.2.0

func GetCollationByID(id int) (*Collation, error)

GetCollationByID returns the collation with the given id.

func GetCollationByName ¶

func GetCollationByName(name string) (*Collation, error)

func GetCollations ¶

func GetCollations() []*Collation

GetCollations returns a list for all collations.

func GetSupportedCollations ¶

func GetSupportedCollations() []*Collation

GetSupportedCollations gets information for all collations supported so far.

type Encoding ¶ added in v0.2.0

// Encoding provides encode/decode functions for a string with a specific
// charset.
type Encoding interface {
	// Name is the name of the encoding.
	Name() string
	// Tp is the type of the encoding.
	Tp() EncodingTp
	// Peek returns the next char.
	Peek(src []byte) []byte
	// MbLen returns the multi-byte length of the next character; if the next
	// character is a single byte, it returns 0.
	MbLen(string) int
	// IsValid checks whether the utf-8 bytes can be converted to a valid string in the current encoding.
	IsValid(src []byte) bool
	// Foreach iterates over the characters in the current encoding.
	Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool)
	// Transform maps the bytes in src to dest according to Op.
	// **The caller should initialize dest if it wants to avoid a memory allocation on every call; otherwise a new buffer is always made.**
	// **The returned slice may alias `src`; edit the returned slice at your own risk.**
	Transform(dest *bytes.Buffer, src []byte, op Op) ([]byte, error)
	// ToUpper changes a string to uppercase.
	ToUpper(src string) string
	// ToLower changes a string to lowercase.
	ToLower(src string) string
}

Encoding provides encode/decode functions for a string with a specific charset.

func FindEncoding ¶ added in v0.2.0

func FindEncoding(charset string) Encoding

FindEncoding finds the encoding according to charset.

func FindEncodingTakeUTF8AsNoop ¶ added in v0.2.0

func FindEncodingTakeUTF8AsNoop(charset string) Encoding

FindEncodingTakeUTF8AsNoop finds the encoding according to the charset except that utf-8 is treated as no-operation encoding. This is used to reduce the overhead of utf-8 validation in some cases.

type EncodingTp ¶ added in v0.2.0

// EncodingTp is the type of an encoding, as returned by Encoding.Tp.
type EncodingTp int8

// Known EncodingTp values, numbered from zero via iota. EncodingTpNone is the
// zero value.
const (
	EncodingTpNone EncodingTp = iota
	EncodingTpUTF8
	EncodingTpUTF8MB3Strict
	EncodingTpASCII
	EncodingTpLatin1
	EncodingTpBin
	EncodingTpGBK
)

type Op ¶ added in v0.2.0

// Op is the operation argument accepted by Encoding.Transform and
// Encoding.Foreach; the exported Op* constants are the predefined values.
type Op int16

Op is used by Encoding.Transform.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL