Documentation ¶
Overview ¶
Package cpd - code page detect (c) 2020 softlandia@gmail.com
Index ¶
- Variables
- func CodepageAsString(codepage IDCodePage) string
- func DecodeUTF16be(s string) string
- func DecodeUTF16le(s string) string
- func FileConvertCodepage(fileName string, fromCP, toCP IDCodePage) error
- func IsSeparator(r rune) bool
- func NewReader(r io.Reader, cpn ...string) (io.Reader, error)
- func NewReaderTo(r io.Reader, cpn string) (io.Reader, error)
- func StrConvertCodepage(s string, fromCP, toCP IDCodePage) (string, error)
- func SupportedEncoder(cpn string) bool
- func ValidUTF8(data []byte) bool
- type CodePage
- type IDCodePage
- type MatchRes
- type TCodepagesDic
Constants ¶
This section is empty.
Variables ¶
var Boms = []struct { Bom []byte id IDCodePage }{ {[]byte{0xef, 0xbb, 0xbf}, UTF8}, {[]byte{0x00, 0x00, 0xfe, 0xff}, UTF32BE}, {[]byte{0xff, 0xfe, 0x00, 0x00}, UTF32LE}, {[]byte{0xfe, 0xff}, UTF16BE}, {[]byte{0xff, 0xfe}, UTF16LE}, }
Boms - byte order marks (BOM): the special leading bytes that identify each Unicode encoding
var ReadBufSize int = 1024
ReadBufSize - byte count for reading from file, see func FileCodepageDetect()
Functions ¶
func CodepageAsString ¶
func CodepageAsString(codepage IDCodePage) string
CodepageAsString - returns the name of the character set with the given codepage id; if the codepage does not exist, returns ""
func DecodeUTF16be ¶
DecodeUTF16be - decodes a string of UTF-16BE (big-endian) bytes to UTF-8
func DecodeUTF16le ¶
DecodeUTF16le - decodes a string of UTF-16LE (little-endian) bytes to UTF-8
func FileConvertCodepage ¶
func FileConvertCodepage(fileName string, fromCP, toCP IDCodePage) error
FileConvertCodepage - converts a text file from one code page to another; only conversion from/to Windows1251/IBM866 is supported
func IsSeparator ¶
IsSeparator - return true if input rune is SPACE or PUNCT
func NewReader ¶
NewReader - conversion to UTF-8. Returns the input reader if the input contains fewer than 4 bytes. Returns the input reader if the input contains ASCII data. If cpn[0] exists, it is used as the input codepage name.
func NewReaderTo ¶
NewReaderTo - creates a new reader encoding from UTF-8 to the specified codepage. Returns the input reader and an error if the output codepage is not found or the encoding is unsupported. If the input contains the BOM char, the BOM is deleted.
func StrConvertCodepage ¶
func StrConvertCodepage(s string, fromCP, toCP IDCodePage) (string, error)
StrConvertCodepage - converts a string from one code page to another. The function is intended for future extension; at present it supports conversion only from/to Windows1251/IBM866.
func SupportedEncoder ¶
SupportedEncoder - check codepage name
Types ¶
type CodePage ¶
type CodePage struct { NumByte byte //number of byte using in codepage MatchRes //count of matching Boms []byte //default BOM for this codepage // contains filtered or unexported fields }
CodePage - implements a code page
func (CodePage) FirstAlphabetPos ¶
FirstAlphabetPos - returns the position of the first alphabetic character of this code page encountered in the sorted array
func (CodePage) MatchingRunes ¶
MatchingRunes - return string with rune/counts
type IDCodePage ¶
type IDCodePage uint16
IDCodePage - index of a code page; implements the String() interface
const ( // ASCII is the uint16 identifier with IANA name US-ASCII (MIME: US-ASCII). // ANSI X3.4-1986 // Reference: RFC2046 ASCII IDCodePage = 3 // ISOLatinCyrillic is the MIB identifier with IANA name ISO_8859-5:1988 (MIME: ISO-8859-5). // // ISO-IR: International Register of Escape Sequences // Note: The current registration authority is IPSJ/ITSCJ, Japan. // Reference: RFC1345 ISOLatinCyrillic IDCodePage = 8 // UTF8 is the uint16 identifier with IANA name UTF-8. // // rfc3629 // Reference: RFC3629 UTF8 IDCodePage = 106 // Unicode is the uint16 identifier with IANA name ISO-10646-UCS-2. // // the 2-octet Basic Multilingual Plane, aka Unicode // this needs to specify network byte order: the standard // does not specify (it is a 16-bit integer space) Unicode IDCodePage = 1000 // UnicodeASCII is the uint16 identifier with IANA name ISO-10646-UCS-Basic. // // ASCII subset of Unicode. Basic Latin = collection 1 // See ISO 10646, Appendix A UnicodeASCII IDCodePage = 1002 // UTF7 is the uint16 identifier with IANA name UTF-7. // // rfc2152 // Reference: RFC2152 UTF7 IDCodePage = 1012 // UTF16BE is the uint16 identifier with IANA name UTF-16BE. // // rfc2781 // Reference: RFC2781 UTF16BE IDCodePage = 1013 // UTF16LE is the uint16 identifier with IANA name UTF-16LE. // // rfc2781 // Reference: RFC2781 UTF16LE IDCodePage = 1014 // UTF32 is the uint16 identifier with IANA name UTF-32. // // https://www.unicode.org/unicode/reports/tr19/ UTF32 IDCodePage = 1017 // UTF32BE is the uint16 identifier with IANA name UTF-32BE. // // https://www.unicode.org/unicode/reports/tr19/ UTF32BE IDCodePage = 1018 // UTF32LE is the uint16 identifier with IANA name UTF-32LE. // // https://www.unicode.org/unicode/reports/tr19/ UTF32LE IDCodePage = 1019 // KOI8R is the uint16 identifier with IANA name KOI8-R (MIME: KOI8-R). // // rfc1489 , based on GOST-19768-74, ISO-6937/8, // INIS-Cyrillic, ISO-5427. 
// Reference: RFC1489 KOI8R IDCodePage = 2084 // CP866 is the uint16 identifier with IANA name IBM866. // // IBM NLDG Volume 2 (SE09-8002-03) August 1994 CP866 IDCodePage = 2086 // CP1251 is the uint16 identifier with IANA name windows-1251. // // Microsoft http://www.iana.org/assignments/charset-reg/windows-1251 CP1251 IDCodePage = 2251 // Windows1252 is the uint16 identifier with IANA name windows-1252. // // Microsoft http://www.iana.org/assignments/charset-reg/windows-1252 Windows1252 IDCodePage = 2252 )
func CheckBOM ¶
func CheckBOM(buf []byte) (id IDCodePage, res bool)
CheckBOM - check buffer for match to utf-8, utf-16le or utf-16be BOM
func CodepageAutoDetect ¶
func CodepageAutoDetect(b []byte) IDCodePage
CodepageAutoDetect - auto detect code page of input content
func CodepageDetect ¶
func CodepageDetect(r io.Reader) (IDCodePage, error)
CodepageDetect - detect code page of ascii data from reader 'r'
func FileCodepageDetect ¶
func FileCodepageDetect(fn string, stopStr ...string) (IDCodePage, error)
FileCodepageDetect - detect codepage of text file
func (IDCodePage) BomLen ¶
func (i IDCodePage) BomLen() int
BomLen - returns the length in bytes of the BOM for this codepage; for a codepage that has no BOM, returns 0
func (IDCodePage) DeleteBom ¶
func (i IDCodePage) DeleteBom(s string) (res string)
DeleteBom - return string without prefix bom bytes
func (IDCodePage) DeleteBomFromReader ¶
func (i IDCodePage) DeleteBomFromReader(r io.Reader) io.Reader
DeleteBomFromReader - return reader after removing BOM from it
func (IDCodePage) ReaderHasBom ¶
func (i IDCodePage) ReaderHasBom(r io.Reader) bool
ReaderHasBom - checks whether the reader has a BOM prefix
func (IDCodePage) String ¶
func (i IDCodePage) String() string
func (IDCodePage) StringHasBom ¶
func (i IDCodePage) StringHasBom(s string) bool
StringHasBom - return true if input string has BOM prefix
type MatchRes ¶
type MatchRes struct {
// contains filtered or unexported fields
}
MatchRes - result criteria: countMatch - the number of letters found in the text; countCvPairs - the number of consonant+vowel pairs
type TCodepagesDic ¶
type TCodepagesDic map[IDCodePage]CodePage
TCodepagesDic - type to store all supported code page
func NewCodepageDic ¶
func NewCodepageDic() TCodepagesDic
NewCodepageDic - create a new map by copying the global
func (TCodepagesDic) Match ¶
func (o TCodepagesDic) Match(data []byte) (result IDCodePage)
Match - returns the id of the code page that the data best matches; calls the match function of each codepage