Documentation ¶
Overview ¶
Package langdet detects natural languages in text.
Index ¶
Constants ¶
This section is empty.
Variables ¶
var ( BelarusianTag = language.Make("be") BosnianTag = language.Make("bs") IrishTag = language.Make("ga") JavaneseTag = language.Make("jv") LatinTag = language.Make("la") LuxembourgishTag = language.Make("lb") MalteseTag = language.Make("mt") MyanmarTag = language.Make("my") OriyaTag = language.Make("or") SundaneseTag = language.Make("su") TibetanTag = language.Make("bo") )
Tags for languages missing from golang.org/x/text/language.
var Albanian = Language{ Tag: language.Albanian, Trigrams: _AlbanianTrigrams, }
Albanian profiles the Albanian language.
var Belarusian = Language{ Tag: BelarusianTag, Trigrams: _BelarusianTrigrams, }
Belarusian profiles the Belarusian language.
var Bosnian = Language{ Tag: BosnianTag, Trigrams: _BosnianTrigrams, }
Bosnian profiles the Bosnian language.
var Bulgarian = Language{ Tag: language.Bulgarian, Trigrams: _BulgarianTrigrams, }
Bulgarian profiles the Bulgarian language.
var Croatian = Language{ Tag: language.Croatian, Trigrams: _CroatianTrigrams, }
Croatian profiles the Croatian language.
var Czech = Language{ Tag: language.Czech, Trigrams: _CzechTrigrams, }
Czech profiles the Czech language.
var Danish = Language{ Tag: language.Danish, Trigrams: _DanishTrigrams, }
Danish profiles the Danish language.
var DefaultOptions = Options{ Languages: map[*unicode.RangeTable]Languages{ unicode.Arabic: { DefaultTag: language.Arabic, }, unicode.Armenian: { DefaultTag: language.Armenian, }, unicode.Bengali: { DefaultTag: language.Bengali, }, unicode.Cyrillic: { Languages: []Language{ Belarusian, Bulgarian, Macedonian, Russian, Serbian, Ukrainian, }, }, unicode.Devanagari: { DefaultTag: language.Hindi, }, unicode.Ethiopic: { DefaultTag: language.Amharic, }, unicode.Javanese: { DefaultTag: JavaneseTag, }, unicode.Latin: { Languages: []Language{ Albanian, Bosnian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Hungarian, Icelandic, Irish, Italian, Latin, Latvian, Lithuanian, Luxembourgish, Maltese, NorwegianBokmål, NorwegianNynorsk, Polish, Portuguese, Romanian, Spanish, Slovak, Slovenian, Swedish, Turkish, }, }, unicode.Georgian: { DefaultTag: language.Georgian, }, unicode.Greek: { DefaultTag: language.Greek, }, unicode.Gujarati: { DefaultTag: language.Gujarati, }, unicode.Gurmukhi: { DefaultTag: language.Punjabi, }, unicode.Han: { DefaultTag: language.Chinese, }, unicode.Hangul: { DefaultTag: language.Korean, }, unicode.Hebrew: { DefaultTag: language.Hebrew, }, HiraganaKatakana: { DefaultTag: language.Japanese, }, unicode.Kannada: { DefaultTag: language.Kannada, }, unicode.Khmer: { DefaultTag: language.Khmer, }, unicode.Lao: { DefaultTag: language.Lao, }, unicode.Malayalam: { DefaultTag: language.Malayalam, }, unicode.Myanmar: { DefaultTag: MyanmarTag, }, unicode.Oriya: { DefaultTag: OriyaTag, }, unicode.Sinhala: { DefaultTag: language.Sinhala, }, unicode.Sundanese: { DefaultTag: SundaneseTag, }, unicode.Tamil: { DefaultTag: language.Tamil, }, unicode.Telugu: { DefaultTag: language.Telugu, }, unicode.Thai: { DefaultTag: language.Thai, }, unicode.Tibetan: { DefaultTag: TibetanTag, }, }, Scripts: []*unicode.RangeTable{ unicode.Latin, unicode.Han, unicode.Arabic, unicode.Devanagari, unicode.Bengali, unicode.Cyrillic, HiraganaKatakana, 
unicode.Javanese, unicode.Hangul, unicode.Telugu, unicode.Tamil, unicode.Gujarati, unicode.Kannada, unicode.Myanmar, unicode.Malayalam, unicode.Thai, unicode.Sundanese, unicode.Gurmukhi, unicode.Lao, unicode.Oriya, unicode.Ethiopic, unicode.Sinhala, unicode.Hebrew, unicode.Armenian, unicode.Khmer, unicode.Greek, unicode.Tibetan, unicode.Georgian, }, }
DefaultOptions is a default set of options that detects the most commonly used languages worldwide.
var Dutch = Language{ Tag: language.Dutch, Trigrams: _DutchTrigrams, }
Dutch profiles the Dutch language.
var English = Language{ Tag: language.English, Trigrams: _EnglishTrigrams, }
English profiles the English language.
var Estonian = Language{ Tag: language.Estonian, Trigrams: _EstonianTrigrams, }
Estonian profiles the Estonian language.
var Finnish = Language{ Tag: language.Finnish, Trigrams: _FinnishTrigrams, }
Finnish profiles the Finnish language.
var French = Language{ Tag: language.French, Trigrams: _FrenchTrigrams, }
French profiles the French language.
var German = Language{ Tag: language.German, Trigrams: _GermanTrigrams, }
German profiles the German language.
var HiraganaKatakana = &unicode.RangeTable{ R16: append(unicode.Hiragana.R16, unicode.Katakana.R16...), R32: append(unicode.Hiragana.R32, unicode.Katakana.R32...), }
HiraganaKatakana is the Unicode range table that combines the Hiragana and Katakana scripts used to write Japanese.
var Hungarian = Language{ Tag: language.Hungarian, Trigrams: _HungarianTrigrams, }
Hungarian profiles the Hungarian language.
var Icelandic = Language{ Tag: language.Icelandic, Trigrams: _IcelandicTrigrams, }
Icelandic profiles the Icelandic language.
var Irish = Language{ Tag: IrishTag, Trigrams: _IrishTrigrams, }
Irish profiles the Irish language.
var Italian = Language{ Tag: language.Italian, Trigrams: _ItalianTrigrams, }
Italian profiles the Italian language.
var Latin = Language{ Tag: LatinTag, Trigrams: _LatinTrigrams, }
Latin profiles the Latin language.
var Latvian = Language{ Tag: language.Latvian, Trigrams: _LatvianTrigrams, }
Latvian profiles the Latvian language.
var Lithuanian = Language{ Tag: language.Lithuanian, Trigrams: _LithuanianTrigrams, }
Lithuanian profiles the Lithuanian language.
var Luxembourgish = Language{ Tag: LuxembourgishTag, Trigrams: _LuxembourgishTrigrams, }
Luxembourgish profiles the Luxembourgish language.
var Macedonian = Language{ Tag: language.Macedonian, Trigrams: _MacedonianTrigrams, }
Macedonian profiles the Macedonian language.
var Maltese = Language{ Tag: MalteseTag, Trigrams: _MalteseTrigrams, }
Maltese profiles the Maltese language.
var NorwegianBokmål = Language{ Tag: language.Norwegian, Trigrams: _NorwegianBokmålTrigrams, }
NorwegianBokmål profiles the Norwegian Bokmål language.
var NorwegianNynorsk = Language{ Tag: language.Norwegian, Trigrams: _NorwegianNynorskTrigrams, }
NorwegianNynorsk profiles the Norwegian Nynorsk language.
var Polish = Language{ Tag: language.Polish, Trigrams: _PolishTrigrams, }
Polish profiles the Polish language.
var Portuguese = Language{ Tag: language.Portuguese, Trigrams: _PortugueseTrigrams, }
Portuguese profiles the Portuguese language.
var Romanian = Language{ Tag: language.Romanian, Trigrams: _RomanianTrigrams, }
Romanian profiles the Romanian language.
var Russian = Language{ Tag: language.Russian, Trigrams: _RussianTrigrams, }
Russian profiles the Russian language.
var Serbian = Language{ Tag: language.Serbian, Trigrams: _SerbianTrigrams, }
Serbian profiles the Serbian language.
var Slovak = Language{ Tag: language.Slovak, Trigrams: _SlovakTrigrams, }
Slovak profiles the Slovak language.
var Slovenian = Language{ Tag: language.Slovenian, Trigrams: _SlovenianTrigrams, }
Slovenian profiles the Slovenian language.
var Spanish = Language{ Tag: language.Spanish, Trigrams: _SpanishTrigrams, }
Spanish profiles the Spanish language.
var Swedish = Language{ Tag: language.Swedish, Trigrams: _SwedishTrigrams, }
Swedish profiles the Swedish language.
var Turkish = Language{ Tag: language.Turkish, Trigrams: _TurkishTrigrams, }
Turkish profiles the Turkish language.
var Ukrainian = Language{ Tag: language.Ukrainian, Trigrams: _UkrainianTrigrams, }
Ukrainian profiles the Ukrainian language.
Functions ¶
func DetectLanguage ¶
DetectLanguage is a shorthand that calls DetectLanguageWithOptions with the default options and returns the best detected language.
func DetectScript ¶
func DetectScript(s string, scripts []*unicode.RangeTable) *unicode.RangeTable
DetectScript detects the dominant writing script of s among the given candidate scripts.
Types ¶
type Language ¶
type Language struct { // Tag is the BCP 47 language tag. Tag language.Tag // Trigrams is the trigrams profile created by Train. Trigrams []Trigram }
Language profiles a natural language.
type Languages ¶
type Languages struct { // DefaultTag is the default language tag used if Languages is empty. DefaultTag language.Tag // Languages is the set of languages sharing the same writing script. // If this is empty or nil, the detected language is always DefaultTag. Languages []Language }
Languages is a set of languages that share the same writing script.
type Options ¶
type Options struct { // Scripts is the set of writing scripts to detect. Scripts []*unicode.RangeTable // Languages maps writing systems to a set of languages. Languages map[*unicode.RangeTable]Languages // MinConfidence is the minimum confidence that must be met // before DetectLanguage returns the detected language. MinConfidence float64 // MinRelConfidence is the minimum confidence difference // that must be met between detected languages. // Languages that do not meet the minimum are filtered from the result. MinRelConfidence float64 }
Options configures the language detector.
type Result ¶
type Result struct { // Tag is the detected language. Tag language.Tag // Confidence is the probability that this language is correct, between 0 and 1. Confidence float64 }
Result holds a detected language and confidence.
func DetectLanguageWithOptions ¶
DetectLanguageWithOptions detects the language of s configured by options. It returns a set of candidate languages ordered by confidence level. At least one result is always returned.
type Trigram ¶
type Trigram [3]rune
Trigram is a sequence of three Unicode code points (runes).
func (Trigram) MarshalText ¶
MarshalText implements encoding.TextMarshaler.
func (*Trigram) UnmarshalText ¶
UnmarshalText implements encoding.TextUnmarshaler.
Source Files ¶
- default.go
- lang_bel.go
- lang_bos.go
- lang_bul.go
- lang_ces.go
- lang_dan.go
- lang_deu.go
- lang_eng.go
- lang_est.go
- lang_fin.go
- lang_fra.go
- lang_gle.go
- lang_hrv.go
- lang_hun.go
- lang_isl.go
- lang_ita.go
- lang_lat.go
- lang_lav.go
- lang_lit.go
- lang_ltz.go
- lang_mkd.go
- lang_mlt.go
- lang_nld.go
- lang_nno.go
- lang_nob.go
- lang_pol.go
- lang_por.go
- lang_ron.go
- lang_rus.go
- lang_slk.go
- lang_slv.go
- lang_spa.go
- lang_sqi.go
- lang_srp.go
- lang_swe.go
- lang_tur.go
- lang_ukr.go
- langdet.go
- scriptdet.go
- trigram.go