// Copyright 2013 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // +build ignore // Language tag table generator. // Data read from the web. package main import ( "flag" "fmt" "io" "log" "sort" "strconv" "strings" "golang.org/x/text/internal/gen" "golang.org/x/text/internal/language" "golang.org/x/text/unicode/cldr" ) var ( test = flag.Bool("test", false, "test existing tables; can be used to compare web data with package data.") outputFile = flag.String("output", "tables.go", "output file for generated tables") ) func main() { gen.Init() w := gen.NewCodeWriter() defer w.WriteGoFile("tables.go", "language") b := newBuilder(w) gen.WriteCLDRVersion(w) b.writeConstants() b.writeMatchData() } type builder struct { w *gen.CodeWriter hw io.Writer // MultiWriter for w and w.Hash data *cldr.CLDR supp *cldr.SupplementalData } func (b *builder) langIndex(s string) uint16 { return uint16(language.MustParseBase(s)) } func (b *builder) regionIndex(s string) int { return int(language.MustParseRegion(s)) } func (b *builder) scriptIndex(s string) int { return int(language.MustParseScript(s)) } func newBuilder(w *gen.CodeWriter) *builder { r := gen.OpenCLDRCoreZip() defer r.Close() d := &cldr.Decoder{} data, err := d.DecodeZip(r) if err != nil { log.Fatal(err) } b := builder{ w: w, hw: io.MultiWriter(w, w.Hash), data: data, supp: data.Supplemental(), } return &b } // writeConsts computes f(v) for all v in values and writes the results // as constants named _v to a single constant block. func (b *builder) writeConsts(f func(string) int, values ...string) { fmt.Fprintln(b.w, "const (") for _, v := range values { fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v)) } fmt.Fprintln(b.w, ")") } // TODO: region inclusion data will probably not be use used in future matchers. var langConsts = []string{ "de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und", } var scriptConsts = []string{ "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy", "Zzzz", } var regionConsts = []string{ "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US", "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo. } func (b *builder) writeConstants() { b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) b.writeConsts(b.regionIndex, regionConsts...) b.writeConsts(b.scriptIndex, scriptConsts...) } type mutualIntelligibility struct { want, have uint16 distance uint8 oneway bool } type scriptIntelligibility struct { wantLang, haveLang uint16 wantScript, haveScript uint8 distance uint8 // Always oneway } type regionIntelligibility struct { lang uint16 // compact language id script uint8 // 0 means any group uint8 // 0 means any; if bit 7 is set it means inverse distance uint8 // Always twoway. } // writeMatchData writes tables with languages and scripts for which there is // mutual intelligibility. The data is based on CLDR's languageMatching data. // Note that we use a different algorithm than the one defined by CLDR and that // we slightly modify the data. For example, we convert scores to confidence levels. // We also drop all region-related data as we use a different algorithm to // determine region equivalence. func (b *builder) writeMatchData() { lm := b.supp.LanguageMatching.LanguageMatches cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new") regionHierarchy := map[string][]string{} for _, g := range b.supp.TerritoryContainment.Group { regions := strings.Split(g.Contains, " ") regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...) } regionToGroups := make([]uint8, language.NumRegions) idToIndex := map[string]uint8{} for i, mv := range lm[0].MatchVariable { if i > 6 { log.Fatalf("Too many groups: %d", i) } idToIndex[mv.Id] = uint8(i + 1) // TODO: also handle '-' for _, r := range strings.Split(mv.Value, "+") { todo := []string{r} for k := 0; k < len(todo); k++ { r := todo[k] regionToGroups[b.regionIndex(r)] |= 1 << uint8(i) todo = append(todo, regionHierarchy[r]...) } } } b.w.WriteVar("regionToGroups", regionToGroups) // maps language id to in- and out-of-group region. paradigmLocales := [][3]uint16{} locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ") for i := 0; i < len(locales); i += 2 { x := [3]uint16{} for j := 0; j < 2; j++ { pc := strings.SplitN(locales[i+j], "-", 2) x[0] = b.langIndex(pc[0]) if len(pc) == 2 { x[1+j] = uint16(b.regionIndex(pc[1])) } } paradigmLocales = append(paradigmLocales, x) } b.w.WriteVar("paradigmLocales", paradigmLocales) b.w.WriteType(mutualIntelligibility{}) b.w.WriteType(scriptIntelligibility{}) b.w.WriteType(regionIntelligibility{}) matchLang := []mutualIntelligibility{} matchScript := []scriptIntelligibility{} matchRegion := []regionIntelligibility{} // Convert the languageMatch entries in lists keyed by desired language. for _, m := range lm[0].LanguageMatch { // Different versions of CLDR use different separators. desired := strings.Replace(m.Desired, "-", "_", -1) supported := strings.Replace(m.Supported, "-", "_", -1) d := strings.Split(desired, "_") s := strings.Split(supported, "_") if len(d) != len(s) { log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) continue } distance, _ := strconv.ParseInt(m.Distance, 10, 8) switch len(d) { case 2: if desired == supported && desired == "*_*" { continue } // language-script pair. matchScript = append(matchScript, scriptIntelligibility{ wantLang: uint16(b.langIndex(d[0])), haveLang: uint16(b.langIndex(s[0])), wantScript: uint8(b.scriptIndex(d[1])), haveScript: uint8(b.scriptIndex(s[1])), distance: uint8(distance), }) if m.Oneway != "true" { matchScript = append(matchScript, scriptIntelligibility{ wantLang: uint16(b.langIndex(s[0])), haveLang: uint16(b.langIndex(d[0])), wantScript: uint8(b.scriptIndex(s[1])), haveScript: uint8(b.scriptIndex(d[1])), distance: uint8(distance), }) } case 1: if desired == supported && desired == "*" { continue } if distance == 1 { // nb == no is already handled by macro mapping. Check there // really is only this case. if d[0] != "no" || s[0] != "nb" { log.Fatalf("unhandled equivalence %s == %s", s[0], d[0]) } continue } // TODO: consider dropping oneway field and just doubling the entry. matchLang = append(matchLang, mutualIntelligibility{ want: uint16(b.langIndex(d[0])), have: uint16(b.langIndex(s[0])), distance: uint8(distance), oneway: m.Oneway == "true", }) case 3: if desired == supported && desired == "*_*_*" { continue } if desired != supported { // This is now supported by CLDR, but only one case, which // should already be covered by paradigm locales. For instance, // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in // testdata/CLDRLocaleMatcherTest.txt tests this. if supported != "en_*_GB" { log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) } continue } ri := regionIntelligibility{ lang: b.langIndex(d[0]), distance: uint8(distance), } if d[1] != "*" { ri.script = uint8(b.scriptIndex(d[1])) } switch { case d[2] == "*": ri.group = 0x80 // not contained in anything case strings.HasPrefix(d[2], "$!"): ri.group = 0x80 d[2] = "$" + d[2][len("$!"):] fallthrough case strings.HasPrefix(d[2], "$"): ri.group |= idToIndex[d[2]] } matchRegion = append(matchRegion, ri) default: log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) } } sort.SliceStable(matchLang, func(i, j int) bool { return matchLang[i].distance < matchLang[j].distance }) b.w.WriteComment(` matchLang holds pairs of langIDs of base languages that are typically mutually intelligible. Each pair is associated with a confidence and whether the intelligibility goes one or both ways.`) b.w.WriteVar("matchLang", matchLang) b.w.WriteComment(` matchScript holds pairs of scriptIDs where readers of one script can typically also read the other. Each is associated with a confidence.`) sort.SliceStable(matchScript, func(i, j int) bool { return matchScript[i].distance < matchScript[j].distance }) b.w.WriteVar("matchScript", matchScript) sort.SliceStable(matchRegion, func(i, j int) bool { return matchRegion[i].distance < matchRegion[j].distance }) b.w.WriteVar("matchRegion", matchRegion) }