// Copyright 2012 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // TODO: remove hard-coded versions when we have implemented fractional weights. // The current implementation is incompatible with later CLDR versions. //go:generate go run maketables.go -cldr=23 -unicode=6.2.0 // Package collate contains types for comparing and sorting Unicode strings // according to a given collation order. package collate // import "golang.org/x/text/collate" import ( "bytes" "strings" "golang.org/x/text/internal/colltab" "golang.org/x/text/language" ) // Collator provides functionality for comparing strings for a given // collation order. type Collator struct { options sorter sorter _iter [2]iter } func (c *Collator) iter(i int) *iter { // TODO: evaluate performance for making the second iterator optional. return &c._iter[i] } // Supported returns the list of languages for which collating differs from its parent. func Supported() []language.Tag { // TODO: use language.Coverage instead. t := make([]language.Tag, len(tags)) copy(t, tags) return t } func init() { ids := strings.Split(availableLocales, ",") tags = make([]language.Tag, len(ids)) for i, s := range ids { tags[i] = language.Raw.MustParse(s) } } var tags []language.Tag // New returns a new Collator initialized for the given locale. func New(t language.Tag, o ...Option) *Collator { index := colltab.MatchLang(t, tags) c := newCollator(getTable(locales[index])) // Set options from the user-supplied tag. c.setFromTag(t) // Set the user-supplied options. c.setOptions(o) c.init() return c } // NewFromTable returns a new Collator for the given Weighter. func NewFromTable(w colltab.Weighter, o ...Option) *Collator { c := newCollator(w) c.setOptions(o) c.init() return c } func (c *Collator) init() { if c.numeric { c.t = colltab.NewNumericWeighter(c.t) } c._iter[0].init(c) c._iter[1].init(c) } // Buffer holds keys generated by Key and KeyString. type Buffer struct { buf [4096]byte key []byte } func (b *Buffer) init() { if b.key == nil { b.key = b.buf[:0] } } // Reset clears the buffer from previous results generated by Key and KeyString. func (b *Buffer) Reset() { b.key = b.key[:0] } // Compare returns an integer comparing the two byte slices. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. func (c *Collator) Compare(a, b []byte) int { // TODO: skip identical prefixes once we have a fast way to detect if a rune is // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. c.iter(0).SetInput(a) c.iter(1).SetInput(b) if res := c.compare(); res != 0 { return res } if !c.ignore[colltab.Identity] { return bytes.Compare(a, b) } return 0 } // CompareString returns an integer comparing the two strings. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. func (c *Collator) CompareString(a, b string) int { // TODO: skip identical prefixes once we have a fast way to detect if a rune is // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. c.iter(0).SetInputString(a) c.iter(1).SetInputString(b) if res := c.compare(); res != 0 { return res } if !c.ignore[colltab.Identity] { if a < b { return -1 } else if a > b { return 1 } } return 0 } func compareLevel(f func(i *iter) int, a, b *iter) int { a.pce = 0 b.pce = 0 for { va := f(a) vb := f(b) if va != vb { if va < vb { return -1 } return 1 } else if va == 0 { break } } return 0 } func (c *Collator) compare() int { ia, ib := c.iter(0), c.iter(1) // Process primary level if c.alternate != altShifted { // TODO: implement script reordering if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 { return res } } else { // TODO: handle shifted } if !c.ignore[colltab.Secondary] { f := (*iter).nextSecondary if c.backwards { f = (*iter).prevSecondary } if res := compareLevel(f, ia, ib); res != 0 { return res } } // TODO: special case handling (Danish?) if !c.ignore[colltab.Tertiary] || c.caseLevel { if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 { return res } if !c.ignore[colltab.Quaternary] { if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 { return res } } } return 0 } // Key returns the collation key for str. // Passing the buffer buf may avoid memory allocations. // The returned slice will point to an allocation in Buffer and will remain // valid until the next call to buf.Reset(). func (c *Collator) Key(buf *Buffer, str []byte) []byte { // See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details. buf.init() return c.key(buf, c.getColElems(str)) } // KeyFromString returns the collation key for str. // Passing the buffer buf may avoid memory allocations. // The returned slice will point to an allocation in Buffer and will retain // valid until the next call to buf.ResetKeys(). func (c *Collator) KeyFromString(buf *Buffer, str string) []byte { // See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details. buf.init() return c.key(buf, c.getColElemsString(str)) } func (c *Collator) key(buf *Buffer, w []colltab.Elem) []byte { processWeights(c.alternate, c.t.Top(), w) kn := len(buf.key) c.keyFromElems(buf, w) return buf.key[kn:] } func (c *Collator) getColElems(str []byte) []colltab.Elem { i := c.iter(0) i.SetInput(str) for i.Next() { } return i.Elems } func (c *Collator) getColElemsString(str string) []colltab.Elem { i := c.iter(0) i.SetInputString(str) for i.Next() { } return i.Elems } type iter struct { wa [512]colltab.Elem colltab.Iter pce int } func (i *iter) init(c *Collator) { i.Weighter = c.t i.Elems = i.wa[:0] } func (i *iter) nextPrimary() int { for { for ; i.pce < i.N; i.pce++ { if v := i.Elems[i.pce].Primary(); v != 0 { i.pce++ return v } } if !i.Next() { return 0 } } panic("should not reach here") } func (i *iter) nextSecondary() int { for ; i.pce < len(i.Elems); i.pce++ { if v := i.Elems[i.pce].Secondary(); v != 0 { i.pce++ return v } } return 0 } func (i *iter) prevSecondary() int { for ; i.pce < len(i.Elems); i.pce++ { if v := i.Elems[len(i.Elems)-i.pce-1].Secondary(); v != 0 { i.pce++ return v } } return 0 } func (i *iter) nextTertiary() int { for ; i.pce < len(i.Elems); i.pce++ { if v := i.Elems[i.pce].Tertiary(); v != 0 { i.pce++ return int(v) } } return 0 } func (i *iter) nextQuaternary() int { for ; i.pce < len(i.Elems); i.pce++ { if v := i.Elems[i.pce].Quaternary(); v != 0 { i.pce++ return v } } return 0 } func appendPrimary(key []byte, p int) []byte { // Convert to variable length encoding; supports up to 23 bits. if p <= 0x7FFF { key = append(key, uint8(p>>8), uint8(p)) } else { key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p)) } return key } // keyFromElems converts the weights ws to a compact sequence of bytes. // The result will be appended to the byte buffer in buf. func (c *Collator) keyFromElems(buf *Buffer, ws []colltab.Elem) { for _, v := range ws { if w := v.Primary(); w > 0 { buf.key = appendPrimary(buf.key, w) } } if !c.ignore[colltab.Secondary] { buf.key = append(buf.key, 0, 0) // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF. if !c.backwards { for _, v := range ws { if w := v.Secondary(); w > 0 { buf.key = append(buf.key, uint8(w>>8), uint8(w)) } } } else { for i := len(ws) - 1; i >= 0; i-- { if w := ws[i].Secondary(); w > 0 { buf.key = append(buf.key, uint8(w>>8), uint8(w)) } } } } else if c.caseLevel { buf.key = append(buf.key, 0, 0) } if !c.ignore[colltab.Tertiary] || c.caseLevel { buf.key = append(buf.key, 0, 0) for _, v := range ws { if w := v.Tertiary(); w > 0 { buf.key = append(buf.key, uint8(w)) } } // Derive the quaternary weights from the options and other levels. // Note that we represent MaxQuaternary as 0xFF. The first byte of the // representation of a primary weight is always smaller than 0xFF, // so using this single byte value will compare correctly. if !c.ignore[colltab.Quaternary] && c.alternate >= altShifted { if c.alternate == altShiftTrimmed { lastNonFFFF := len(buf.key) buf.key = append(buf.key, 0) for _, v := range ws { if w := v.Quaternary(); w == colltab.MaxQuaternary { buf.key = append(buf.key, 0xFF) } else if w > 0 { buf.key = appendPrimary(buf.key, w) lastNonFFFF = len(buf.key) } } buf.key = buf.key[:lastNonFFFF] } else { buf.key = append(buf.key, 0) for _, v := range ws { if w := v.Quaternary(); w == colltab.MaxQuaternary { buf.key = append(buf.key, 0xFF) } else if w > 0 { buf.key = appendPrimary(buf.key, w) } } } } } } func processWeights(vw alternateHandling, top uint32, wa []colltab.Elem) { ignore := false vtop := int(top) switch vw { case altShifted, altShiftTrimmed: for i := range wa { if p := wa[i].Primary(); p <= vtop && p != 0 { wa[i] = colltab.MakeQuaternary(p) ignore = true } else if p == 0 { if ignore { wa[i] = colltab.Ignore } } else { ignore = false } } case altBlanked: for i := range wa { if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) { wa[i] = colltab.Ignore ignore = true } else { ignore = false } } } }