aboutsummaryrefslogblamecommitdiffstats
path: root/vendor/golang.org/x/text/unicode/cldr/collate.go
blob: 80ee28d795e3ab6ad38a3c2b3f4e8fab9f4ea069 (plain) (tree)






































































































































































































































































































































































                                                                                              
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cldr

import (
    "bufio"
    "encoding/xml"
    "errors"
    "fmt"
    "strconv"
    "strings"
    "unicode"
    "unicode/utf8"
)

// RuleProcessor can be passed to Collation's Process method, which
// parses the rules and calls the respective method for each rule found.
// An error returned by Reset or Insert is handed back to the parser.
type RuleProcessor interface {
    // Reset is called for every reset rule. anchor is the reset string;
    // logical positions are passed in the form "<name/>". before is the
    // "before" level given with the reset, or 0 if there was none.
    Reset(anchor string, before int) error
    // Insert is called for every relation. level is 1 to 4 for the "<" to
    // "<<<<" relations (1, 2 and 3 for the XML tags p, s and t) and 5 for
    // the identity relation ("=" or the XML tag i). context and extend are
    // empty when the rule has no context or extension.
    Insert(level int, str, context, extend string) error
    // Index is called instead of Insert for a rule whose value starts with
    // the index sentinel U+FDD0; id is the value with the sentinel removed.
    Index(id string)
}

const (
    // cldrIndex is a Unicode-reserved sentinel value used to mark the start
    // of a grouping within an index.
    // A rule whose value starts with this rune is not inserted; the rest of
    // its value is reported through RuleProcessor.Index instead.
    // See http://unicode.org/reports/tr35/#Collation_Elements for details.
    cldrIndex = "\uFDD0"

    // specialAnchor is the format in which to represent logical reset positions,
    // such as "first tertiary ignorable". The position name replaces %s and the
    // result is passed as the anchor to RuleProcessor.Reset.
    specialAnchor = "<%s/>"
)

// Process feeds the tailoring rules of this collation to p, calling the
// method of p that corresponds to each rule.
//
// A single cr element is parsed as rule text; more than one cr element is an
// error. Without cr elements the XML rules are used if present. If neither
// form is available an error is returned.
func (c Collation) Process(p RuleProcessor) (err error) {
    switch n := len(c.Cr); {
    case n > 1:
        return fmt.Errorf("multiple cr elements, want 0 or 1")
    case n == 1:
        return processRules(p, c.Cr[0].Data())
    case c.Rules.Any != nil:
        return c.processXML(p)
    }
    return errors.New("no tailoring data")
}

// processRules parses rules in the Collation Rule Syntax defined in
// http://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings
// and calls the corresponding method of p for each rule found.
//
// The input is read line by line; a '#' at the start of a rule begins a
// comment that runs to the end of the line. Parsing stops at the first
// error: no further rule, on the same line or on a later one, is passed to p.
// A returned error is prefixed with the number of lines read so far, which
// for a rule error is the 1-based number of the offending line.
func processRules(p RuleProcessor, s string) (err error) {
    // chk records the first error seen and passes the unparsed remainder
    // through, so that it can wrap calls returning (tail, error).
    chk := func(s string, e error) string {
        if err == nil {
            err = e
        }
        return s
    }
    i := 0 // Save the line number for use after the loop.
    scanner := bufio.NewScanner(strings.NewReader(s))
    for ; scanner.Scan() && err == nil; i++ {
        // The err check ends the scan of the current line as soon as one
        // rule fails, so that p is never called for rules after an error.
        for s := skipSpace(scanner.Text()); err == nil && s != "" && s[0] != '#'; s = skipSpace(s) {
            level := 5 // Level of the identity relation '='.
            var ch byte
            switch ch, s = s[0], s[1:]; ch {
            case '&': // followed by <anchor> or '[' <key> ']'
                if s = skipSpace(s); consume(&s, '[') {
                    s = chk(parseSpecialAnchor(p, s))
                } else {
                    s = chk(parseAnchor(p, 0, s))
                }
            case '<': // sort relation '<'{1,4}, optionally followed by '*'.
                for level = 1; consume(&s, '<'); level++ {
                }
                if level > 4 {
                    // Do not hand an out-of-range level to p.
                    err = fmt.Errorf("level %d > 4", level)
                    continue
                }
                fallthrough
            case '=': // identity relation, optionally followed by *.
                if consume(&s, '*') {
                    s = chk(parseSequence(p, level, s))
                } else {
                    s = chk(parseOrder(p, level, s))
                }
            default:
                chk("", fmt.Errorf("illegal operator %q", ch))
            }
        }
    }
    if chk("", scanner.Err()); err != nil {
        return fmt.Errorf("%d: %v", i, err)
    }
    return nil
}

// parseSpecialAnchor handles a reset whose target starts with a bracketed
// expression, which is either of the form
//    ['before' <level>] <anchor>
// or
//    [<label>]
// The opening bracket should already be consumed. For the first form the
// anchor following the bracket is parsed and reset with the given level; for
// the second form p.Reset is called with the label formatted as a special
// anchor and a level of 0. The returned tail is the unparsed remainder of s.
func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) {
    end := strings.IndexByte(s, ']')
    if end < 0 {
        return "", errors.New("unmatched bracket")
    }
    inner, rest := strings.TrimSpace(s[:end]), s[end+1:]

    const keyword = "before "
    if !strings.HasPrefix(inner, keyword) {
        return rest, p.Reset(fmt.Sprintf(specialAnchor, inner), 0)
    }
    before, perr := strconv.ParseUint(skipSpace(inner[len(keyword):]), 10, 3)
    if perr != nil {
        return rest, perr
    }
    return parseAnchor(p, int(before), rest)
}

// parseAnchor scans the anchor string at the start of s and calls p.Reset
// with it and the given before level. It returns the unparsed remainder of s
// together with the scan error or, if scanning succeeded, the error from
// p.Reset.
func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) {
    var anchor string
    if anchor, tail, err = scanString(s); err == nil {
        err = p.Reset(anchor, level)
    }
    return tail, err
}

// parseOrder parses the operand of a single relation: a string, optionally
// followed by '|' and a context string and by '/' and an extension string.
// It calls p.Insert with the given level and the parsed strings and returns
// the unparsed remainder of s.
//
// A value that starts with the index sentinel cldrIndex is not inserted; the
// rest of the value is passed to p.Index instead. The remainder of the line
// is still returned in that case, so that rules following an index entry on
// the same line are not dropped.
func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) {
    var value, context, extend string
    if value, s, err = scanString(s); err != nil {
        return s, err
    }
    if strings.HasPrefix(value, cldrIndex) {
        p.Index(value[len(cldrIndex):])
        return s, nil
    }
    if consume(&s, '|') {
        if context, s, err = scanString(s); err != nil {
            return s, errors.New("missing string after context")
        }
    }
    if consume(&s, '/') {
        if extend, s, err = scanString(s); err != nil {
            return s, errors.New("missing string after extension")
        }
    }
    return s, p.Insert(level, value, context, extend)
}

// scanString reads one string operand from the start of s, after skipping
// leading white space. The operand ends at white space or at one of the
// characters '&', '<', '=' and '#'. Text between single quotes is copied
// verbatim, and two adjacent single quotes stand for one quote character.
// It returns the operand and the remainder of s with leading white space
// removed. An empty input and an unterminated quote are errors.
func scanString(s string) (str, tail string, err error) {
    s = skipSpace(s)
    if s == "" {
        return s, s, errors.New("missing string")
    }
    var scratch [16]byte // small but enough to hold most cases.
    out := scratch[:0]
scan:
    for len(s) > 0 {
        if consume(&s, '\'') {
            end := strings.IndexByte(s, '\'')
            switch {
            case end < 0:
                return "", "", errors.New(`unmatched single quote`)
            case end == 0:
                out = append(out, '\'')
            default:
                out = append(out, s[:end]...)
            }
            s = s[end+1:]
            continue
        }
        r, n := utf8.DecodeRuneInString(s)
        if unicode.IsSpace(r) || strings.ContainsRune("&<=#", r) {
            break scan
        }
        out = append(out, s[:n]...)
        s = s[n:]
    }
    return string(out), skipSpace(s), nil
}

// parseSequence parses the operand of a starred relation, in which every
// rune is a separate element. It calls p.Insert with the given level for each
// rune and returns the unparsed remainder of s.
//
// A '-' between two runes denotes a range: the rune before it has already
// been inserted and every following rune up to and including the one after
// the '-' is inserted as well. A range needs a preceding single rune and its
// end must not be smaller than its start.
//
// The sequence ends at the first white space or punctuation rune other than
// '-'. That rune is left in the returned tail, so that a '&' or '#' directly
// following the sequence is still seen by the caller.
func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) {
    if s = skipSpace(s); s == "" {
        return s, errors.New("empty sequence")
    }
    last := rune(0)
    for s != "" {
        r, sz := utf8.DecodeRuneInString(s)

        // '-' is punctuation as well, but it is the range operator here.
        if r != '-' && (unicode.IsSpace(r) || unicode.IsPunct(r)) {
            break
        }
        s = s[sz:]

        if r == '-' {
            // We have a range. The first element was already written.
            if last == 0 {
                return s, errors.New("range without starter value")
            }
            r, sz = utf8.DecodeRuneInString(s)
            s = s[sz:]
            if r == utf8.RuneError || r < last {
                return s, fmt.Errorf("invalid range %q-%q", last, r)
            }
            for i := last + 1; i <= r; i++ {
                if err := p.Insert(level, string(i), "", ""); err != nil {
                    return s, err
                }
            }
            // A range end cannot start another range.
            last = 0
            continue
        }

        // normal case
        if err := p.Insert(level, string(r), "", ""); err != nil {
            return s, err
        }
        last = r
    }
    return s, nil
}

// skipSpace returns s without its leading white space, as defined by
// unicode.IsSpace. It returns the empty string if s consists of white space
// only.
func skipSpace(s string) string {
    start := strings.IndexFunc(s, func(r rune) bool {
        return !unicode.IsSpace(r)
    })
    if start < 0 {
        return ""
    }
    return s[start:]
}

// consume reports whether the first byte of *s is ch. If so, it gobbles that
// byte by updating *s; otherwise *s is left unchanged.
func consume(s *string, ch byte) (ok bool) {
    if str := *s; len(str) > 0 && str[0] == ch {
        *s = str[1:]
        return true
    }
    return false
}

// The following code parses Collation rules of CLDR version 24 and before.

// lmap maps the first letter of an XML rule tag ("p", "s", "t", "i" and the
// per-character variants "pc", "sc", "tc", "ic") to the level that is passed
// to RuleProcessor.Insert.
var lmap = map[byte]int{
    'p': 1,
    's': 2,
    't': 3,
    'i': 5, // Same level as processRules uses for the identity relation '='.
}

// rulesElem holds the content of a "rules" XML element. Every child element
// is captured, together with its XML name, in Rules.Any.
// NOTE(review): rulesElem is not referenced in this part of the file;
// presumably it is embedded by a type declared elsewhere — confirm.
type rulesElem struct {
    Rules struct {
        Common
        // Any holds the child elements in the order in which they were decoded.
        Any []*struct {
            XMLName xml.Name
            rule
        } `xml:",any"`
    } `xml:"rules"`
}

// rule is the decoded form of a single XML rule element.
type rule struct {
    // Value is the character data of the element. It is rewritten by the
    // value method.
    Value string `xml:",chardata"`
    // Before is the "before" attribute; processXML reads it for reset elements.
    Before string `xml:"before,attr"`
    // Any holds the nested elements, each with its XML name.
    Any []*struct {
        XMLName xml.Name
        rule
    } `xml:",any"`
}

// emptyValueError is returned by rule.value for a rule that has no character
// data and does not contain exactly one nested element.
var emptyValueError = errors.New("cldr: empty rule value")

// value returns the string this rule stands for and stores it in r.Value.
//
// The character data is first passed through charRe and replaceUnicode. If
// the result is empty, the rule must contain exactly one nested element; the
// name of that element, formatted as a special anchor, becomes the value and
// r.Any is cleared. A rule with character data must not contain nested
// elements.
func (r *rule) value() (string, error) {
    // Convert hexadecimal Unicode codepoint notation to a string.
    r.Value = charRe.ReplaceAllStringFunc(r.Value, replaceUnicode)

    switch children := len(r.Any); {
    case r.Value != "" && children != 0:
        return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any)
    case r.Value == "" && children != 1:
        return "", emptyValueError
    case r.Value == "":
        r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local)
        r.Any = nil
    }
    return r.Value, nil
}

// process passes the rule to p. name is the XML tag of the rule; context and
// extend are forwarded unchanged to p.Insert.
//
// For the tags p, s, t and i the whole value is inserted as one string,
// unless it starts with the index sentinel cldrIndex, in which case the rest
// of the value goes to p.Index. For the tags pc, sc, tc and ic every rune of
// the value is inserted separately. Any other tag is an error.
func (r rule) process(p RuleProcessor, name, context, extend string) error {
    text, err := r.value()
    if err != nil {
        return err
    }

    perRune := false
    switch name {
    case "p", "s", "t", "i":
    case "pc", "sc", "tc", "ic":
        perRune = true
    default:
        return fmt.Errorf("cldr: unsupported tag: %q", name)
    }

    if !perRune {
        if strings.HasPrefix(text, cldrIndex) {
            p.Index(text[len(cldrIndex):])
            return nil
        }
        return p.Insert(lmap[name[0]], text, context, extend)
    }

    level := lmap[name[0]]
    for _, ch := range text {
        if insErr := p.Insert(level, string(ch), context, extend); insErr != nil {
            return insErr
        }
    }
    return nil
}

// processXML parses the format of CLDR versions 24 and older and calls the
// corresponding method of p for each rule in c.Rules.Any.
//
// A "reset" element results in a call to p.Reset with the level taken from
// its "before" attribute (0 if absent). An "x" element groups rules with a
// shared context and extension: the values of its "context" and "extend"
// children are passed along with each of its other children. All other
// elements are processed as plain rules without context or extension.
//
// Processing stops at the first error, which is returned. This includes
// errors from the children of an "x" element.
func (c Collation) processXML(p RuleProcessor) (err error) {
    // Collation is generated and defined in xml.go.
    var v string
    for _, r := range c.Rules.Any {
        switch r.XMLName.Local {
        case "reset":
            level := 0
            switch r.Before {
            case "primary", "1":
                level = 1
            case "secondary", "2":
                level = 2
            case "tertiary", "3":
                level = 3
            case "":
            default:
                return fmt.Errorf("cldr: unknown level %q", r.Before)
            }
            v, err = r.value()
            if err == nil {
                err = p.Reset(v, level)
            }
        case "x":
            var context, extend string
            // First collect context and extension, as they may follow the
            // rules they apply to.
            for _, r1 := range r.Any {
                switch r1.XMLName.Local {
                case "context":
                    if context, err = r1.value(); err != nil {
                        return err
                    }
                case "extend":
                    if extend, err = r1.value(); err != nil {
                        return err
                    }
                }
            }
            for _, r1 := range r.Any {
                if t := r1.XMLName.Local; t == "context" || t == "extend" {
                    continue
                }
                if err = r1.rule.process(p, r1.XMLName.Local, context, extend); err != nil {
                    return err
                }
            }
        default:
            err = r.rule.process(p, r.XMLName.Local, "", "")
        }
        if err != nil {
            return err
        }
    }
    return nil
}