aboutsummaryrefslogtreecommitdiffstats
path: root/Godeps/_workspace/src/golang.org/x/text/encoding/encoding.go
blob: 2f99a02ac47f6f326f0e4580e3d1733cc131222b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// To convert the bytes of an io.Reader r from the encoding e to UTF-8:
//  rInUTF8 := transform.NewReader(r, e.NewDecoder())
// and to convert from UTF-8 to the encoding e:
//  wInUTF8 := transform.NewWriter(w, e.NewEncoder())
// In both cases, import "golang.org/x/text/transform".
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
package encoding

import (
    "errors"
    "unicode/utf8"

    "golang.org/x/text/transform"
)

// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
//
// An Encoding hands out transformers for both directions: NewDecoder for
// converting text in the encoding to UTF-8, and NewEncoder for converting
// UTF-8 to the encoding.
type Encoding interface {
    // NewDecoder returns a transformer that converts to UTF-8.
    //
    // Transforming source bytes that are not of that encoding will not
    // result in an error per se. Each byte that cannot be transcoded will
    // be represented in the output by the UTF-8 encoding of '\uFFFD', the
    // replacement rune.
    NewDecoder() transform.Transformer

    // NewEncoder returns a transformer that converts from UTF-8.
    //
    // Transforming source bytes that are not valid UTF-8 will not result in
    // an error per se. Each rune that cannot be transcoded will be
    // represented in the output by an encoding-specific replacement such as
    // "\x1a" (the ASCII substitute character, see ASCIISub) or "\xff\xfd".
    // To return early with an error instead, use transform.Chain to
    // preprocess the data with UTF8Validator, which reports ErrInvalidUTF8.
    NewEncoder() transform.Transformer
}

// ASCIISub is the ASCII substitute character (SUB, code point 0x1A), as
// recommended by http://unicode.org/reports/tr36/#Text_Comparison
//
// It is an untyped rune constant.
const ASCIISub = '\x1a'

// Nop is the nop encoding. Its transformed bytes are the same as the source
// bytes; it does not replace invalid UTF-8 sequences.
//
// Both directions are served by transform.Nop.
var Nop Encoding = nop{}

// nop is the stateless implementation behind Nop.
type nop struct{}

// NewDecoder returns transform.Nop.
func (nop) NewDecoder() transform.Transformer {
    return transform.Nop
}

// NewEncoder returns transform.Nop.
func (nop) NewEncoder() transform.Transformer {
    return transform.Nop
}

// Replacement is the replacement encoding. Decoding from the replacement
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
// the replacement encoding yields the same as the source bytes except that
// invalid UTF-8 is converted to '\uFFFD'.
//
// It is defined at http://encoding.spec.whatwg.org/#replacement
var Replacement Encoding = replacement{}

// replacement is the stateless implementation behind Replacement.
type replacement struct{}

// NewDecoder returns a new replacementDecoder value.
func (replacement) NewDecoder() transform.Transformer {
    return replacementDecoder{}
}

// NewEncoder returns a new replacementEncoder value.
func (replacement) NewEncoder() transform.Transformer {
    return replacementEncoder{}
}

// replacementDecoder is the decoding half of the Replacement encoding. It
// carries no state of its own.
type replacementDecoder struct{ transform.NopResetter }

// Transform consumes all of src without inspecting it. Nothing is written
// until atEOF is true, at which point the UTF-8 encoding of U+FFFD is written
// to the start of dst.
//
// dst must always have room for those three bytes, even on calls that do not
// write them; otherwise nothing is consumed and transform.ErrShortDst is
// returned.
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    // U+FFFD occupies three bytes in UTF-8.
    const encodedFFFD = "\ufffd"

    if len(dst) < len(encodedFFFD) {
        return 0, 0, transform.ErrShortDst
    }
    if !atEOF {
        // Swallow the input; the replacement rune is emitted only at the end.
        return 0, len(src), nil
    }
    return copy(dst, encodedFFFD), len(src), nil
}

// replacementEncoder is the encoding half of the Replacement encoding. It
// carries no state of its own.
type replacementEncoder struct{ transform.NopResetter }

// Transform copies src to dst rune by rune, writing U+FFFD in place of every
// byte that does not start a valid UTF-8 sequence.
//
// It stops with transform.ErrShortSrc when src ends in an incomplete sequence
// and atEOF is false, and with transform.ErrShortDst when the next rune does
// not fit in the remainder of dst. In both cases the returned counts cover
// only the runes fully processed before the stop.
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    for nSrc < len(src) {
        var (
            r     rune
            width int
        )

        if b := src[nSrc]; b < utf8.RuneSelf {
            // Single-byte (ASCII) rune.
            r, width = rune(b), 1
        } else {
            rest := src[nSrc:]
            r, width = utf8.DecodeRune(rest)
            if width == 1 {
                // A width of one for a non-ASCII lead byte means the bytes
                // are either malformed or merely truncated. Only the former
                // is substituted; the latter waits for more input.
                if !atEOF && !utf8.FullRune(rest) {
                    return nDst, nSrc, transform.ErrShortSrc
                }
                r = utf8.RuneError
            }
        }

        if utf8.RuneLen(r) > len(dst)-nDst {
            return nDst, nSrc, transform.ErrShortDst
        }
        nDst += utf8.EncodeRune(dst[nDst:], r)
        nSrc += width
    }
    return nDst, nSrc, nil
}

// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
//
// In this file it is returned by UTF8Validator.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")

// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
// input byte that is not valid UTF-8. Valid input is copied to the
// destination unchanged.
var UTF8Validator transform.Transformer = utf8Validator{}

// utf8Validator is the stateless implementation behind UTF8Validator.
type utf8Validator struct{ transform.NopResetter }

// Transform copies src to dst verbatim for as long as src is valid UTF-8.
// nDst and nSrc are always equal because output bytes mirror input bytes.
//
// It returns:
//   - ErrInvalidUTF8 at the first byte that cannot start a valid sequence;
//   - transform.ErrShortSrc when src ends in an incomplete sequence and atEOF
//     is false;
//   - transform.ErrShortDst when dst cannot hold all of src, including the
//     case where a multi-byte rune would straddle the end of dst.
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    // Only as many bytes as fit in dst are examined.
    limit := len(src)
    truncated := limit > len(dst)
    if truncated {
        limit = len(dst)
    }

    pos := 0
    for pos < limit {
        if b := src[pos]; b < utf8.RuneSelf {
            // ASCII fast path.
            dst[pos] = b
            pos++
            continue
        }

        tail := src[pos:]
        _, width := utf8.DecodeRune(tail)
        switch {
        case width == 1:
            // A width of one for a non-ASCII lead byte means the bytes are
            // either malformed or merely truncated.
            if atEOF || utf8.FullRune(tail) {
                return pos, pos, ErrInvalidUTF8
            }
            return pos, pos, transform.ErrShortSrc
        case pos+width > len(dst):
            // Never emit a partial rune.
            return pos, pos, transform.ErrShortDst
        }

        for end := pos + width; pos < end; pos++ {
            dst[pos] = src[pos]
        }
    }

    if truncated {
        return limit, limit, transform.ErrShortDst
    }
    return limit, limit, nil
}