aboutsummaryrefslogblamecommitdiffstats
path: root/Godeps/_workspace/src/golang.org/x/text/transform/transform.go
blob: 51862b02bf5b34f593c8d587cf452ff13a4c24d9 (plain) (tree)
















































































































































































































                                                                                                    


                                         







                                                                                        
                 



























                                                                                                 
                         
                 
                             




                                            

                          
                                                                  







                                                                    














































































































































































































































                                                                                                  


                            













                                                                             
                 
                    




                                                                                    

         





                                                                               












                                                                                   
             






                                                                             

                                                                                         
                                                         

                             











                                                                                          

                 














                                                                                  
         
 



                                                                                


                                                    
                                                                                   

                                        
                                                                                     


                            


                                                                                      


                                                     
                                              


                                                  
                                                        







                                                                              














                                                                                           
                 
                 
             
                                                                            












                                                                                      
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package transform provides reader and writer wrappers that transform the
// bytes passing through as well as various transformations. Example
// transformations provided by other packages include normalization and
// conversion between character sets.
package transform

import (
    "bytes"
    "errors"
    "io"
    "unicode/utf8"
)

var (
    // ErrShortDst means that the destination buffer was too short to
    // receive all of the transformed bytes.
    ErrShortDst = errors.New("transform: short destination buffer")

    // ErrShortSrc means that the source buffer has insufficient data to
    // complete the transformation.
    ErrShortSrc = errors.New("transform: short source buffer")

    // errInconsistentByteCount means that Transform returned success (nil
    // error) but also returned nSrc inconsistent with the src argument.
    errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")

    // errShortInternal means that an internal buffer is not large enough
    // to make progress and the Transform operation must be aborted.
    errShortInternal = errors.New("transform: short internal buffer")
)

// Transformer transforms bytes.
type Transformer interface {
    // Transform writes to dst the transformed bytes read from src, and
    // returns the number of dst bytes written and src bytes read. The
    // atEOF argument tells whether src represents the last bytes of the
    // input.
    //
    // Callers should always process the nDst bytes produced and account
    // for the nSrc bytes consumed before considering the error err.
    //
    // A nil error means that all of the transformed bytes (whether freshly
    // transformed from src or left over from previous Transform calls)
    // were written to dst. A nil error can be returned regardless of
    // whether atEOF is true. If err is nil then nSrc must equal len(src);
    // the converse is not necessarily true.
    //
    // ErrShortDst means that dst was too short to receive all of the
    // transformed bytes. ErrShortSrc means that src had insufficient data
    // to complete the transformation. If both conditions apply, then
    // either error may be returned. Other than the error conditions listed
    // here, implementations are free to report other errors that arise.
    Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)

    // Reset resets the state and allows a Transformer to be reused.
    Reset()
}

// NopResetter can be embedded by implementations of Transformer to add a nop
// Reset method.
type NopResetter struct{}

// Reset implements the Reset method of the Transformer interface.
func (NopResetter) Reset() {}

// Reader wraps another io.Reader by transforming the bytes read.
type Reader struct {
    r   io.Reader
    t   Transformer
    err error

    // dst[dst0:dst1] contains bytes that have been transformed by t but
    // not yet copied out via Read.
    dst        []byte
    dst0, dst1 int

    // src[src0:src1] contains bytes that have been read from r but not
    // yet transformed through t.
    src        []byte
    src0, src1 int

    // transformComplete is whether the transformation is complete,
    // regardless of whether or not it was successful.
    transformComplete bool
}

const defaultBufSize = 4096

// NewReader returns a new Reader that wraps r by transforming the bytes read
// via t. It calls Reset on t.
func NewReader(r io.Reader, t Transformer) *Reader {
    t.Reset()
    return &Reader{
        r:   r,
        t:   t,
        dst: make([]byte, defaultBufSize),
        src: make([]byte, defaultBufSize),
    }
}

// Read implements the io.Reader interface.
func (r *Reader) Read(p []byte) (int, error) {
    n, err := 0, error(nil)
    for {
        // Copy out any transformed bytes and return the final error if we are done.
        if r.dst0 != r.dst1 {
            n = copy(p, r.dst[r.dst0:r.dst1])
            r.dst0 += n
            if r.dst0 == r.dst1 && r.transformComplete {
                return n, r.err
            }
            return n, nil
        } else if r.transformComplete {
            return 0, r.err
        }

        // Try to transform some source bytes, or to flush the transformer if we
        // are out of source bytes. We do this even if r.r.Read returned an error.
        // As the io.Reader documentation says, "process the n > 0 bytes returned
        // before considering the error".
        if r.src0 != r.src1 || r.err != nil {
            r.dst0 = 0
            r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
            r.src0 += n

            switch {
            case err == nil:
                if r.src0 != r.src1 {
                    r.err = errInconsistentByteCount
                }
                // The Transform call was successful; we are complete if we
                // cannot read more bytes into src.
                r.transformComplete = r.err != nil
                continue
            case err == ErrShortDst && (r.dst1 != 0 || n != 0):
                // Make room in dst by copying out, and try again.
                continue
            case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
                // Read more bytes into src via the code below, and try again.
            default:
                r.transformComplete = true
                // The reader error (r.err) takes precedence over the
                // transformer error (err) unless r.err is nil or io.EOF.
                if r.err == nil || r.err == io.EOF {
                    r.err = err
                }
                continue
            }
        }

        // Move any untransformed source bytes to the start of the buffer
        // and read more bytes.
        if r.src0 != 0 {
            r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
        }
        n, r.err = r.r.Read(r.src[r.src1:])
        r.src1 += n
    }
}

// TODO: implement ReadByte (and ReadRune??).

// Writer wraps another io.Writer by transforming the bytes read.
// The user needs to call Close to flush unwritten bytes that may
// be buffered.
type Writer struct {
    w   io.Writer
    t   Transformer
    dst []byte

    // src[:n] contains bytes that have not yet passed through t.
    src []byte
    n   int
}

// NewWriter returns a new Writer that wraps w by transforming the bytes written
// via t. It calls Reset on t.
func NewWriter(w io.Writer, t Transformer) *Writer {
    t.Reset()
    return &Writer{
        w:   w,
        t:   t,
        dst: make([]byte, defaultBufSize),
        src: make([]byte, defaultBufSize),
    }
}

// Write implements the io.Writer interface. If there are not enough
// bytes available to complete a Transform, the bytes will be buffered
// for the next write. Call Close to convert the remaining bytes.
func (w *Writer) Write(data []byte) (n int, err error) {
    src := data
    if w.n > 0 {
        // Append bytes from data to the last remainder.
        // TODO: limit the amount copied on first try.
        n = copy(w.src[w.n:], data)
        w.n += n
        src = w.src[:w.n]
    }
    for {
        nDst, nSrc, err := w.t.Transform(w.dst, src, false)
        if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
            return n, werr
        }
        src = src[nSrc:]
        if w.n == 0 {
            n += nSrc
        } else if len(src) <= n {
            // Enough bytes from w.src have been consumed. We make src point
            // to data instead to reduce the copying.
            w.n = 0
            n -= len(src)
            src = data[n:]
            if n < len(data) && (err == nil || err == ErrShortSrc) {
                continue
            }
        }
        switch err {
        case ErrShortDst:
            // This error is okay as long as we are making progress.
            if nDst > 0 || nSrc > 0 {
                continue
            }
        case ErrShortSrc:
            if len(src) < len(w.src) {
                m := copy(w.src, src)
                // If w.n > 0, bytes from data were already copied to w.src and n
                // was already set to the number of bytes consumed.
                if w.n == 0 {
                    n += m
                }
                w.n = m
                err = nil
            } else if nDst > 0 || nSrc > 0 {
                // Not enough buffer to store the remainder. Keep processing as
                // long as there is progress. Without this case, transforms that
                // require a lookahead larger than the buffer may result in an
                // error. This is not something one may expect to be common in
                // practice, but it may occur when buffers are set to small
                // sizes during testing.
                continue
            }
        case nil:
            if w.n > 0 {
                err = errInconsistentByteCount
            }
        }
        return n, err
    }
}

// Close implements the io.Closer interface.
func (w *Writer) Close() error {
    src := w.src[:w.n]
    for {
        nDst, nSrc, err := w.t.Transform(w.dst, src, true)
        if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
            return werr
        }
        if err != ErrShortDst {
            return err
        }
        src = src[nSrc:]
    }
}

type nop struct{ NopResetter }

func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    n := copy(dst, src)
    if n < len(src) {
        err = ErrShortDst
    }
    return n, n, err
}

type discard struct{ NopResetter }

func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    return 0, len(src), nil
}

var (
    // Discard is a Transformer for which all Transform calls succeed
    // by consuming all bytes and writing nothing.
    Discard Transformer = discard{}

    // Nop is a Transformer that copies src to dst.
    Nop Transformer = nop{}
)

// chain is a sequence of links. A chain with N Transformers has N+1 links and
// N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
// buffers given to chain.Transform and the middle N-1 buffers are intermediate
// buffers owned by the chain. The i'th link transforms bytes from the i'th
// buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
// chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
type chain struct {
    link []link
    err  error
    // errStart is the index at which the error occurred plus 1. Processing
    // errStart at this level at the next call to Transform. As long as
    // errStart > 0, chain will not consume any more source bytes.
    errStart int
}

func (c *chain) fatalError(errIndex int, err error) {
    if i := errIndex + 1; i > c.errStart {
        c.errStart = i
        c.err = err
    }
}

type link struct {
    t Transformer
    // b[p:n] holds the bytes to be transformed by t.
    b []byte
    p int
    n int
}

func (l *link) src() []byte {
    return l.b[l.p:l.n]
}

func (l *link) dst() []byte {
    return l.b[l.n:]
}

// Chain returns a Transformer that applies t in sequence.
func Chain(t ...Transformer) Transformer {
    if len(t) == 0 {
        return nop{}
    }
    c := &chain{link: make([]link, len(t)+1)}
    for i, tt := range t {
        c.link[i].t = tt
    }
    // Allocate intermediate buffers.
    b := make([][defaultBufSize]byte, len(t)-1)
    for i := range b {
        c.link[i+1].b = b[i][:]
    }
    return c
}

// Reset resets the state of Chain. It calls Reset on all the Transformers.
func (c *chain) Reset() {
    for i, l := range c.link {
        if l.t != nil {
            l.t.Reset()
        }
        c.link[i].p, c.link[i].n = 0, 0
    }
}

// Transform applies the transformers of c in sequence.
func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    // Set up src and dst in the chain.
    srcL := &c.link[0]
    dstL := &c.link[len(c.link)-1]
    srcL.b, srcL.p, srcL.n = src, 0, len(src)
    dstL.b, dstL.n = dst, 0
    var lastFull, needProgress bool // for detecting progress

    // i is the index of the next Transformer to apply, for i in [low, high].
    // low is the lowest index for which c.link[low] may still produce bytes.
    // high is the highest index for which c.link[high] has a Transformer.
    // The error returned by Transform determines whether to increase or
    // decrease i. We try to completely fill a buffer before converting it.
    for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
        in, out := &c.link[i], &c.link[i+1]
        nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
        out.n += nDst
        in.p += nSrc
        if i > 0 && in.p == in.n {
            in.p, in.n = 0, 0
        }
        needProgress, lastFull = lastFull, false
        switch err0 {
        case ErrShortDst:
            // Process the destination buffer next. Return if we are already
            // at the high index.
            if i == high {
                return dstL.n, srcL.p, ErrShortDst
            }
            if out.n != 0 {
                i++
                // If the Transformer at the next index is not able to process any
                // source bytes there is nothing that can be done to make progress
                // and the bytes will remain unprocessed. lastFull is used to
                // detect this and break out of the loop with a fatal error.
                lastFull = true
                continue
            }
            // The destination buffer was too small, but is completely empty.
            // Return a fatal error as this transformation can never complete.
            c.fatalError(i, errShortInternal)
        case ErrShortSrc:
            if i == 0 {
                // Save ErrShortSrc in err. All other errors take precedence.
                err = ErrShortSrc
                break
            }
            // Source bytes were depleted before filling up the destination buffer.
            // Verify we made some progress, move the remaining bytes to the errStart
            // and try to get more source bytes.
            if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
                // There were not enough source bytes to proceed while the source
                // buffer cannot hold any more bytes. Return a fatal error as this
                // transformation can never complete.
                c.fatalError(i, errShortInternal)
                break
            }
            // in.b is an internal buffer and we can make progress.
            in.p, in.n = 0, copy(in.b, in.src())
            fallthrough
        case nil:
            // if i == low, we have depleted the bytes at index i or any lower levels.
            // In that case we increase low and i. In all other cases we decrease i to
            // fetch more bytes before proceeding to the next index.
            if i > low {
                i--
                continue
            }
        default:
            c.fatalError(i, err0)
        }
        // Exhausted level low or fatal error: increase low and continue
        // to process the bytes accepted so far.
        i++
        low = i
    }

    // If c.errStart > 0, this means we found a fatal error.  We will clear
    // all upstream buffers. At this point, no more progress can be made
    // downstream, as Transform would have bailed while handling ErrShortDst.
    if c.errStart > 0 {
        for i := 1; i < c.errStart; i++ {
            c.link[i].p, c.link[i].n = 0, 0
        }
        err, c.errStart, c.err = c.err, 0, nil
    }
    return dstL.n, srcL.p, err
}

// RemoveFunc returns a Transformer that removes from the input all runes r for
// which f(r) is true. Illegal bytes in the input are replaced by RuneError.
func RemoveFunc(f func(r rune) bool) Transformer {
    return removeF(f)
}

type removeF func(r rune) bool

func (removeF) Reset() {}

// Transform implements the Transformer interface.
func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {

        if r = rune(src[0]); r < utf8.RuneSelf {
            sz = 1
        } else {
            r, sz = utf8.DecodeRune(src)

            if sz == 1 {
                // Invalid rune.
                if !atEOF && !utf8.FullRune(src) {
                    err = ErrShortSrc
                    break
                }
                // We replace illegal bytes with RuneError. Not doing so might
                // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
                // The resulting byte sequence may subsequently contain runes
                // for which t(r) is true that were passed unnoticed.
                if !t(r) {
                    if nDst+3 > len(dst) {
                        err = ErrShortDst
                        break
                    }
                    nDst += copy(dst[nDst:], "\uFFFD")
                }
                nSrc++
                continue
            }
        }

        if !t(r) {
            if nDst+sz > len(dst) {
                err = ErrShortDst
                break
            }
            nDst += copy(dst[nDst:], src[:sz])
        }
        nSrc += sz
    }
    return
}

// grow returns a new []byte that is longer than b, and copies the first n bytes
// of b to the start of the new slice.
func grow(b []byte, n int) []byte {
    m := len(b)
    if m <= 32 {
        m = 64
    } else if m <= 256 {
        m *= 2
    } else {
        m += m >> 1
    }
    buf := make([]byte, m)
    copy(buf, b[:n])
    return buf
}

const initialBufSize = 128

// String returns a string with the result of converting s[:n] using t, where
// n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
func String(t Transformer, s string) (result string, n int, err error) {
    t.Reset()
    if s == "" {
        // Fast path for the common case for empty input. Results in about a
        // 86% reduction of running time for BenchmarkStringLowerEmpty.
        if _, _, err := t.Transform(nil, nil, true); err == nil {
            return "", 0, nil
        }
    }

    // Allocate only once. Note that both dst and src escape when passed to
    // Transform.
    buf := [2 * initialBufSize]byte{}
    dst := buf[:initialBufSize:initialBufSize]
    src := buf[initialBufSize : 2*initialBufSize]

    // The input string s is transformed in multiple chunks (starting with a
    // chunk size of initialBufSize). nDst and nSrc are per-chunk (or
    // per-Transform-call) indexes, pDst and pSrc are overall indexes.
    nDst, nSrc := 0, 0
    pDst, pSrc := 0, 0

    // pPrefix is the length of a common prefix: the first pPrefix bytes of the
    // result will equal the first pPrefix bytes of s. It is not guaranteed to
    // be the largest such value, but if pPrefix, len(result) and len(s) are
    // all equal after the final transform (i.e. calling Transform with atEOF
    // being true returned nil error) then we don't need to allocate a new
    // result string.
    pPrefix := 0
    for {
        // Invariant: pDst == pPrefix && pSrc == pPrefix.

        n := copy(src, s[pSrc:])
        nDst, nSrc, err = t.Transform(dst, src[:n], pSrc+n == len(s))
        pDst += nDst
        pSrc += nSrc

        // TODO:  let transformers implement an optional Spanner interface, akin
        // to norm's QuickSpan. This would even allow us to avoid any allocation.
        if !bytes.Equal(dst[:nDst], src[:nSrc]) {
            break
        }
        pPrefix = pSrc
        if err == ErrShortDst {
            // A buffer can only be short if a transformer modifies its input.
            break
        } else if err == ErrShortSrc {
            if nSrc == 0 {
                // No progress was made.
                break
            }
            // Equal so far and !atEOF, so continue checking.
        } else if err != nil || pPrefix == len(s) {
            return string(s[:pPrefix]), pPrefix, err
        }
    }
    // Post-condition: pDst == pPrefix + nDst && pSrc == pPrefix + nSrc.

    // We have transformed the first pSrc bytes of the input s to become pDst
    // transformed bytes. Those transformed bytes are discontiguous: the first
    // pPrefix of them equal s[:pPrefix] and the last nDst of them equal
    // dst[:nDst]. We copy them around, into a new dst buffer if necessary, so
    // that they become one contiguous slice: dst[:pDst].
    if pPrefix != 0 {
        newDst := dst
        if pDst > len(newDst) {
            newDst = make([]byte, len(s)+nDst-nSrc)
        }
        copy(newDst[pPrefix:pDst], dst[:nDst])
        copy(newDst[:pPrefix], s[:pPrefix])
        dst = newDst
    }

    // Prevent duplicate Transform calls with atEOF being true at the end of
    // the input. Also return if we have an unrecoverable error.
    if (err == nil && pSrc == len(s)) ||
        (err != nil && err != ErrShortDst && err != ErrShortSrc) {
        return string(dst[:pDst]), pSrc, err
    }

    // Transform the remaining input, growing dst and src buffers as necessary.
    for {
        n := copy(src, s[pSrc:])
        nDst, nSrc, err := t.Transform(dst[pDst:], src[:n], pSrc+n == len(s))
        pDst += nDst
        pSrc += nSrc

        // If we got ErrShortDst or ErrShortSrc, do not grow as long as we can
        // make progress. This may avoid excessive allocations.
        if err == ErrShortDst {
            if nDst == 0 {
                dst = grow(dst, pDst)
            }
        } else if err == ErrShortSrc {
            if nSrc == 0 {
                src = grow(src, 0)
            }
        } else if err != nil || pSrc == len(s) {
            return string(dst[:pDst]), pSrc, err
        }
    }
}

// Bytes returns a new byte slice with the result of converting b[:n] using t,
// where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
    return doAppend(t, 0, make([]byte, len(b)), b)
}

// Append appends the result of converting src[:n] using t to dst, where
// n <= len(src), If err == nil, n will be len(src). It calls Reset on t.
func Append(t Transformer, dst, src []byte) (result []byte, n int, err error) {
    if len(dst) == cap(dst) {
        n := len(src) + len(dst) // It is okay for this to be 0.
        b := make([]byte, n)
        dst = b[:copy(b, dst)]
    }
    return doAppend(t, len(dst), dst[:cap(dst)], src)
}

func doAppend(t Transformer, pDst int, dst, src []byte) (result []byte, n int, err error) {
    t.Reset()
    pSrc := 0
    for {
        nDst, nSrc, err := t.Transform(dst[pDst:], src[pSrc:], true)
        pDst += nDst
        pSrc += nSrc
        if err != ErrShortDst {
            return dst[:pDst], pSrc, err
        }

        // Grow the destination buffer, but do not grow as long as we can make
        // progress. This may avoid excessive allocations.
        if nDst == 0 {
            dst = grow(dst, pDst)
        }
    }
}