// Source: "Code-Repo" Subversion repository web view (revision 87, author Kevin).
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package com.google.protobuf;
32
 
33
import java.io.UnsupportedEncodingException;
34
 
35
/**
 * The classes contained within are used internally by the Protocol Buffer
 * library and generated message implementations. They are public only because
 * those generated messages do not reside in the {@code protobuf} package.
 * Others should not use this class directly.
 *
 * @author kenton@google.com (Kenton Varda)
 */
public class Internal {
44
  /**
45
   * Helper called by generated code to construct default values for string
46
   * fields.
47
   * <p>
48
   * The protocol compiler does not actually contain a UTF-8 decoder -- it
49
   * just pushes UTF-8-encoded text around without touching it.  The one place
50
   * where this presents a problem is when generating Java string literals.
51
   * Unicode characters in the string literal would normally need to be encoded
52
   * using a Unicode escape sequence, which would require decoding them.
53
   * To get around this, protoc instead embeds the UTF-8 bytes into the
54
   * generated code and leaves it to the runtime library to decode them.
55
   * <p>
56
   * It gets worse, though.  If protoc just generated a byte array, like:
57
   *   new byte[] {0x12, 0x34, 0x56, 0x78}
58
   * Java actually generates *code* which allocates an array and then fills
59
   * in each value.  This is much less efficient than just embedding the bytes
60
   * directly into the bytecode.  To get around this, we need another
61
   * work-around.  String literals are embedded directly, so protoc actually
62
   * generates a string literal corresponding to the bytes.  The easiest way
63
   * to do this is to use the ISO-8859-1 character set, which corresponds to
64
   * the first 256 characters of the Unicode range.  Protoc can then use
65
   * good old CEscape to generate the string.
66
   * <p>
67
   * So we have a string literal which represents a set of bytes which
68
   * represents another string.  This function -- stringDefaultValue --
69
   * converts from the generated string to the string we actually want.  The
70
   * generated code calls this automatically.
71
   */
72
  public static String stringDefaultValue(String bytes) {
73
    try {
74
      return new String(bytes.getBytes("ISO-8859-1"), "UTF-8");
75
    } catch (UnsupportedEncodingException e) {
76
      // This should never happen since all JVMs are required to implement
77
      // both of the above character sets.
78
      throw new IllegalStateException(
79
          "Java VM does not support a standard character set.", e);
80
    }
81
  }
82
 
83
  /**
84
   * Helper called by generated code to construct default values for bytes
85
   * fields.
86
   * <p>
87
   * This is a lot like {@link #stringDefaultValue}, but for bytes fields.
88
   * In this case we only need the second of the two hacks -- allowing us to
89
   * embed raw bytes as a string literal with ISO-8859-1 encoding.
90
   */
91
  public static ByteString bytesDefaultValue(String bytes) {
92
    try {
93
      return ByteString.copyFrom(bytes.getBytes("ISO-8859-1"));
94
    } catch (UnsupportedEncodingException e) {
95
      // This should never happen since all JVMs are required to implement
96
      // ISO-8859-1.
97
      throw new IllegalStateException(
98
          "Java VM does not support a standard character set.", e);
99
    }
100
  }
101
 
102
  /**
103
   * Helper called by generated code to determine if a byte array is a valid
104
   * UTF-8 encoded string such that the original bytes can be converted to
105
   * a String object and then back to a byte array round tripping the bytes
106
   * without loss.
107
   * <p>
108
   * This is inspired by UTF_8.java in sun.nio.cs.
109
   *
110
   * @param byteString the string to check
111
   * @return whether the byte array is round trippable
112
   */
113
  public static boolean isValidUtf8(ByteString byteString) {
114
    int index = 0;
115
    int size = byteString.size();
116
    // To avoid the masking, we could change this to use bytes;
117
    // Then X > 0xC2 gets turned into X < -0xC2; X < 0x80
118
    // gets turned into X >= 0, etc.
119
 
120
    while (index < size) {
121
      int byte1 = byteString.byteAt(index++) & 0xFF;
122
      if (byte1 < 0x80) {
123
        // fast loop for single bytes
124
        continue;
125
 
126
        // we know from this point on that we have 2-4 byte forms
127
      } else if (byte1 < 0xC2 || byte1 > 0xF4) {
128
        // catch illegal first bytes: < C2 or > F4
129
        return false;
130
      }
131
      if (index >= size) {
132
        // fail if we run out of bytes
133
        return false;
134
      }
135
      int byte2 = byteString.byteAt(index++) & 0xFF;
136
      if (byte2 < 0x80 || byte2 > 0xBF) {
137
        // general trail-byte test
138
        return false;
139
      }
140
      if (byte1 <= 0xDF) {
141
        // two-byte form; general trail-byte test is sufficient
142
        continue;
143
      }
144
 
145
      // we know from this point on that we have 3 or 4 byte forms
146
      if (index >= size) {
147
        // fail if we run out of bytes
148
        return false;
149
      }
150
      int byte3 = byteString.byteAt(index++) & 0xFF;
151
      if (byte3 < 0x80 || byte3 > 0xBF) {
152
        // general trail-byte test
153
        return false;
154
      }
155
      if (byte1 <= 0xEF) {
156
        // three-byte form. Vastly more frequent than four-byte forms
157
        // The following has an extra test, but not worth restructuring
158
        if (byte1 == 0xE0 && byte2 < 0xA0 ||
159
            byte1 == 0xED && byte2 > 0x9F) {
160
          // check special cases of byte2
161
          return false;
162
        }
163
 
164
      } else {
165
        // four-byte form
166
 
167
        if (index >= size) {
168
          // fail if we run out of bytes
169
          return false;
170
        }
171
        int byte4 = byteString.byteAt(index++) & 0xFF;
172
        if (byte4 < 0x80 || byte4 > 0xBF) {
173
          // general trail-byte test
174
          return false;
175
        }
176
        // The following has an extra test, but not worth restructuring
177
        if (byte1 == 0xF0 && byte2 < 0x90 ||
178
            byte1 == 0xF4 && byte2 > 0x8F) {
179
          // check special cases of byte2
180
          return false;
181
        }
182
      }
183
    }
184
    return true;
185
  }
186
 
187
  /**
   * Interface for an enum value or value descriptor, to be used in FieldSet.
   * The lite library stores enum values directly in FieldSets but the full
   * library stores EnumValueDescriptors in order to better support reflection.
   */
  public interface EnumLite {
    /**
     * Returns the integer number of this enum value.
     *
     * @return the value's number
     */
    int getNumber();
  }

  /**
197
   * Interface for an object which maps integers to {@link EnumLite}s.
198
   * {@link Descriptors.EnumDescriptor} implements this interface by mapping
199
   * numbers to {@link Descriptors.EnumValueDescriptor}s.  Additionally,
200
   * every generated enum type has a static method internalGetValueMap() which
201
   * returns an implementation of this type that maps numbers to enum values.
202
   */
203
  public interface EnumLiteMap<T extends EnumLite> {
204
    T findValueByNumber(int number);
205
  }
206
}