src/library/jvm/scala/xml/include/sax/EncodingHeuristics.scala


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177

/*                     __                                               *\
**     ________ ___   / /  ___     Scala API                            **
**    / __/ __// _ | / /  / _ |    (c) 2002-2007, LAMP/EPFL             **
**  __\ \/ /__/ __ |/ /__/ __ |    http://scala-lang.org/               **
** /____/\___/_/ |_/____/_/ | |                                         **
**                          |/                                          **
\*                                                                      */

// $Id$

package scala.xml.include.sax

import java.io.{IOException, InputStreamReader, InputStream}

/**
 * <p>
 * <code>EncodingHeuristics</code> reads from a stream
 * (which should be buffered) and attempts to guess
 * what the encoding of the text in the stream is.
 * Byte order marks are stripped from the stream.
 * If it fails to determine the type of the encoding,
 * it returns the default UTF-8.
 * </p>
 * <p>
 *   Translated from Elliotte Rusty Harold's Java source
 * </p>
 *
 * @author Burak Emir
 */
object EncodingHeuristics {

  /** Encoding name reported whenever nothing more specific can be determined. */
  private final val DefaultEncoding = "UTF-8"

  /**
   * Maximum number of leading bytes that are inspected. It is also the
   * read-ahead limit passed to `mark`, so no more than this many bytes may
   * be consumed before `reset` is called.
   */
  private final val LookAheadLimit = 1024

  /**
   * Guesses the character encoding of the text in `in` by inspecting its
   * first bytes.
   *
   * The checks are applied in this order and the first match wins:
   *
   *  1. A UTF-16 byte order mark (`FE FF` or `FF FE`), a UTF-8 byte order
   *     mark (`EF BB BF`) or a four-byte UCS-4 mark (`00 00 FE FF` or
   *     `00 00 FF FE`). The mark is consumed and the stream is left
   *     positioned on the first byte after it.
   *  1. A leading `<` or `<?` laid out as UCS-4 or as UTF-16 without a byte
   *     order mark. The stream is reset to where it started.
   *  1. An ASCII-compatible `<?xm` prefix. Up to 1024 bytes are decoded as
   *     Latin-1 and the value of the `encoding` pseudo-attribute of the
   *     declaration is returned. The stream is reset to where it started.
   *  1. Anything else, including an empty stream, a declaration without a
   *     well-formed `encoding` pseudo-attribute, an EBCDIC declaration
   *     (not supported) and any exception raised while sniffing, yields
   *     `"UTF-8"` with the stream reset to where it started.
   *
   * @param in   <code>InputStream</code> to read from; it is marked and
   *             reset by this method.
   * @return String  The name of the encoding, `"UTF-8"` by default.
   * @throws IOException if the stream cannot be reset back to where it was when
   *                     the method was invoked.
   */
  def readEncodingFromStream(in: InputStream): String = {
    // This may fail if there are a lot of space characters before the end
    // of the encoding declaration.
    in.mark(LookAheadLimit)
    try {
      val byte1 = in.read()
      val byte2 = in.read()
      // Each byte order mark is answered as soon as it is recognised, so
      // that nothing beyond the mark itself is consumed from the stream.
      if (byte1 == 0xFE && byte2 == 0xFF) "UnicodeBig"
      else if (byte1 == 0xFF && byte2 == 0xFE) "UnicodeLittle"
      else {
        val byte3 = in.read()
        if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) DefaultEncoding
        else guessFromFirstFourBytes(in, byte1, byte2, byte3, in.read())
      }
    } catch {
      // Lots of things can go wrong while sniffing. If any do, fall back on
      // the default with the stream rewound.
      case _: Exception =>
        in.reset()
        DefaultEncoding
    }
  }

  /**
   * Decides on an encoding once four bytes have been read and no two- or
   * three-byte byte order mark was found. Except for the four-byte UCS-4
   * marks, `in` is reset before returning.
   */
  private def guessFromFirstFourBytes(in: InputStream, b1: Int, b2: Int, b3: Int, b4: Int): String = {
    val startsWithTwoZeros = b1 == 0x00 && b2 == 0x00
    if (startsWithTwoZeros && ((b3 == 0xFE && b4 == 0xFF) || (b3 == 0xFF && b4 == 0xFE))) {
      // Four-byte mark: do not reset, the mark must not be part of the text.
      "UCS-4"
    } else {
      // No byte order mark present; the first character must be a
      // less-than sign or white space. Only less-than signs are looked for.
      val guess =
        if (startsWithTwoZeros && b3 == 0x00 && b4 == '<') "UCS-4"
        else if (b1 == '<' && b2 == 0x00 && b3 == 0x00 && b4 == 0x00) "UCS-4"
        else if (b1 == 0x00 && b2 == '<' && b3 == 0x00 && b4 == '?') "UnicodeBigUnmarked"
        else if (b1 == '<' && b2 == 0x00 && b3 == '?' && b4 == 0x00) "UnicodeLittleUnmarked"
        else if (b1 == '<' && b2 == '?' && b3 == 'x' && b4 == 'm') {
          // ASCII compatible, must read the encoding declaration.
          declaredEncoding(readDeclarationText(in, b1, b2, b3, b4)).getOrElse(DefaultEncoding)
        }
        // An EBCDIC "<?xm" (4C 6F A7 94) would also need its declaration
        // read; that is not implemented, so it gets the default as well.
        else DefaultEncoding
      in.reset()
      guess
    }
  }

  /**
   * Returns the first bytes of the stream (the four already consumed plus
   * whatever a single `read` delivers, at most 1024 in total) decoded as
   * Latin-1.
   *
   * Latin-1 is used because it is ASCII compatible and every byte sequence
   * is a legal Latin-1 sequence, so slipping past the end of the
   * declaration cannot cause a decoding error.
   */
  private def readDeclarationText(in: InputStream, b1: Int, b2: Int, b3: Int, b4: Int): String = {
    val data = new Array[Byte](LookAheadLimit)
    data(0) = b1.toByte
    data(1) = b2.toByte
    data(2) = b3.toByte
    data(3) = b4.toByte
    // read returns -1 at end of stream; that must not shorten the prefix.
    val count = in.read(data, 4, data.length - 4)
    new String(data, 0, 4 + math.max(count, 0), "ISO-8859-1")
  }

  /**
   * Extracts the value of the `encoding` pseudo-attribute from the text of
   * an XML or text declaration.
   *
   * Only the text before the first `?>` is searched (the whole text when
   * there is no `?>`), so attributes of later elements are never taken for
   * the declaration. The result is `None` when the keyword is missing, is
   * not followed by `=`, the value is not quoted, or the closing quote is
   * missing.
   */
  private def declaredEncoding(text: String): Option[String] = {
    val keyword = "encoding"
    val end = text.indexOf("?>")
    val declaration = if (end >= 0) text.substring(0, end) else text

    def isSpace(c: Char): Boolean = c == ' ' || c == '\t' || c == '\r' || c == '\n'

    // Index of the first non-space character at or after `from`.
    def skipSpace(from: Int): Int = {
      var i = from
      while (i < declaration.length && isSpace(declaration.charAt(i))) i += 1
      i
    }

    val start = declaration.indexOf(keyword)
    if (start < 0) None
    else {
      val equalsAt = skipSpace(start + keyword.length)
      if (equalsAt >= declaration.length || declaration.charAt(equalsAt) != '=') None // malformed
      else {
        val quoteAt = skipSpace(equalsAt + 1)
        if (quoteAt >= declaration.length) None
        else {
          val delimiter = declaration.charAt(quoteAt)
          if (delimiter != '\'' && delimiter != '"') None // malformed
          else {
            val closeAt = declaration.indexOf(delimiter, quoteAt + 1)
            if (closeAt < 0) None else Some(declaration.substring(quoteAt + 1, closeAt))
          }
        }
      }
    }
  }
}