Removed custom UTF8 encoding implementation, de...

Removed custom UTF8 encoding implementation, deprecated all its methods, and ran everything through java's built-in decoder.
author: Paul Phillips <paulp@improving.org> 2009-09-05 11:32:10 +0000
committer: Paul Phillips <paulp@improving.org> 2009-09-05 11:32:10 +0000
commit: 4d209eab310ee089d23413af76b02590a24575cb (patch)
tree: 21d2644d5e51a1e25554f21e39c73351d8862f85
parent: fd1eba71450b3d60fd02dba22f951a32485629d8 (diff)
download: scala-4d209eab310ee089d23413af76b02590a24575cb.tar.gz
scala-4d209eab310ee089d23413af76b02590a24575cb.tar.bz2
scala-4d209eab310ee089d23413af76b02590a24575cb.zip
6 files changed, 81 insertions, 122 deletions
diff --git a/src/compiler/scala/tools/nsc/symtab/Names.scala b/src/compiler/scala/tools/nsc/symtab/Names.scala
index 3dcff5da0b..603c0f3673 100644
--- a/src/compiler/scala/tools/nsc/symtab/Names.scala
+++ b/src/compiler/scala/tools/nsc/symtab/Names.scala
@@ -8,7 +8,7 @@ package scala.tools.nsc
 package symtab
 
 import scala.util.NameTransformer
-import scala.io.UTF8Codec
+import scala.io.Codec
 import java.security.MessageDigest
 
 /** The class <code>Names</code> ...
@@ -87,24 +87,19 @@ class Names {
 
   private lazy val md5 = MessageDigest.getInstance("MD5")
 
-  private def toMD5(s: String, prefixSuffixLen: Int) = {
-//  println("COMPACTIFY "+s)
-    val cs: Array[Char] = s.toCharArray
-    val bytes = new Array[Byte](cs.length * 4)
-    val len = UTF8Codec.encode(cs, 0, bytes, 0, cs.length)
-    md5.update(bytes, 0, len)
-    val hash = md5.digest()
-    val sb = new StringBuilder
-    sb.appendAll(cs, 0, prefixSuffixLen)
-    sb.append("$$$$")
-    for (i <- 0 until hash.length) {
-      val b = hash(i)
-      sb.append(((b >> 4) & 0xF).toHexString)
-      sb.append((b & 0xF).toHexString)
-    }
-    sb.append("$$$$")
-    sb.appendAll(cs, len - prefixSuffixLen, prefixSuffixLen)
-    sb.toString
+  /** "COMPACTIFY": take/takeRight `edge` chars of `s` around the hex MD5 of its UTF-8 bytes. */
+  private def toMD5(s: String, edge: Int) = {
+    import collection.immutable.StringVector._
+    val prefix = take(s, edge)
+    val suffix = takeRight(s, edge)
+    val marker = "$$$$"
+
+    val cs = s.toArray
+    val bytes = Codec fromUTF8 cs
+    md5 update bytes
+    // Two hex digits per byte: a bare toHexString drops the leading zero of bytes below 0x10.
+    val md5chars = md5.digest().map(b => ((b >> 4) & 0xF).toHexString + (b & 0xF).toHexString).mkString
+    prefix + marker + md5chars + marker + suffix
   }
 
   def compactify(s: String): String =
@@ -142,11 +137,8 @@ class Names {
    *  @param len    ...
    *  @return       the created term name
    */
-  def newTermName(bs: Array[Byte], offset: Int, len: Int): Name = {
-    val cs = new Array[Char](bs.length)
-    val nchrs = UTF8Codec.decode(bs, offset, cs, 0, len)
-    newTermName(cs, 0, nchrs)
-  }
+  def newTermName(bs: Array[Byte], offset: Int, len: Int): Name =
+    newTermName(new String(Codec toUTF8 bs.slice(offset, offset + len)))  // decode only the requested window
 
   /** Create a type name from the characters in <code>cs[offset..offset+len-1]</code>.
    *
@@ -173,7 +165,6 @@ class Names {
   def newTypeName(bs: Array[Byte], offset: Int, len: Int): Name =
     newTermName(bs, offset, len).toTypeName
 
-
   def nameChars: Array[Char] = chrs
 
   implicit def view(s: String): Name = newTermName(s)
@@ -228,8 +219,11 @@ class Names {
      *  Array must have enough remaining space for all bytes
      *  (i.e. maximally 3*length bytes).
      */
-    final def copyUTF8(bs: Array[Byte], offset: Int): Int =
-      UTF8Codec.encode(chrs, index, bs, offset, len)
+    final def copyUTF8(bs: Array[Byte], offset: Int): Int = {
+      val encoded = Codec fromUTF8 chrs.slice(index, index + len)  // UTF-8 bytes of this name's chars
+      compat.Platform.arraycopy(encoded, 0, bs, offset, encoded.length)
+      offset + encoded.length  // index in bs just past the last byte written
+    }
 
     /** return the hash value of this name
      */
diff --git a/src/compiler/scala/tools/nsc/symtab/classfile/UnPickler.scala b/src/compiler/scala/tools/nsc/symtab/classfile/UnPickler.scala
index 87bf879980..d5e69d20d8 100644
--- a/src/compiler/scala/tools/nsc/symtab/classfile/UnPickler.scala
+++ b/src/compiler/scala/tools/nsc/symtab/classfile/UnPickler.scala
@@ -12,7 +12,6 @@ import java.io.IOException
 import java.lang.{Float, Double}
 
 import scala.tools.nsc.util.{Position, NoPosition}
-import scala.io.UTF8Codec
 
 import Flags._
 import PickleFormat._
diff --git a/src/library/scala/io/BytePickle.scala b/src/library/scala/io/BytePickle.scala
index 135d608eb9..b468ba4b52 100644
--- a/src/library/scala/io/BytePickle.scala
+++ b/src/library/scala/io/BytePickle.scala
@@ -271,8 +271,11 @@ object BytePickle {
       (s.stream(0), new UnPicklerState(s.stream.slice(1, s.stream.length), s.dict));
   }
 
-  def string: SPU[String] =
-    share(wrap((a: Array[Byte]) => UTF8Codec.decode(a, 0, a.length), (s:String) => UTF8Codec.encode(s), bytearray));
+  def string: SPU[String] = share(wrap(
+    (bytes: Array[Byte]) => new String(Codec toUTF8 bytes),  // unpickle: UTF-8 bytes to String
+    (str: String) => Codec fromUTF8 str,                     // pickle: String to UTF-8 bytes
+    bytearray
+  ))
 
   def bytearray: SPU[Array[Byte]] = {
     wrap((l:List[Byte]) => l.toArray, (_.toList), list(byte))
diff --git a/src/library/scala/io/Codec.scala b/src/library/scala/io/Codec.scala
index a9483077a2..2e262fcb38 100644
--- a/src/library/scala/io/Codec.scala
+++ b/src/library/scala/io/Codec.scala
@@ -83,6 +83,24 @@ object Codec {
     new Codec(decoder.charset()) { override def decoder = _decoder }
   }
 
+  def toUTF8(bytes: Array[Byte]): Array[Char] = {
+    // NB: despite the name, this decodes UTF-8 bytes into chars.
+    val decoded = UTF8 decode (java.nio.ByteBuffer wrap bytes)
+    val out = new Array[Char](decoded.remaining())
+    decoded get out
+
+    out
+  }
+
+  def fromUTF8(cs: CharSequence): Array[Byte] = {
+    // NB: despite the name, this encodes chars into UTF-8 bytes.
+    val encoded = UTF8 encode (java.nio.CharBuffer wrap cs)
+    val out = new Array[Byte](encoded.remaining())
+    encoded get out
+
+    out
+  }
+
   implicit def string2codec(s: String) = apply(s)
   implicit def charset2codec(c: Charset) = apply(c)
   implicit def decoder2codec(cd: CharsetDecoder) = apply(cd)
diff --git a/src/library/scala/io/UTF8Codec.scala b/src/library/scala/io/UTF8Codec.scala
index b0e0ee0077..21d3b3bb31 100644
--- a/src/library/scala/io/UTF8Codec.scala
+++ b/src/library/scala/io/UTF8Codec.scala
@@ -8,112 +8,58 @@
 
 // $Id$
 
-
 package scala.io
 
 /**
  *  @author  Martin Odersky
  *  @version 1.0, 04/10/2004
  */
-object UTF8Codec {
-
+object UTF8Codec
+{
   final val UNI_REPLACEMENT_CHAR: Int = 0x0000FFFD
+  final val UNI_REPLACEMENT_BYTES = encode(UNI_REPLACEMENT_CHAR)
 
-  /** convert a codepoint to utf-8 bytes
-   * @author buraq
-   * @param ch codepoint
-   */
-  def encode(ch1: Int): Array[Byte] = {
-    var ch = ch1
-    val byteMask = 0xBF
-    val byteMark = 0x80
-    var bytesToWrite = 0
-    val firstByteMark = List[Byte](0x00.toByte, 0x00.toByte, 0xC0.toByte, 0xE0.toByte, 0xF0.toByte, 0xF8.toByte, 0xFC.toByte)
-
-    if      (ch < 0x80)        { bytesToWrite = 1 }
-    else if (ch < 0x800)       { bytesToWrite = 2 }
-    else if (ch < 0x10000)     { bytesToWrite = 3 }
-    else if (ch <= 0x0010FFFF) { bytesToWrite = 4 }
-    else return encode(UNI_REPLACEMENT_CHAR)
-
-    val res = new Array[Byte](bytesToWrite)
+  // Note, from http://unicode.org/faq/utf_bom.html#utf8-5
+  //
+  // A different issue arises if an unpaired surrogate is encountered when converting
+  // ill-formed UTF-16 data. By representing such an unpaired surrogate on its own as a
+  // 3-byte sequence, the resulting UTF-8 data stream would become ill-formed.
+  // While it faithfully reflects the nature of the input, Unicode conformance
+  // requires that encoding form conversion always results in valid data stream.
+  // Therefore a converter must treat this as an error.
+  //
+  // Some useful locations:
+  //    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
 
-    var bw = bytesToWrite
-    if (bw >= 4) {
-      res(3) = ((ch | byteMark) & byteMask).toByte; ch = ch >> 6; bw -= 1
-    }
-    if (bw >= 3) {
-      res(2) = ((ch | byteMark) & byteMask).toByte; ch = ch >> 6; bw -= 1
-    }
-    if (bw >= 2) {
-      res(1) = ((ch | byteMark) & byteMask).toByte; ch = ch >> 6; bw -= 1
-    }
-    if (bw >= 1) {
-      res(0) = (ch | firstByteMark(bytesToWrite)).toByte
+  @deprecated("""Use new String(Array(ch), 0, 1).getBytes("UTF-8") instead""")
+  def encode(ch: Int): Array[Byte] =  // UTF-8 bytes of one code point, or of U+FFFD when it cannot be encoded
+    if ((Character getType ch) == Character.SURROGATE) UNI_REPLACEMENT_BYTES  // surrogates get the replacement bytes
+    else try new String(Array(ch), 0, 1) getBytes "UTF-8" catch {
+      case _: IllegalArgumentException  => UNI_REPLACEMENT_BYTES  // presumably String rejecting an invalid code point
     }
-    res
-  }
 
+  @deprecated("Use Codec.fromUTF8 instead")
   def encode(src: Array[Char], from: Int, dst: Array[Byte], to: Int, len: Int): Int = {
-    var i = from
-    var j = to
-    val end = from + len
-    while (i < end) {
-      val ch = src(i)
-      i += 1
-      if (ch < 128) {
-        dst(j) = ch.toByte
-        j += 1
-      }
-      else if (ch <= 0x3FF) {
-        dst(j)   = (0xC0 | (ch >> 6)).toByte
-        dst(j+1) = (0x80 | (ch & 0x3F)).toByte
-        j += 2
-      } else {
-        dst(j)   = (0xE0 | (ch >> 12)).toByte
-        dst(j+1) = (0x80 | ((ch >> 6) & 0x3F)).toByte
-        dst(j+2) = (0x80 | (ch & 0x3F)).toByte
-        j += 3
-      }
-    }
-    j
+    val bytes = Codec fromUTF8 src.slice(from, from + len)  // UTF-8 bytes of src[from, from + len)
+    Array.copy(bytes, 0, dst, to, bytes.length)
+    to + bytes.length  // end index in dst, as the removed implementation returned (its `j` started at `to`)
   }
 
+  @deprecated("Use Codec.fromUTF8 instead")
   def encode(s: String, dst: Array[Byte], to: Int): Int =
-    encode(s.toCharArray(), 0, dst, to, s.length())
-
-  def encode(s: String): Array[Byte] = {
-    val dst = new Array[Byte](s.length() * 3)
-    val len = encode(s, dst, 0)
-    dst.slice(0, len)
-  }
+    encode(s.toArray, 0, dst, to, s.length)  // encodes all of s into dst starting at index to, via the array overload
 
-  def decode(src: Array[Byte], from: Int,
-             dst: Array[Char], to: Int, len: Int): Int =
-  {
-    var i = from
-    var j = to
-    val end = from + len
-    while (i < end) {
-      var b = src(i) & 0xFF
-      i += 1
-      if (b >= 0xE0) {
-        b = ((b & 0x0F) << 12) | (src(i) & 0x3F) << 6
-        b = b | (src(i+1) & 0x3F)
-        i += 2
-      } else if (b >= 0xC0) {
-        b = ((b & 0x1F) << 6) | (src(i) & 0x3F)
-        i += 1
-      }
-      dst(j) = b.toChar
-      j += 1
-    }
-    j
-  }
+  @deprecated("Use Codec.fromUTF8 instead")
+  def encode(s: String): Array[Byte] = Codec.fromUTF8(s)  // a String is already a CharSequence
 
-  def decode(src: Array[Byte], from: Int, len: Int): String = {
-    val cs = new Array[Char](len)
-    new String(cs, 0, decode(src, from, cs, 0, len))
+  @deprecated("Use Codec.toUTF8 instead")
+  def decode(src: Array[Byte], from: Int, dst: Array[Char], to: Int, len: Int): Int = {
+    val chars = Codec toUTF8 src.slice(from, from + len)  // chars decoded from src[from, from + len)
+    Array.copy(chars, 0, dst, to, chars.length)
+    to + chars.length  // end index in dst, as the removed implementation returned (its `j` started at `to`)
   }
 
-}
+  @deprecated("Use Codec.toUTF8 instead")
+  def decode(src: Array[Byte], from: Int, len: Int): String =
+    new String(Codec toUTF8 src.slice(from, from + len))  // decode just src[from, from + len)
+}
+\ No newline at end of file
diff --git a/src/library/scala/xml/Utility.scala b/src/library/scala/xml/Utility.scala
index 8e4c42c255..68d68ee4aa 100644
--- a/src/library/scala/xml/Utility.scala
+++ b/src/library/scala/xml/Utility.scala
@@ -414,7 +414,6 @@ object Utility extends AnyRef with parsing.TokenTests
       }
       nextch()
     }
-    new String(io.UTF8Codec.encode(i), "utf8")
+    new String(Array(i), 0, 1)
   }
-
 }
author	Paul Phillips <paulp@improving.org>	2009-09-05 11:32:10 +0000
committer	Paul Phillips <paulp@improving.org>	2009-09-05 11:32:10 +0000
commit	4d209eab310ee089d23413af76b02590a24575cb (patch)
tree	21d2644d5e51a1e25554f21e39c73351d8862f85
parent	fd1eba71450b3d60fd02dba22f951a32485629d8 (diff)
download	scala-4d209eab310ee089d23413af76b02590a24575cb.tar.gz scala-4d209eab310ee089d23413af76b02590a24575cb.tar.bz2 scala-4d209eab310ee089d23413af76b02590a24575cb.zip