diff options
author | Paul Phillips <paulp@improving.org> | 2009-09-05 11:32:10 +0000 |
---|---|---|
committer | Paul Phillips <paulp@improving.org> | 2009-09-05 11:32:10 +0000 |
commit | 4d209eab310ee089d23413af76b02590a24575cb (patch) | |
tree | 21d2644d5e51a1e25554f21e39c73351d8862f85 /src | |
parent | fd1eba71450b3d60fd02dba22f951a32485629d8 (diff) | |
download | scala-4d209eab310ee089d23413af76b02590a24575cb.tar.gz scala-4d209eab310ee089d23413af76b02590a24575cb.tar.bz2 scala-4d209eab310ee089d23413af76b02590a24575cb.zip |
Removed custom UTF8 encoding implementation, de...
Removed custom UTF8 encoding implementation, deprecated all its methods,
and ran everything through java's built-in decoder.
Diffstat (limited to 'src')
-rw-r--r-- | src/compiler/scala/tools/nsc/symtab/Names.scala | 48 | ||||
-rw-r--r-- | src/compiler/scala/tools/nsc/symtab/classfile/UnPickler.scala | 1 | ||||
-rw-r--r-- | src/library/scala/io/BytePickle.scala | 7 | ||||
-rw-r--r-- | src/library/scala/io/Codec.scala | 18 | ||||
-rw-r--r-- | src/library/scala/io/UTF8Codec.scala | 126 | ||||
-rw-r--r-- | src/library/scala/xml/Utility.scala | 3 |
6 files changed, 81 insertions, 122 deletions
diff --git a/src/compiler/scala/tools/nsc/symtab/Names.scala b/src/compiler/scala/tools/nsc/symtab/Names.scala index 3dcff5da0b..603c0f3673 100644 --- a/src/compiler/scala/tools/nsc/symtab/Names.scala +++ b/src/compiler/scala/tools/nsc/symtab/Names.scala @@ -8,7 +8,7 @@ package scala.tools.nsc package symtab import scala.util.NameTransformer -import scala.io.UTF8Codec +import scala.io.Codec import java.security.MessageDigest /** The class <code>Names</code> ... @@ -87,24 +87,19 @@ class Names { private lazy val md5 = MessageDigest.getInstance("MD5") - private def toMD5(s: String, prefixSuffixLen: Int) = { -// println("COMPACTIFY "+s) - val cs: Array[Char] = s.toCharArray - val bytes = new Array[Byte](cs.length * 4) - val len = UTF8Codec.encode(cs, 0, bytes, 0, cs.length) - md5.update(bytes, 0, len) - val hash = md5.digest() - val sb = new StringBuilder - sb.appendAll(cs, 0, prefixSuffixLen) - sb.append("$$$$") - for (i <- 0 until hash.length) { - val b = hash(i) - sb.append(((b >> 4) & 0xF).toHexString) - sb.append((b & 0xF).toHexString) - } - sb.append("$$$$") - sb.appendAll(cs, len - prefixSuffixLen, prefixSuffixLen) - sb.toString + /** "COMPACTIFY" */ + private def toMD5(s: String, edge: Int) = { + import collection.immutable.StringVector._ + val prefix = take(s, edge) + val suffix = takeRight(s, edge) + val marker = "$$$$" + + val cs = s.toArray + val bytes = Codec fromUTF8 cs + md5 update bytes + val md5chars = md5.digest() map (b => (b & 0xFF).toHexString) mkString + + prefix + marker + md5chars + marker + suffix } def compactify(s: String): String = @@ -142,11 +137,8 @@ class Names { * @param len ... 
* @return the created term name */ - def newTermName(bs: Array[Byte], offset: Int, len: Int): Name = { - val cs = new Array[Char](bs.length) - val nchrs = UTF8Codec.decode(bs, offset, cs, 0, len) - newTermName(cs, 0, nchrs) - } + def newTermName(bs: Array[Byte], offset: Int, len: Int): Name = + newTermName(Codec toUTF8 bs.slice(offset, offset + len) mkString) /** Create a type name from the characters in <code>cs[offset..offset+len-1]</code>. * @@ -173,7 +165,6 @@ class Names { def newTypeName(bs: Array[Byte], offset: Int, len: Int): Name = newTermName(bs, offset, len).toTypeName - def nameChars: Array[Char] = chrs implicit def view(s: String): Name = newTermName(s) @@ -228,8 +219,11 @@ class Names { * Array must have enough remaining space for all bytes * (i.e. maximally 3*length bytes). */ - final def copyUTF8(bs: Array[Byte], offset: Int): Int = - UTF8Codec.encode(chrs, index, bs, offset, len) + final def copyUTF8(bs: Array[Byte], offset: Int): Int = { + val bytes = Codec fromUTF8 chrs.slice(index, index + len) + compat.Platform.arraycopy(bytes, 0, bs, offset, bytes.length) + offset + bytes.length + } /** return the hash value of this name */ diff --git a/src/compiler/scala/tools/nsc/symtab/classfile/UnPickler.scala b/src/compiler/scala/tools/nsc/symtab/classfile/UnPickler.scala index 87bf879980..d5e69d20d8 100644 --- a/src/compiler/scala/tools/nsc/symtab/classfile/UnPickler.scala +++ b/src/compiler/scala/tools/nsc/symtab/classfile/UnPickler.scala @@ -12,7 +12,6 @@ import java.io.IOException import java.lang.{Float, Double} import scala.tools.nsc.util.{Position, NoPosition} -import scala.io.UTF8Codec import Flags._ import PickleFormat._ diff --git a/src/library/scala/io/BytePickle.scala b/src/library/scala/io/BytePickle.scala index 135d608eb9..b468ba4b52 100644 --- a/src/library/scala/io/BytePickle.scala +++ b/src/library/scala/io/BytePickle.scala @@ -271,8 +271,11 @@ object BytePickle { (s.stream(0), new UnPicklerState(s.stream.slice(1, s.stream.length), 
s.dict)); } - def string: SPU[String] = - share(wrap((a: Array[Byte]) => UTF8Codec.decode(a, 0, a.length), (s:String) => UTF8Codec.encode(s), bytearray)); + def string: SPU[String] = share(wrap( + (a: Array[Byte]) => Codec toUTF8 a mkString, + (s: String) => Codec fromUTF8 s, + bytearray + )) def bytearray: SPU[Array[Byte]] = { wrap((l:List[Byte]) => l.toArray, (_.toList), list(byte)) diff --git a/src/library/scala/io/Codec.scala b/src/library/scala/io/Codec.scala index a9483077a2..2e262fcb38 100644 --- a/src/library/scala/io/Codec.scala +++ b/src/library/scala/io/Codec.scala @@ -83,6 +83,24 @@ object Codec { new Codec(decoder.charset()) { override def decoder = _decoder } } + def toUTF8(bytes: Array[Byte]): Array[Char] = { + val bbuffer = java.nio.ByteBuffer wrap bytes + val cbuffer = UTF8 decode bbuffer + val chars = new Array[Char](cbuffer.remaining()) + cbuffer get chars + + chars + } + + def fromUTF8(cs: CharSequence): Array[Byte] = { + val cbuffer = java.nio.CharBuffer wrap cs + val bbuffer = UTF8 encode cbuffer + val bytes = new Array[Byte](bbuffer.remaining()) + bbuffer get bytes + + bytes + } + implicit def string2codec(s: String) = apply(s) implicit def charset2codec(c: Charset) = apply(c) implicit def decoder2codec(cd: CharsetDecoder) = apply(cd) diff --git a/src/library/scala/io/UTF8Codec.scala b/src/library/scala/io/UTF8Codec.scala index b0e0ee0077..21d3b3bb31 100644 --- a/src/library/scala/io/UTF8Codec.scala +++ b/src/library/scala/io/UTF8Codec.scala @@ -8,112 +8,58 @@ // $Id$ - package scala.io /** * @author Martin Odersky * @version 1.0, 04/10/2004 */ -object UTF8Codec { - +object UTF8Codec +{ final val UNI_REPLACEMENT_CHAR: Int = 0x0000FFFD + final val UNI_REPLACEMENT_BYTES = encode(UNI_REPLACEMENT_CHAR) - /** convert a codepoint to utf-8 bytes - * @author buraq - * @param ch codepoint - */ - def encode(ch1: Int): Array[Byte] = { - var ch = ch1 - val byteMask = 0xBF - val byteMark = 0x80 - var bytesToWrite = 0 - val firstByteMark = 
List[Byte](0x00.toByte, 0x00.toByte, 0xC0.toByte, 0xE0.toByte, 0xF0.toByte, 0xF8.toByte, 0xFC.toByte) - - if (ch < 0x80) { bytesToWrite = 1 } - else if (ch < 0x800) { bytesToWrite = 2 } - else if (ch < 0x10000) { bytesToWrite = 3 } - else if (ch <= 0x0010FFFF) { bytesToWrite = 4 } - else return encode(UNI_REPLACEMENT_CHAR) - - val res = new Array[Byte](bytesToWrite) + // Note, from http://unicode.org/faq/utf_bom.html#utf8-5 + // + // A different issue arises if an unpaired surrogate is encountered when converting + // ill-formed UTF-16 data. By representing such an unpaired surrogate on its own as a + // 3-byte sequence, the resulting UTF-8 data stream would become ill-formed. + // While it faithfully reflects the nature of the input, Unicode conformance + // requires that encoding form conversion always results in a valid data stream. + // Therefore a converter must treat this as an error. + // + // Some useful locations: + // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt - var bw = bytesToWrite - if (bw >= 4) { - res(3) = ((ch | byteMark) & byteMask).toByte; ch = ch >> 6; bw -= 1 - } - if (bw >= 3) { - res(2) = ((ch | byteMark) & byteMask).toByte; ch = ch >> 6; bw -= 1 - } - if (bw >= 2) { - res(1) = ((ch | byteMark) & byteMask).toByte; ch = ch >> 6; bw -= 1 - } - if (bw >= 1) { - res(0) = (ch | firstByteMark(bytesToWrite)).toByte + @deprecated("""Use new String(Array(ch), 0, 1).getBytes("UTF-8") instead""") + def encode(ch: Int): Array[Byte] = + if ((Character getType ch) == Character.SURROGATE) UNI_REPLACEMENT_BYTES + else try new String(Array(ch), 0, 1) getBytes "UTF-8" catch { + case _: IllegalArgumentException => UNI_REPLACEMENT_BYTES } - res - } + @deprecated("Use Codec.fromUTF8 instead") def encode(src: Array[Char], from: Int, dst: Array[Byte], to: Int, len: Int): Int = { - var i = from - var j = to - val end = from + len - while (i < end) { - val ch = src(i) - i += 1 - if (ch < 128) { - dst(j) = ch.toByte - j += 1 - } - else if (ch <= 0x3FF) { - 
dst(j) = (0xC0 | (ch >> 6)).toByte - dst(j+1) = (0x80 | (ch & 0x3F)).toByte - j += 2 - } else { - dst(j) = (0xE0 | (ch >> 12)).toByte - dst(j+1) = (0x80 | ((ch >> 6) & 0x3F)).toByte - dst(j+2) = (0x80 | (ch & 0x3F)).toByte - j += 3 - } - } - j + val bytes = Codec fromUTF8 src.slice(from, from + len) + Array.copy(bytes, 0, dst, to, bytes.length) + bytes.length } + @deprecated("Use Codec.fromUTF8 instead") def encode(s: String, dst: Array[Byte], to: Int): Int = - encode(s.toCharArray(), 0, dst, to, s.length()) - - def encode(s: String): Array[Byte] = { - val dst = new Array[Byte](s.length() * 3) - val len = encode(s, dst, 0) - dst.slice(0, len) - } + encode(s.toArray, 0, dst, to, s.length) - def decode(src: Array[Byte], from: Int, - dst: Array[Char], to: Int, len: Int): Int = - { - var i = from - var j = to - val end = from + len - while (i < end) { - var b = src(i) & 0xFF - i += 1 - if (b >= 0xE0) { - b = ((b & 0x0F) << 12) | (src(i) & 0x3F) << 6 - b = b | (src(i+1) & 0x3F) - i += 2 - } else if (b >= 0xC0) { - b = ((b & 0x1F) << 6) | (src(i) & 0x3F) - i += 1 - } - dst(j) = b.toChar - j += 1 - } - j - } + @deprecated("Use Codec.fromUTF8 instead") + def encode(s: String): Array[Byte] = Codec fromUTF8 s - def decode(src: Array[Byte], from: Int, len: Int): String = { - val cs = new Array[Char](len) - new String(cs, 0, decode(src, from, cs, 0, len)) + @deprecated("Use Codec.toUTF8 instead") + def decode(src: Array[Byte], from: Int, dst: Array[Char], to: Int, len: Int): Int = { + val chars = Codec toUTF8 src.slice(from, from + len) + Array.copy(chars, 0, dst, to, chars.length) + chars.length } -} + @deprecated("Use Codec.toUTF8 instead") + def decode(src: Array[Byte], from: Int, len: Int): String = + Codec toUTF8 src.slice(from, from + len) mkString +}
\ No newline at end of file diff --git a/src/library/scala/xml/Utility.scala b/src/library/scala/xml/Utility.scala index 8e4c42c255..68d68ee4aa 100644 --- a/src/library/scala/xml/Utility.scala +++ b/src/library/scala/xml/Utility.scala @@ -414,7 +414,6 @@ object Utility extends AnyRef with parsing.TokenTests } nextch() } - new String(io.UTF8Codec.encode(i), "utf8") + new String(Array(i), 0, 1) } - } |