author	Mridul Muralidharan <mridulm80@apache.org>	2014-04-24 20:48:33 -0700
committer	Matei Zaharia <matei@databricks.com>	2014-04-24 20:48:33 -0700
commit	968c0187a12f5ae4a696c02c1ff088e998ed7edd (patch)
tree	a08997fe5f5debfaae4b55770cee37d9d53c739c /sql
parent	d5c6ae6cc3305b9aa3185486b5b6ba0a6e5aca90 (diff)
SPARK-1586 Windows build fixes
Unfortunately, this is not exhaustive - particularly hive tests still fail due to path issues.

Author: Mridul Muralidharan <mridulm80@apache.org>

This patch had conflicts when merged, resolved by
Committer: Matei Zaharia <matei@databricks.com>

Closes #505 from mridulm/windows_fixes and squashes the following commits:

ef12283 [Mridul Muralidharan] Move to org.apache.commons.lang3 for StringEscapeUtils. Earlier version was apparently buggy
cdae406 [Mridul Muralidharan] Remove leaked changes from > 2G fix branch
3267f4b [Mridul Muralidharan] Fix build failures
35b277a [Mridul Muralidharan] Fix Scalastyle failures
bc69d14 [Mridul Muralidharan] Change from hardcoded path separator
10c4d78 [Mridul Muralidharan] Use explicit encoding while using getBytes
1337abd [Mridul Muralidharan] fix classpath while running in windows
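The recurring theme in the encoding commits is avoiding the platform default: String.getBytes() and new String(bytes) use the JVM's default charset (often windows-1252 on Windows), so byte lengths and round-trips can differ across machines. A minimal sketch of the pitfall (hypothetical demo, not part of this patch; it uses StandardCharsets.UTF_8, equivalent to the "utf-8" string the patch passes):

    import java.nio.charset.StandardCharsets

    object CharsetDemo {
      def main(args: Array[String]): Unit = {
        val s = "héllo"
        // Length under the JVM default charset varies by platform
        // (5 bytes on windows-1252, 6 on UTF-8).
        println(s.getBytes.length)
        // Length under an explicit charset is stable everywhere.
        println(s.getBytes(StandardCharsets.UTF_8).length) // always 6
      }
    }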
Diffstat (limited to 'sql')
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala | 6
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala | 44
-rw-r--r-- sql/hive/src/main/scala/org/apache/spark/sql/hive/ScriptTransformation.scala | 2
-rw-r--r-- sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala | 9
-rw-r--r-- sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/BigDataBenchmarkSuite.scala | 2
-rw-r--r-- sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala | 5
-rw-r--r-- sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala | 6
7 files changed, 55 insertions, 19 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
index 5be76890af..4cd52d8288 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
@@ -200,10 +200,10 @@ private[sql] object SHORT extends NativeColumnType(ShortType, 6, 2) {
}
private[sql] object STRING extends NativeColumnType(StringType, 7, 8) {
- override def actualSize(v: String): Int = v.getBytes.length + 4
+ override def actualSize(v: String): Int = v.getBytes("utf-8").length + 4
override def append(v: String, buffer: ByteBuffer) {
- val stringBytes = v.getBytes()
+ val stringBytes = v.getBytes("utf-8")
buffer.putInt(stringBytes.length).put(stringBytes, 0, stringBytes.length)
}
@@ -211,7 +211,7 @@ private[sql] object STRING extends NativeColumnType(StringType, 7, 8) {
val length = buffer.getInt()
val stringBytes = new Array[Byte](length)
buffer.get(stringBytes, 0, length)
- new String(stringBytes)
+ new String(stringBytes, "utf-8")
}
override def setField(row: MutableRow, ordinal: Int, value: String) {
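With the explicit charset, append and extract in ColumnType become symmetric regardless of the platform default. A hedged round-trip sketch of the same idea (standalone, using a plain ByteBuffer rather than the ColumnType API; names are hypothetical):

    import java.nio.ByteBuffer
    import java.nio.charset.StandardCharsets

    object StringRoundTrip {
      def main(args: Array[String]): Unit = {
        val v = "héllo"
        val bytes = v.getBytes(StandardCharsets.UTF_8)
        val buffer = ByteBuffer.allocate(4 + bytes.length)
        // Write a length prefix followed by the encoded bytes, as append does.
        buffer.putInt(bytes.length).put(bytes)
        buffer.flip()
        // Read it back, as extract does.
        val length = buffer.getInt()
        val out = new Array[Byte](length)
        buffer.get(out)
        // Decoding with the same explicit charset restores the original string.
        assert(new String(out, StandardCharsets.UTF_8) == v)
      }
    }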
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
index 1d3608ed2d..325173cf95 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
@@ -58,7 +58,7 @@ class ColumnTypeSuite extends FunSuite {
checkActualSize(DOUBLE, Double.MaxValue, 8)
checkActualSize(FLOAT, Float.MaxValue, 4)
checkActualSize(BOOLEAN, true, 1)
- checkActualSize(STRING, "hello", 4 + 5)
+ checkActualSize(STRING, "hello", 4 + "hello".getBytes("utf-8").length)
val binary = Array.fill[Byte](4)(0: Byte)
checkActualSize(BINARY, binary, 4 + 4)
@@ -91,14 +91,16 @@ class ColumnTypeSuite extends FunSuite {
testNativeColumnType[StringType.type](
STRING,
(buffer: ByteBuffer, string: String) => {
- val bytes = string.getBytes()
- buffer.putInt(bytes.length).put(string.getBytes)
+
+ val bytes = string.getBytes("utf-8")
+ buffer.putInt(bytes.length)
+ buffer.put(bytes)
},
(buffer: ByteBuffer) => {
val length = buffer.getInt()
val bytes = new Array[Byte](length)
- buffer.get(bytes, 0, length)
- new String(bytes)
+ buffer.get(bytes)
+ new String(bytes, "utf-8")
})
testColumnType[BinaryType.type, Array[Byte]](
@@ -161,9 +163,13 @@ class ColumnTypeSuite extends FunSuite {
buffer.rewind()
seq.foreach { expected =>
+ println("buffer = " + buffer + ", expected = " + expected)
+ val extracted = columnType.extract(buffer)
assert(
- expected === columnType.extract(buffer),
- "Extracted value didn't equal to the original one")
+ expected === extracted,
+ "Extracted value didn't equal to the original one. " +
+ hexDump(expected) + " != " + hexDump(extracted) +
+ ", buffer = " + dumpBuffer(buffer.duplicate().rewind().asInstanceOf[ByteBuffer]))
}
}
@@ -179,4 +185,28 @@ class ColumnTypeSuite extends FunSuite {
}
}
}
+
+ private def hexDump(value: Any): String = {
+ if (value.isInstanceOf[String]) {
+ val sb = new StringBuilder()
+ for (ch <- value.asInstanceOf[String].toCharArray) {
+ sb.append(Integer.toHexString(ch & 0xffff)).append(' ')
+ }
+ if (! sb.isEmpty) sb.setLength(sb.length - 1)
+ sb.toString()
+ } else {
+ // for now ..
+ hexDump(value.toString)
+ }
+ }
+
+ private def dumpBuffer(buff: ByteBuffer): Any = {
+ val sb = new StringBuilder()
+ while (buff.hasRemaining) {
+ val b = buff.get()
+ sb.append(Integer.toHexString(b & 0xff)).append(' ')
+ }
+ if (! sb.isEmpty) sb.setLength(sb.length - 1)
+ sb.toString()
+ }
}
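For reference, the suite's new hexDump helper emits one hex group per UTF-16 code unit, which makes charset mismatches visible in assertion failures. A standalone copy of that logic (hypothetical demo object, mirroring the test code above):

    object HexDumpDemo {
      // One hex group per UTF-16 code unit, space separated.
      private def hexDump(s: String): String =
        s.toCharArray.map(ch => Integer.toHexString(ch & 0xffff)).mkString(" ")

      def main(args: Array[String]): Unit = {
        println(hexDump("hé")) // prints: 68 e9
      }
    }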
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/ScriptTransformation.scala
index 610fa9cb84..8258ee5fef 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/ScriptTransformation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/ScriptTransformation.scala
@@ -71,7 +71,7 @@ case class ScriptTransformation(
iter
.map(outputProjection)
// TODO: Use SerDe
- .map(_.mkString("", "\t", "\n").getBytes).foreach(outputStream.write)
+ .map(_.mkString("", "\t", "\n").getBytes("utf-8")).foreach(outputStream.write)
outputStream.close()
readerThread.join()
outputLines.toIterator
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
index 74110ee27b..3ad66a3d7f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
@@ -100,14 +100,15 @@ class TestHiveContext(sc: SparkContext) extends LocalHiveContext(sc) {
hiveFilesTemp.delete()
hiveFilesTemp.mkdir()
- val inRepoTests = if (System.getProperty("user.dir").endsWith("sql/hive")) {
- new File("src/test/resources/")
+ val inRepoTests = if (System.getProperty("user.dir").endsWith("sql" + File.separator + "hive")) {
+ new File("src" + File.separator + "test" + File.separator + "resources" + File.separator)
} else {
- new File("sql/hive/src/test/resources")
+ new File("sql" + File.separator + "hive" + File.separator + "src" + File.separator + "test" +
+ File.separator + "resources")
}
def getHiveFile(path: String): File = {
- val stripped = path.replaceAll("""\.\.\/""", "")
+ val stripped = path.replaceAll("""\.\.\/""", "").replace('/', File.separatorChar)
hiveDevHome
.map(new File(_, stripped))
.filter(_.exists)
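Note that java.io.File itself accepts forward slashes on Windows; the separator matters in TestHive because the code compares and rewrites path strings, and user.dir comes back with native separators. A small sketch of the distinction (hypothetical demo, not from this patch):

    import java.io.File

    object SeparatorDemo {
      def main(args: Array[String]): Unit = {
        // File normalizes '/' on Windows, so constructing paths this way still works:
        val f = new File("sql/hive/src/test/resources")
        // ...but string tests against user.dir must use the native separator:
        val userDir = System.getProperty("user.dir")
        val inHive = userDir.endsWith("sql" + File.separator + "hive")
        println(s"path=${f.getPath} inHive=$inHive")
      }
    }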
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/BigDataBenchmarkSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/BigDataBenchmarkSuite.scala
index 9b9a823b6e..42a82c1fbf 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/BigDataBenchmarkSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/BigDataBenchmarkSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.hive.test.TestHive._
* https://amplab.cs.berkeley.edu/benchmark/
*/
class BigDataBenchmarkSuite extends HiveComparisonTest {
- val testDataDirectory = new File("target/big-data-benchmark-testdata")
+ val testDataDirectory = new File("target" + File.separator + "big-data-benchmark-testdata")
val testTables = Seq(
TestTable(
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index ea17e6e93b..edff38b901 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -78,7 +78,8 @@ abstract class HiveComparisonTest
.map(name => new File(targetDir, s"$suiteName.$name"))
/** The local directory with cached golden answer will be stored. */
- protected val answerCache = new File("src/test/resources/golden")
+ protected val answerCache = new File("src" + File.separator + "test" +
+ File.separator + "resources" + File.separator + "golden")
if (!answerCache.exists) {
answerCache.mkdir()
}
@@ -120,7 +121,7 @@ abstract class HiveComparisonTest
protected val cacheDigest = java.security.MessageDigest.getInstance("MD5")
protected def getMd5(str: String): String = {
val digest = java.security.MessageDigest.getInstance("MD5")
- digest.update(str.getBytes)
+ digest.update(str.getBytes("utf-8"))
new java.math.BigInteger(1, digest.digest).toString(16)
}
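The digest fix follows the same pattern: the hashed bytes must come from a fixed charset, or the golden-answer cache key would change across platforms. A standalone version of getMd5 (hedged sketch; swaps the "utf-8" string for the equivalent StandardCharsets.UTF_8):

    import java.nio.charset.StandardCharsets
    import java.security.MessageDigest

    object Md5Demo {
      def getMd5(str: String): String = {
        val digest = MessageDigest.getInstance("MD5")
        // Explicit charset keeps the digest identical on every platform.
        digest.update(str.getBytes(StandardCharsets.UTF_8))
        new java.math.BigInteger(1, digest.digest).toString(16)
      }

      def main(args: Array[String]): Unit = {
        println(getMd5("hello")) // 5d41402abc4b2a76b9719d911017c592
      }
    }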
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index dfe88b960b..0bb76f31c3 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -17,6 +17,8 @@
package org.apache.spark.sql.hive.execution
+import java.io.File
+
import org.scalatest.BeforeAndAfter
import org.apache.spark.sql.hive.test.TestHive
@@ -26,7 +28,9 @@ import org.apache.spark.sql.hive.test.TestHive
*/
class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
// TODO: bundle in jar files... get from classpath
- lazy val hiveQueryDir = TestHive.getHiveFile("ql/src/test/queries/clientpositive")
+ lazy val hiveQueryDir = TestHive.getHiveFile("ql" + File.separator + "src" +
+ File.separator + "test" + File.separator + "queries" + File.separator + "clientpositive")
+
def testCases = hiveQueryDir.listFiles.map(f => f.getName.stripSuffix(".q") -> f)
override def beforeAll() {