From 453dae56716bc254bf5022fddc9b8327c9b1a49f Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 18 Jan 2016 21:42:07 -0800 Subject: [SPARK-12668][SQL] Providing aliases for CSV options to be similar to Pandas and R MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://issues.apache.org/jira/browse/SPARK-12668 The Spark CSV datasource is being merged (filed in [SPARK-12420](https://issues.apache.org/jira/browse/SPARK-12420)). This is a quick PR that simply renames several CSV options to be similar to Pandas and R. - Alias for delimiter -> sep - charset -> encoding Author: hyukjinkwon Closes #10800 from HyukjinKwon/SPARK-12668. --- .../sql/execution/datasources/csv/CSVParameters.scala | 8 +++++--- .../spark/sql/execution/datasources/csv/CSVRelation.scala | 3 ++- .../spark/sql/execution/datasources/csv/CSVSuite.scala | 15 +++++++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala index ec16bdbd8b..127c9728da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala @@ -21,7 +21,7 @@ import java.nio.charset.Charset import org.apache.spark.Logging -private[sql] case class CSVParameters(parameters: Map[String, String]) extends Logging { +private[sql] case class CSVParameters(@transient parameters: Map[String, String]) extends Logging { private def getChar(paramName: String, default: Char): Char = { val paramValue = parameters.get(paramName) @@ -44,9 +44,11 @@ private[sql] case class CSVParameters(parameters: Map[String, String]) extends L } } - val delimiter = CSVTypeCast.toChar(parameters.getOrElse("delimiter", ",")) + val delimiter = CSVTypeCast.toChar( + parameters.getOrElse("sep", 
parameters.getOrElse("delimiter", ","))) val parseMode = parameters.getOrElse("mode", "PERMISSIVE") - val charset = parameters.getOrElse("charset", Charset.forName("UTF-8").name()) + val charset = parameters.getOrElse("encoding", + parameters.getOrElse("charset", Charset.forName("UTF-8").name())) val quote = getChar("quote", '\"') val escape = getChar("escape", '\\') diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala index 9267479755..53818853ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala @@ -58,9 +58,10 @@ private[csv] class CSVRelation( if (Charset.forName(params.charset) == Charset.forName("UTF-8")) { sqlContext.sparkContext.textFile(location) } else { + val charset = params.charset sqlContext.sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](location) .mapPartitions { _.map { pair => - new String(pair._2.getBytes, 0, pair._2.getLength, params.charset) + new String(pair._2.getBytes, 0, pair._2.getLength, charset) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 8fdd31aa43..071b5ef56d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -122,7 +122,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { assert(exception.getMessage.contains("1-9588-osi")) } - ignore("test different encoding") { + test("test different encoding") { // scalastyle:off sqlContext.sql( s""" @@ -135,6 +135,18 @@ class CSVSuite extends QueryTest with SharedSQLContext with 
SQLTestUtils { verifyCars(sqlContext.table("carsTable"), withHeader = true) } + test("test aliases sep and encoding for delimiter and charset") { + val cars = sqlContext + .read + .format("csv") + .option("header", "true") + .option("encoding", "iso-8859-1") + .option("sep", "þ") + .load(testFile(carsFile8859)) + + verifyCars(cars, withHeader = true) + } + test("DDL test with tab separated file") { sqlContext.sql( s""" @@ -337,5 +349,4 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { assert(results(0).toSeq === Array(2012, "Tesla", "S", "null", "null")) assert(results(2).toSeq === Array(null, "Chevy", "Volt", null, null)) } - } -- cgit v1.2.3