author     hyukjinkwon <gurwls223@gmail.com>    2016-01-18 21:42:07 -0800
committer  Reynold Xin <rxin@databricks.com>    2016-01-18 21:42:07 -0800
commit     453dae56716bc254bf5022fddc9b8327c9b1a49f (patch)
tree       7a6c31098e885e196ce804ede69dfcc2340ff485
parent     74ba84b64cab0bf3828033037267955aca296d3a (diff)
[SPARK-12668][SQL] Providing aliases for CSV options to be similar to Pandas and R
https://issues.apache.org/jira/browse/SPARK-12668

The Spark CSV datasource is currently being merged in (filed in [SPARK-12420](https://issues.apache.org/jira/browse/SPARK-12420)). This is a quick PR that simply adds aliases for several CSV options so they match Pandas and R:

- delimiter -> sep
- charset -> encoding

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #10800 from HyukjinKwon/SPARK-12668.
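For illustration, a minimal sketch of the new aliases in use, assuming a `SQLContext` named `sqlContext` and an ISO-8859-1 encoded, þ-separated file at the hypothetical path `cars_iso-8859-1.csv` (mirroring the test added below):

```scala
// Both option names below are the new aliases; the original "delimiter"
// and "charset" names keep working, and the aliases take precedence
// when both are supplied.
val cars = sqlContext
  .read
  .format("csv")
  .option("header", "true")
  .option("sep", "þ")               // alias for "delimiter"
  .option("encoding", "iso-8859-1") // alias for "charset"
  .load("cars_iso-8859-1.csv")      // hypothetical path
```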
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala |  8
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala   |  3
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala      | 15
3 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala
index ec16bdbd8b..127c9728da 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala
@@ -21,7 +21,7 @@ import java.nio.charset.Charset
import org.apache.spark.Logging
-private[sql] case class CSVParameters(parameters: Map[String, String]) extends Logging {
+private[sql] case class CSVParameters(@transient parameters: Map[String, String]) extends Logging {
private def getChar(paramName: String, default: Char): Char = {
val paramValue = parameters.get(paramName)
@@ -44,9 +44,11 @@ private[sql] case class CSVParameters(parameters: Map[String, String]) extends Logging {
}
}
- val delimiter = CSVTypeCast.toChar(parameters.getOrElse("delimiter", ","))
+ val delimiter = CSVTypeCast.toChar(
+ parameters.getOrElse("sep", parameters.getOrElse("delimiter", ",")))
val parseMode = parameters.getOrElse("mode", "PERMISSIVE")
- val charset = parameters.getOrElse("charset", Charset.forName("UTF-8").name())
+ val charset = parameters.getOrElse("encoding",
+ parameters.getOrElse("charset", Charset.forName("UTF-8").name()))
val quote = getChar("quote", '\"')
val escape = getChar("escape", '\\')
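The nested `getOrElse` chains above give the alias priority over the original option name, falling back to the hard-coded default only when neither is set. A standalone sketch of that resolution order (`resolve` is a hypothetical helper, not part of the patch):

```scala
// Hypothetical helper mirroring the fallback chain in CSVParameters:
// alias first, then the original name, then the default.
def resolve(parameters: Map[String, String],
            alias: String, original: String, default: String): String =
  parameters.getOrElse(alias, parameters.getOrElse(original, default))

resolve(Map("delimiter" -> "|"), "sep", "delimiter", ",")               // "|"
resolve(Map("sep" -> ";", "delimiter" -> "|"), "sep", "delimiter", ",") // ";" (alias wins)
resolve(Map.empty[String, String], "sep", "delimiter", ",")             // ","
```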
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
index 9267479755..53818853ff 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
@@ -58,9 +58,10 @@ private[csv] class CSVRelation(
if (Charset.forName(params.charset) == Charset.forName("UTF-8")) {
sqlContext.sparkContext.textFile(location)
} else {
+ val charset = params.charset
sqlContext.sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](location)
.mapPartitions { _.map { pair =>
- new String(pair._2.getBytes, 0, pair._2.getLength, params.charset)
+ new String(pair._2.getBytes, 0, pair._2.getLength, charset)
}
}
}
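Together with the `@transient` added to `parameters` in CSVParameters.scala, the local `val charset` keeps the task closure small: referencing `params.charset` inside the `map` would pull the enclosing relation (and `params` with it) into the serialized closure, whereas a local `String` is cheap and always serializable. A minimal sketch of the pattern, assuming a live `SparkContext` named `sc` (`Params` is a stand-in for `CSVParameters`):

```scala
// Stand-in for CSVParameters: charset is computed once from the options map.
case class Params(parameters: Map[String, String]) {
  val charset: String = parameters.getOrElse("charset", "UTF-8")
}

val params = Params(Map("charset" -> "ISO-8859-1"))
val charset = params.charset // capture only the String, not all of `params`
val decoded = sc
  .parallelize(Seq("café".getBytes("ISO-8859-1")))
  .map(bytes => new String(bytes, charset)) // closure references the local val only
```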
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 8fdd31aa43..071b5ef56d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -122,7 +122,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
assert(exception.getMessage.contains("1-9588-osi"))
}
- ignore("test different encoding") {
+ test("test different encoding") {
// scalastyle:off
sqlContext.sql(
s"""
@@ -135,6 +135,18 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
verifyCars(sqlContext.table("carsTable"), withHeader = true)
}
+ test("test aliases sep and encoding for delimiter and charset") {
+ val cars = sqlContext
+ .read
+ .format("csv")
+ .option("header", "true")
+ .option("encoding", "iso-8859-1")
+ .option("sep", "þ")
+ .load(testFile(carsFile8859))
+
+ verifyCars(cars, withHeader = true)
+ }
+
test("DDL test with tab separated file") {
sqlContext.sql(
s"""
@@ -337,5 +349,4 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
assert(results(0).toSeq === Array(2012, "Tesla", "S", "null", "null"))
assert(results(2).toSeq === Array(null, "Chevy", "Volt", null, null))
}
-
}