diff options
-rw-r--r-- | dev/deps/spark-deps-hadoop-2.2 | 2 |
-rw-r--r-- | dev/deps/spark-deps-hadoop-2.3 | 2 |
-rw-r--r-- | dev/deps/spark-deps-hadoop-2.4 | 2 |
-rw-r--r-- | dev/deps/spark-deps-hadoop-2.6 | 2 |
-rw-r--r-- | dev/deps/spark-deps-hadoop-2.7 | 2 |
-rw-r--r-- | python/pyspark/sql/readwriter.py | 2 |
-rw-r--r-- | python/pyspark/sql/streaming.py | 2 |
-rw-r--r-- | sql/core/pom.xml | 2 |
-rw-r--r-- | sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 4 |
-rw-r--r-- | sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala | 4 |
-rw-r--r-- | sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala | 2 |
-rw-r--r-- | sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala | 4 |
12 files changed, 13 insertions, 17 deletions
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index a7259e25bf..f4f92c6d20 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -159,7 +159,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.1.1.jar +univocity-parsers-2.2.1.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xmlenc-0.52.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 6986ab572b..3db013f1a7 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -167,7 +167,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.1.1.jar +univocity-parsers-2.2.1.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xmlenc-0.52.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index 75cccb352b..71710109a1 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -167,7 +167,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.1.1.jar +univocity-parsers-2.2.1.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xmlenc-0.52.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index ef7b8a7d8d..cb30fda253 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -175,7 +175,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.1.1.jar +univocity-parsers-2.2.1.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 6356612537..9008aa80bc 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -176,7 +176,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.1.1.jar 
+univocity-parsers-2.2.1.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index a6860efa89..3ad6f80de9 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -349,7 +349,7 @@ class DataFrameReader(OptionUtils): set, it uses the default value, ``20480``. :param maxCharsPerColumn: defines the maximum number of characters allowed for any given value being read. If None is set, it uses the default value, - ``1000000``. + ``-1`` meaning unlimited length. :param maxMalformedLogPerPartition: sets the maximum number of malformed rows Spark will log for each partition. Malformed records beyond this number will be ignored. If None is set, it diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 01364517ed..cbd827950b 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -517,7 +517,7 @@ class DataStreamReader(OptionUtils): set, it uses the default value, ``20480``. :param maxCharsPerColumn: defines the maximum number of characters allowed for any given value being read. If None is set, it uses the default value, - ``1000000``. + ``-1`` meaning unlimited length. :param mode: allows a mode for dealing with corrupt records during parsing. If None is set, it uses the default value, ``PERMISSIVE``. 
diff --git a/sql/core/pom.xml b/sql/core/pom.xml index b2752638be..84de1d4a6e 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -38,7 +38,7 @@ <dependency> <groupId>com.univocity</groupId> <artifactId>univocity-parsers</artifactId> - <version>2.1.1</version> + <version>2.2.1</version> <type>jar</type> </dependency> <dependency> diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 30f39c70fe..b10d2c86ac 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -392,8 +392,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * `java.sql.Timestamp.valueOf()` and `java.sql.Date.valueOf()` or ISO 8601 format.</li> * <li>`maxColumns` (default `20480`): defines a hard limit of how many columns * a record can have.</li> - * <li>`maxCharsPerColumn` (default `1000000`): defines the maximum number of characters allowed - * for any given value being read.</li> + * <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed + * for any given value being read. By default, it is -1 meaning unlimited length</li> * <li>`maxMalformedLogPerPartition` (default `10`): sets the maximum number of malformed rows * Spark will log for each partition. 
Malformed records beyond this number will be ignored.</li> * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index 364d7c831e..e7dcc22272 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -112,7 +112,7 @@ private[csv] class CSVOptions(@transient private val parameters: Map[String, Str val maxColumns = getInt("maxColumns", 20480) - val maxCharsPerColumn = getInt("maxCharsPerColumn", 1000000) + val maxCharsPerColumn = getInt("maxCharsPerColumn", -1) val escapeQuotes = getBool("escapeQuotes", true) @@ -123,8 +123,6 @@ private[csv] class CSVOptions(@transient private val parameters: Map[String, Str val inputBufferSize = 128 val isCommentSet = this.comment != '\u0000' - - val rowSeparator = "\n" } object CSVOptions { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala index 64bdd6f464..332f5c8e9f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala @@ -34,7 +34,6 @@ private[csv] class CsvReader(params: CSVOptions) { val settings = new CsvParserSettings() val format = settings.getFormat format.setDelimiter(params.delimiter) - format.setLineSeparator(params.rowSeparator) format.setQuote(params.quote) format.setQuoteEscape(params.escape) format.setComment(params.comment) @@ -70,7 +69,6 @@ private[csv] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten private val format = writerSettings.getFormat 
format.setDelimiter(params.delimiter) - format.setLineSeparator(params.rowSeparator) format.setQuote(params.quote) format.setQuoteEscape(params.escape) format.setComment(params.comment) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 9d174051bc..d437c16a25 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -247,8 +247,8 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * `java.text.SimpleDateFormat`. This applies to timestamp type.</li> * <li>`maxColumns` (default `20480`): defines a hard limit of how many columns * a record can have.</li> - * <li>`maxCharsPerColumn` (default `1000000`): defines the maximum number of characters allowed - * for any given value being read.</li> + * <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed + * for any given value being read. By default, it is -1 meaning unlimited length</li> * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records * during parsing. * <ul> |