aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIlya Ganelin <ilya.ganelin@capitalone.com>2015-06-29 14:15:15 -0700
committerReynold Xin <rxin@databricks.com>2015-06-29 14:15:15 -0700
commitf6fc254ec4ce5f103d45da6d007b4066ce751236 (patch)
treee00615856783d0cdbdf70da13c4b2903ef09803b
parent27ef85451cd237caa7016baa69957a35ab365aa8 (diff)
downloadspark-f6fc254ec4ce5f103d45da6d007b4066ce751236.tar.gz
spark-f6fc254ec4ce5f103d45da6d007b4066ce751236.tar.bz2
spark-f6fc254ec4ce5f103d45da6d007b4066ce751236.zip
[SPARK-8056][SQL] Design an easier way to construct schema for both Scala and Python
I've added functionality to create new StructType similar to how we add parameters to a new SparkContext. I've also added tests for this type of creation. Author: Ilya Ganelin <ilya.ganelin@capitalone.com> Closes #6686 from ilganeli/SPARK-8056B and squashes the following commits: 27c1de1 [Ilya Ganelin] Rename 467d836 [Ilya Ganelin] Removed from_string in favor of _parse_Datatype_json_value 5fef5a4 [Ilya Ganelin] Updates for type parsing 4085489 [Ilya Ganelin] Style errors 3670cf5 [Ilya Ganelin] added string to DataType conversion 8109e00 [Ilya Ganelin] Fixed error in tests 41ab686 [Ilya Ganelin] Fixed style errors e7ba7e0 [Ilya Ganelin] Moved some python tests to tests.py. Added cleaner handling of null data type and added test for correctness of input format 15868fa [Ilya Ganelin] Fixed python errors b79b992 [Ilya Ganelin] Merge remote-tracking branch 'upstream/master' into SPARK-8056B a3369fc [Ilya Ganelin] Fixing space errors e240040 [Ilya Ganelin] Style bab7823 [Ilya Ganelin] Constructor error 73d4677 [Ilya Ganelin] Style 4ed00d9 [Ilya Ganelin] Fixed default arg 67df57a [Ilya Ganelin] Removed Foo 04cbf0c [Ilya Ganelin] Added comments for single object 0484d7a [Ilya Ganelin] Restored second method 6aeb740 [Ilya Ganelin] Style 689e54d [Ilya Ganelin] Style f497e9e [Ilya Ganelin] Got rid of old code e3c7a88 [Ilya Ganelin] Fixed doctest failure a62ccde [Ilya Ganelin] Style 966ac06 [Ilya Ganelin] style checks dabb7e6 [Ilya Ganelin] Added Python tests a3f4152 [Ilya Ganelin] added python bindings and better comments e6e536c [Ilya Ganelin] Added extra space 7529a2e [Ilya Ganelin] Fixed formatting d388f86 [Ilya Ganelin] Fixed small bug c4e3bf5 [Ilya Ganelin] Reverted to using parse. Updated parse to support long d7634b6 [Ilya Ganelin] Reverted to fromString to properly support types 22c39d5 [Ilya Ganelin] replaced FromString with DataTypeParser.parse. Replaced empty constructor initializing a null to have it instead create a new array to allow appends to it. 
faca398 [Ilya Ganelin] [SPARK-8056] Replaced default argument usage. Updated usage and code for DataType.fromString 1acf76e [Ilya Ganelin] Scala style e31c674 [Ilya Ganelin] Fixed bug in test 8dc0795 [Ilya Ganelin] Added tests for creation of StructType object with new methods fdf7e9f [Ilya Ganelin] [SPARK-8056] Created add methods to facilitate building new StructType objects.
-rw-r--r--python/pyspark/sql/tests.py29
-rw-r--r--python/pyspark/sql/types.py52
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala2
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala104
-rw-r--r--sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala31
5 files changed, 212 insertions, 6 deletions
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index ffee43a94b..34f397d0ff 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -516,6 +516,35 @@ class SQLTests(ReusedPySparkTestCase):
self.assertEqual([Row(a=2, b=1, c=3), Row(a=4, b=1, c=4)],
df.filter(df.a.between(df.b, df.c)).collect())
+ def test_struct_type(self):
+ from pyspark.sql.types import StructType, StringType, StructField
+ struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
+ struct2 = StructType([StructField("f1", StringType(), True),
+ StructField("f2", StringType(), True, None)])
+ self.assertEqual(struct1, struct2)
+
+ struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
+ struct2 = StructType([StructField("f1", StringType(), True)])
+ self.assertNotEqual(struct1, struct2)
+
+ struct1 = (StructType().add(StructField("f1", StringType(), True))
+ .add(StructField("f2", StringType(), True, None)))
+ struct2 = StructType([StructField("f1", StringType(), True),
+ StructField("f2", StringType(), True, None)])
+ self.assertEqual(struct1, struct2)
+
+ struct1 = (StructType().add(StructField("f1", StringType(), True))
+ .add(StructField("f2", StringType(), True, None)))
+ struct2 = StructType([StructField("f1", StringType(), True)])
+ self.assertNotEqual(struct1, struct2)
+
+ # Catch exception raised during improper construction
+ try:
+ struct1 = StructType().add("name")
+ self.assertEqual(1, 0)
+ except ValueError:
+ self.assertEqual(1, 1)
+
def test_save_and_load(self):
df = self.df
tmpPath = tempfile.mkdtemp()
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 932686e5e4..ae9344e610 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -355,8 +355,7 @@ class StructType(DataType):
This is the data type representing a :class:`Row`.
"""
-
- def __init__(self, fields):
+ def __init__(self, fields=None):
"""
>>> struct1 = StructType([StructField("f1", StringType(), True)])
>>> struct2 = StructType([StructField("f1", StringType(), True)])
@@ -368,8 +367,53 @@ class StructType(DataType):
>>> struct1 == struct2
False
"""
- assert all(isinstance(f, DataType) for f in fields), "fields should be a list of DataType"
- self.fields = fields
+ if not fields:
+ self.fields = []
+ else:
+ self.fields = fields
+ assert all(isinstance(f, StructField) for f in fields),\
+ "fields should be a list of StructField"
+
+ def add(self, field, data_type=None, nullable=True, metadata=None):
+ """
+ Construct a StructType by adding new elements to it to define the schema. The method accepts
+ either:
+ a) A single parameter which is a StructField object.
+        b) Between 2 and 4 parameters as (name, data_type, nullable (optional),
+           metadata (optional)). The data_type parameter may be either a String or a DataType object
+
+ >>> struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
+ >>> struct2 = StructType([StructField("f1", StringType(), True),\
+ StructField("f2", StringType(), True, None)])
+ >>> struct1 == struct2
+ True
+ >>> struct1 = StructType().add(StructField("f1", StringType(), True))
+ >>> struct2 = StructType([StructField("f1", StringType(), True)])
+ >>> struct1 == struct2
+ True
+ >>> struct1 = StructType().add("f1", "string", True)
+ >>> struct2 = StructType([StructField("f1", StringType(), True)])
+ >>> struct1 == struct2
+ True
+
+ :param field: Either the name of the field or a StructField object
+ :param data_type: If present, the DataType of the StructField to create
+ :param nullable: Whether the field to add should be nullable (default True)
+ :param metadata: Any additional metadata (default None)
+ :return: a new updated StructType
+ """
+ if isinstance(field, StructField):
+ self.fields.append(field)
+ else:
+ if isinstance(field, str) and data_type is None:
+ raise ValueError("Must specify DataType if passing name of struct_field to create.")
+
+ if isinstance(data_type, str):
+ data_type_f = _parse_datatype_json_value(data_type)
+ else:
+ data_type_f = data_type
+ self.fields.append(StructField(field, data_type_f, nullable, metadata))
+ return self
def simpleString(self):
return 'struct<%s>' % (','.join(f.simpleString() for f in self.fields))
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
index 04f3379afb..6b43224feb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
@@ -44,7 +44,7 @@ private[sql] trait DataTypeParser extends StandardTokenParsers {
"(?i)tinyint".r ^^^ ByteType |
"(?i)smallint".r ^^^ ShortType |
"(?i)double".r ^^^ DoubleType |
- "(?i)bigint".r ^^^ LongType |
+ "(?i)(?:bigint|long)".r ^^^ LongType |
"(?i)binary".r ^^^ BinaryType |
"(?i)boolean".r ^^^ BooleanType |
fixedDecimalType |
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index 193c08a4d0..2db0a359e9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -94,7 +94,7 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Attribute}
case class StructType(fields: Array[StructField]) extends DataType with Seq[StructField] {
/** No-arg constructor for kryo. */
- protected def this() = this(null)
+ def this() = this(Array.empty[StructField])
/** Returns all field names in an array. */
def fieldNames: Array[String] = fields.map(_.name)
@@ -104,6 +104,108 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
private lazy val nameToIndex: Map[String, Int] = fieldNames.zipWithIndex.toMap
/**
+ * Creates a new [[StructType]] by adding a new field.
+ * {{{
+ * val struct = (new StructType)
+ * .add(StructField("a", IntegerType, true))
+ * .add(StructField("b", LongType, false))
+ * .add(StructField("c", StringType, true))
+   * }}}
+ */
+ def add(field: StructField): StructType = {
+ StructType(fields :+ field)
+ }
+
+ /**
+ * Creates a new [[StructType]] by adding a new nullable field with no metadata.
+   *
+   * {{{
+   * val struct = (new StructType)
+   *   .add("a", IntegerType)
+   *   .add("b", LongType)
+   *   .add("c", StringType)
+   * }}}
+   */
+ def add(name: String, dataType: DataType): StructType = {
+ StructType(fields :+ new StructField(name, dataType, nullable = true, Metadata.empty))
+ }
+
+ /**
+ * Creates a new [[StructType]] by adding a new field with no metadata.
+   *
+   * {{{
+   * val struct = (new StructType)
+   *   .add("a", IntegerType, true)
+   *   .add("b", LongType, false)
+   *   .add("c", StringType, true)
+   * }}}
+   */
+ def add(name: String, dataType: DataType, nullable: Boolean): StructType = {
+ StructType(fields :+ new StructField(name, dataType, nullable, Metadata.empty))
+ }
+
+ /**
+ * Creates a new [[StructType]] by adding a new field and specifying metadata.
+ * {{{
+ * val struct = (new StructType)
+ * .add("a", IntegerType, true, Metadata.empty)
+ * .add("b", LongType, false, Metadata.empty)
+ * .add("c", StringType, true, Metadata.empty)
+ * }}}
+ */
+ def add(
+ name: String,
+ dataType: DataType,
+ nullable: Boolean,
+ metadata: Metadata): StructType = {
+ StructType(fields :+ new StructField(name, dataType, nullable, metadata))
+ }
+
+ /**
+ * Creates a new [[StructType]] by adding a new nullable field with no metadata where the
+ * dataType is specified as a String.
+ *
+ * {{{
+ * val struct = (new StructType)
+ * .add("a", "int")
+ * .add("b", "long")
+ * .add("c", "string")
+ * }}}
+ */
+ def add(name: String, dataType: String): StructType = {
+ add(name, DataTypeParser.parse(dataType), nullable = true, Metadata.empty)
+ }
+
+ /**
+ * Creates a new [[StructType]] by adding a new field with no metadata where the
+ * dataType is specified as a String.
+ *
+ * {{{
+ * val struct = (new StructType)
+ * .add("a", "int", true)
+ * .add("b", "long", false)
+ * .add("c", "string", true)
+ * }}}
+ */
+ def add(name: String, dataType: String, nullable: Boolean): StructType = {
+ add(name, DataTypeParser.parse(dataType), nullable, Metadata.empty)
+ }
+
+ /**
+ * Creates a new [[StructType]] by adding a new field and specifying metadata where the
+ * dataType is specified as a String.
+ * {{{
+ * val struct = (new StructType)
+ * .add("a", "int", true, Metadata.empty)
+ * .add("b", "long", false, Metadata.empty)
+ * .add("c", "string", true, Metadata.empty)
+ * }}}
+ */
+ def add(
+ name: String,
+ dataType: String,
+ nullable: Boolean,
+ metadata: Metadata): StructType = {
+ add(name, DataTypeParser.parse(dataType), nullable, metadata)
+ }
+
+ /**
* Extracts a [[StructField]] of the given name. If the [[StructType]] object does not
* have a name matching the given name, `null` will be returned.
*/
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index 077c0ad70a..14e7b4a956 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -33,6 +33,37 @@ class DataTypeSuite extends SparkFunSuite {
assert(MapType(StringType, IntegerType, true) === map)
}
+ test("construct with add") {
+ val struct = (new StructType)
+ .add("a", IntegerType, true)
+ .add("b", LongType, false)
+ .add("c", StringType, true)
+
+ assert(StructField("b", LongType, false) === struct("b"))
+ }
+
+ test("construct with add from StructField") {
+ // Test creation from StructField type
+ val struct = (new StructType)
+ .add(StructField("a", IntegerType, true))
+ .add(StructField("b", LongType, false))
+ .add(StructField("c", StringType, true))
+
+ assert(StructField("b", LongType, false) === struct("b"))
+ }
+
+ test("construct with String DataType") {
+ // Test creation with DataType as String
+ val struct = (new StructType)
+ .add("a", "int", true)
+ .add("b", "long", false)
+ .add("c", "string", true)
+
+ assert(StructField("a", IntegerType, true) === struct("a"))
+ assert(StructField("b", LongType, false) === struct("b"))
+ assert(StructField("c", StringType, true) === struct("c"))
+ }
+
test("extract fields from a StructType") {
val struct = StructType(
StructField("a", IntegerType, true) ::