aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/sql-programming-guide.md60
-rw-r--r--python/pyspark/sql/_types.py2
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala14
3 files changed, 73 insertions, 3 deletions
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 6af10432b9..6b7b867ea6 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1594,6 +1594,64 @@ options.
# Migration Guide
+## Upgrading from Spark SQL 1.3 to 1.4
+
+Based on user feedback, we changed the default behavior of `DataFrame.groupBy().agg()` to retain the grouping columns in the resulting `DataFrame`. To keep the behavior in 1.3, set `spark.sql.retainGroupColumns` to `false`.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+
+// In 1.3.x, in order for the grouping column "department" to show up,
+// it must be included explicitly as part of the agg function call.
+df.groupBy("department").agg($"department", max("age"), sum("expense"))
+
+// In 1.4+, grouping column "department" is included automatically.
+df.groupBy("department").agg(max("age"), sum("expense"))
+
+// Revert to 1.3 behavior (not retaining grouping column) by:
+sqlContext.setConf("spark.sql.retainGroupColumns", "false")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+
+// In 1.3.x, in order for the grouping column "department" to show up,
+// it must be included explicitly as part of the agg function call.
+df.groupBy("department").agg(col("department"), max("age"), sum("expense"));
+
+// In 1.4+, grouping column "department" is included automatically.
+df.groupBy("department").agg(max("age"), sum("expense"));
+
+// Revert to 1.3 behavior (not retaining grouping column) by:
+sqlContext.setConf("spark.sql.retainGroupColumns", "false");
+
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+
+import pyspark.sql.functions as func
+
+# In 1.3.x, in order for the grouping column "department" to show up,
+# it must be included explicitly as part of the agg function call.
+df.groupBy("department").agg("department"), func.max("age"), func.sum("expense"))
+
+# In 1.4+, grouping column "department" is included automatically.
+df.groupBy("department").agg(func.max("age"), func.sum("expense"))
+
+# Revert to 1.3.x behavior (not retaining grouping column) by:
+sqlContext.setConf("spark.sql.retainGroupColumns", "false")
+
+{% endhighlight %}
+</div>
+
+</div>
+
+
## Upgrading from Spark SQL 1.0-1.2 to 1.3
In Spark 1.3 we removed the "Alpha" label from Spark SQL and as part of this did a cleanup of the
@@ -1651,7 +1709,7 @@ moved into the udf object in `SQLContext`.
<div class="codetabs">
<div data-lang="scala" markdown="1">
-{% highlight java %}
+{% highlight scala %}
sqlContext.udf.register("strLen", (s: String) => s.length())
diff --git a/python/pyspark/sql/_types.py b/python/pyspark/sql/_types.py
index fd98e116d2..b96851a174 100644
--- a/python/pyspark/sql/_types.py
+++ b/python/pyspark/sql/_types.py
@@ -1228,12 +1228,14 @@ class Row(tuple):
raise AttributeError(item)
def __reduce__(self):
+ """Returns a tuple so Python knows how to pickle Row."""
if hasattr(self, "__fields__"):
return (_create_row, (self.__fields__, tuple(self)))
else:
return tuple.__reduce__(self)
def __repr__(self):
+ """Printable representation of Row used in Python REPL."""
if hasattr(self, "__fields__"):
return "Row(%s)" % ", ".join("%s=%r" % (k, v)
for k, v in zip(self.__fields__, tuple(self)))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
index 003a620dcc..543320e471 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -146,11 +146,21 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
*
* // Scala:
* import org.apache.spark.sql.functions._
- * df.groupBy("department").agg($"department", max($"age"), sum($"expense"))
+ * df.groupBy("department").agg(max("age"), sum("expense"))
*
* // Java:
* import static org.apache.spark.sql.functions.*;
- * df.groupBy("department").agg(col("department"), max(col("age")), sum(col("expense")));
+ * df.groupBy("department").agg(max("age"), sum("expense"));
+ * }}}
+ *
+ * Note that before Spark 1.4, the default behavior is to NOT retain grouping columns. To change
+ * to that behavior, set config variable `spark.sql.retainGroupColumns` to `false`.
+ * {{{
+ * // Scala, 1.3.x:
+ * df.groupBy("department").agg($"department", max("age"), sum("expense"))
+ *
+ * // Java, 1.3.x:
+ * df.groupBy("department").agg(col("department"), max("age"), sum("expense"));
* }}}
*/
@scala.annotation.varargs