path: root/python/pyspark/sql/column.py
author     Davies Liu <davies@databricks.com>    2015-05-23 08:30:05 -0700
committer  Yin Huai <yhuai@databricks.com>       2015-05-23 08:30:18 -0700
commit  d1515381cb957f40daf026144ce3ac014660df23 (patch)
tree    3d4ec2c78cdc8629653ce99d31a432eefbdddc81 /python/pyspark/sql/column.py
parent  ea9db50bc3ade82fb9966df34961a17b255b86d7 (diff)
[SPARK-7322, SPARK-7836, SPARK-7822][SQL] DataFrame window function related updates
1. ntile should take an integer as parameter.
2. Added Python API (based on #6364).
3. Update documentation of various DataFrame Python functions.

Author: Davies Liu <davies@databricks.com>
Author: Reynold Xin <rxin@databricks.com>

Closes #6374 from rxin/window-final and squashes the following commits:

69004c7 [Reynold Xin] Style fix.
288cea9 [Reynold Xin] Update documentation.
7cb8985 [Reynold Xin] Merge pull request #6364 from davies/window
66092b4 [Davies Liu] update docs
ed73cb4 [Reynold Xin] [SPARK-7322][SQL] Improve DataFrame window function documentation.
ef55132 [Davies Liu] Merge branch 'master' of github.com:apache/spark into window4
8936ade [Davies Liu] fix maxint in python 3
2649358 [Davies Liu] update docs
778e2c0 [Davies Liu] SPARK-7836 and SPARK-7822: Python API of window functions

(cherry picked from commit efe3bfdf496aa6206ace2697e31dd4c0c3c824fb)
Signed-off-by: Yin Huai <yhuai@databricks.com>
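For context, a minimal sketch of the Python window-function API this patch introduces, assuming Spark 1.4 with a HiveContext (required for window functions at the time, per the note in the diff below); the sample data, column names, and app name are hypothetical:

    from pyspark import SparkContext
    from pyspark.sql import HiveContext, Window
    from pyspark.sql.functions import rank, min, ntile

    sc = SparkContext(appName="window-example")  # hypothetical app name
    sqlContext = HiveContext(sc)  # window functions need HiveContext in 1.4

    # Hypothetical two-column DataFrame
    df = sqlContext.createDataFrame([("Alice", 2), ("Bob", 5)], ["name", "age"])

    # Window spec from the docstring in the diff: partition by name,
    # order by age, frame spanning one row before to one row after
    window = Window.partitionBy("name").orderBy("age").rowsBetween(-1, 1)

    # Apply ranking/aggregate functions over the window via Column.over()
    df.select(
        df.name,
        rank().over(window).alias("rank"),        # Python API: SPARK-7836/SPARK-7822
        min("age").over(window).alias("min_age"),
        ntile(4).over(window).alias("quartile"),  # SPARK-7322: ntile takes an int
    ).show()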
Diffstat (limited to 'python/pyspark/sql/column.py')
-rw-r--r--    python/pyspark/sql/column.py    54
1 file changed, 40 insertions(+), 14 deletions(-)
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index baf1ecbd0a..8dc5039f58 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -116,6 +116,8 @@ class Column(object):
df.colName + 1
1 / df.colName
+ .. note:: Experimental
+
.. versionadded:: 1.3
"""
@@ -164,8 +166,9 @@ class Column(object):
@since(1.3)
def getItem(self, key):
- """An expression that gets an item at position `ordinal` out of a list,
- or gets an item by key out of a dict.
+ """
+ An expression that gets an item at position ``ordinal`` out of a list,
+ or gets an item by key out of a dict.
>>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
>>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
@@ -185,7 +188,8 @@ class Column(object):
@since(1.3)
def getField(self, name):
- """An expression that gets a field by name in a StructField.
+ """
+ An expression that gets a field by name in a StructField.
>>> from pyspark.sql import Row
>>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
@@ -219,7 +223,7 @@ class Column(object):
@since(1.3)
def substr(self, startPos, length):
"""
- Return a :class:`Column` which is a substring of the column
+ Return a :class:`Column` which is a substring of the column.
:param startPos: start position (int or Column)
:param length: length of the substring (int or Column)
@@ -242,7 +246,8 @@ class Column(object):
@ignore_unicode_prefix
@since(1.3)
def inSet(self, *cols):
- """ A boolean expression that is evaluated to true if the value of this
+ """
+ A boolean expression that is evaluated to true if the value of this
expression is contained by the evaluated values of the arguments.
>>> df[df.name.inSet("Bob", "Mike")].collect()
@@ -268,7 +273,8 @@ class Column(object):
@since(1.3)
def alias(self, *alias):
- """Returns this column aliased with a new name or names (in the case of expressions that
+ """
+ Returns this column aliased with a new name or names (in the case of expressions that
return more than one column, such as explode).
>>> df.select(df.age.alias("age2")).collect()
@@ -284,7 +290,7 @@ class Column(object):
@ignore_unicode_prefix
@since(1.3)
def cast(self, dataType):
- """ Convert the column into type `dataType`
+ """ Convert the column into type ``dataType``.
>>> df.select(df.age.cast("string").alias('ages')).collect()
[Row(ages=u'2'), Row(ages=u'5')]
@@ -304,25 +310,24 @@ class Column(object):
astype = cast
- @ignore_unicode_prefix
@since(1.3)
def between(self, lowerBound, upperBound):
- """ A boolean expression that is evaluated to true if the value of this
+ """
+ A boolean expression that is evaluated to true if the value of this
expression is between the given columns.
"""
return (self >= lowerBound) & (self <= upperBound)
- @ignore_unicode_prefix
@since(1.4)
def when(self, condition, value):
- """Evaluates a list of conditions and returns one of multiple possible result expressions.
+ """
+ Evaluates a list of conditions and returns one of multiple possible result expressions.
If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
See :func:`pyspark.sql.functions.when` for example usage.
:param condition: a boolean :class:`Column` expression.
:param value: a literal value, or a :class:`Column` expression.
-
"""
sc = SparkContext._active_spark_context
if not isinstance(condition, Column):
@@ -331,10 +336,10 @@ class Column(object):
jc = sc._jvm.functions.when(condition._jc, v)
return Column(jc)
- @ignore_unicode_prefix
@since(1.4)
def otherwise(self, value):
- """Evaluates a list of conditions and returns one of multiple possible result expressions.
+ """
+ Evaluates a list of conditions and returns one of multiple possible result expressions.
If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
See :func:`pyspark.sql.functions.when` for example usage.
@@ -345,6 +350,27 @@ class Column(object):
jc = self._jc.otherwise(value)
return Column(jc)
+ @since(1.4)
+ def over(self, window):
+ """
+ Define a windowing column.
+
+ :param window: a :class:`WindowSpec`
+ :return: a Column
+
+ >>> from pyspark.sql import Window
+ >>> window = Window.partitionBy("name").orderBy("age").rowsBetween(-1, 1)
+ >>> from pyspark.sql.functions import rank, min
+ >>> # df.select(rank().over(window), min('age').over(window))
+
+ .. note:: Window functions are only supported with HiveContext in 1.4
+ """
+ from pyspark.sql.window import WindowSpec
+ if not isinstance(window, WindowSpec):
+ raise TypeError("window should be WindowSpec")
+ jc = self._jc.over(window._jspec)
+ return Column(jc)
+
def __repr__(self):
return 'Column<%s>' % self._jc.toString().encode('utf8')
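
The patch also reworks the docstrings of Column.between, Column.when, and Column.otherwise; a short usage sketch against the same hypothetical df as above:

    from pyspark.sql import functions as F

    # between() expands to (col >= lower) & (col <= upper), as shown in the diff
    df.filter(df.age.between(1, 4)).show()

    # when()/otherwise(): unmatched rows yield None unless otherwise() supplies a default
    df.select(
        df.name,
        F.when(df.age > 3, "senior").otherwise("junior").alias("bracket"),
    ).show()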