author    Tarek Auel <tarek.auel@gmail.com>    2015-06-30 16:59:44 -0700
committer Davies Liu <davies@databricks.com>   2015-06-30 17:00:51 -0700
commit    ccdb05222a223187199183fd48e3a3313d536965 (patch)
tree      bd7d37d5a0746c6077180ebb1282bc1413155fbc /python
parent    8133125ca0b83985e0c2aa2a6ad477556867e412 (diff)
[SPARK-8727] [SQL] Missing python api; md5, log2
Jira: https://issues.apache.org/jira/browse/SPARK-8727

Author: Tarek Auel <tarek.auel@gmail.com>
Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7114 from tarekauel/missing-python and squashes the following commits:

ef4c61b [Tarek Auel] [SPARK-8727] revert dataframe change
4029d4d [Tarek Auel] removed dataframe pi and e unit test
66f0d2b [Tarek Auel] removed pi and e from python api and dataframe api; added _to_java_column(col) for strlen
4d07318 [Tarek Auel] fixed python unit test
45f2bee [Tarek Auel] fixed result of pi and e
c39f47b [Tarek Auel] add python api
bd50a3a [Tarek Auel] add missing python functions
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql/functions.py | 65
1 file changed, 52 insertions(+), 13 deletions(-)
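
For orientation before the diff: the three functions this patch exposes can be combined in a single select. A minimal usage sketch against the Spark 1.x API of that era (the app name and sample data are illustrative, not part of the patch; strlen is the name as of this commit):

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import md5, log2, strlen

sc = SparkContext(appName="spark-8727-demo")  # hypothetical app name
sqlContext = SQLContext(sc)

# A string column for md5/strlen and a numeric column for log2.
df = sqlContext.createDataFrame([('ABC', 4)], ['s', 'n'])
df.select(md5('s').alias('hash'),
          strlen('s').alias('length'),
          log2('n').alias('lg')).show()
# Expected values, per the doctests in the diff below:
# hash=u'902fbdd2b1df0c4f70b4a5d23525e932', length=3, lg=2.0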
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 45ecd826bd..4e2be88e9e 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -39,12 +39,15 @@ __all__ = [
     'coalesce',
     'countDistinct',
     'explode',
+    'log2',
+    'md5',
     'monotonicallyIncreasingId',
     'rand',
     'randn',
     'sha1',
     'sha2',
     'sparkPartitionId',
+    'strlen',
     'struct',
     'udf',
     'when']
@@ -320,6 +323,19 @@ def explode(col):
     return Column(jc)


+@ignore_unicode_prefix
+@since(1.5)
+def md5(col):
+    """Calculates the MD5 digest and returns the value as a 32 character hex string.
+
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect()
+    [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.md5(_to_java_column(col))
+    return Column(jc)
+
+
 @since(1.4)
 def monotonicallyIncreasingId():
     """A column that generates monotonically increasing 64-bit integers.
@@ -367,6 +383,19 @@ def randn(seed=None):

 @ignore_unicode_prefix
 @since(1.5)
+def sha1(col):
+    """Returns the hex string result of SHA-1.
+
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
+    [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.sha1(_to_java_column(col))
+    return Column(jc)
+
+
+@ignore_unicode_prefix
+@since(1.5)
 def sha2(col, numBits):
     """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384,
     and SHA-512). The numBits indicates the desired bit length of the result, which must have a
@@ -383,19 +412,6 @@ def sha2(col, numBits):
     return Column(jc)


-@ignore_unicode_prefix
-@since(1.5)
-def sha1(col):
-    """Returns the hex string result of SHA-1.
-
-    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
-    [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
-    """
-    sc = SparkContext._active_spark_context
-    jc = sc._jvm.functions.sha1(_to_java_column(col))
-    return Column(jc)
-
-
 @since(1.4)
 def sparkPartitionId():
     """A column for partition ID of the Spark task.
@@ -410,6 +426,18 @@ def sparkPartitionId():


 @ignore_unicode_prefix
+@since(1.5)
+def strlen(col):
+    """Calculates the length of a string expression.
+
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(strlen('a').alias('length')).collect()
+    [Row(length=3)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.strlen(_to_java_column(col)))
+
+
+@ignore_unicode_prefix
 @since(1.4)
 def struct(*cols):
     """Creates a new struct column.
@@ -471,6 +499,17 @@ def log(arg1, arg2=None):
     return Column(jc)


+@since(1.5)
+def log2(col):
+    """Returns the base-2 logarithm of the argument.
+
+    >>> sqlContext.createDataFrame([(4,)], ['a']).select(log2('a').alias('log2')).collect()
+    [Row(log2=2.0)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.log2(_to_java_column(col)))
+
+
 @since(1.4)
 def lag(col, count=1, default=None):
     """