author    Tarek Auel <tarek.auel@gmail.com>    2015-06-30 16:59:44 -0700
committer Davies Liu <davies@databricks.com>   2015-06-30 17:00:51 -0700
commit    ccdb05222a223187199183fd48e3a3313d536965 (patch)
tree      bd7d37d5a0746c6077180ebb1282bc1413155fbc /python
parent    8133125ca0b83985e0c2aa2a6ad477556867e412 (diff)
[SPARK-8727] [SQL] Missing python api; md5, log2
Jira: https://issues.apache.org/jira/browse/SPARK-8727

Author: Tarek Auel <tarek.auel@gmail.com>
Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7114 from tarekauel/missing-python and squashes the following commits:

ef4c61b [Tarek Auel] [SPARK-8727] revert dataframe change
4029d4d [Tarek Auel] removed dataframe pi and e unit test
66f0d2b [Tarek Auel] removed pi and e from python api and dataframe api; added _to_java_column(col) for strlen
4d07318 [Tarek Auel] fixed python unit test
45f2bee [Tarek Auel] fixed result of pi and e
c39f47b [Tarek Auel] add python api
bd50a3a [Tarek Auel] add missing python functions
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql/functions.py | 65
1 file changed, 52 insertions(+), 13 deletions(-)
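
For orientation before the diff: the three functions this patch exposes can be combined in a single select. A minimal usage sketch against the Spark 1.x API of that era (the app name and sample data are illustrative, not part of the patch; strlen is the name as of this commit):

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import md5, log2, strlen

sc = SparkContext(appName="spark-8727-demo")  # hypothetical app name
sqlContext = SQLContext(sc)

# A string column for md5/strlen and a numeric column for log2.
df = sqlContext.createDataFrame([('ABC', 4)], ['s', 'n'])
df.select(md5('s').alias('hash'),
          strlen('s').alias('length'),
          log2('n').alias('lg')).show()
# Expected values, per the doctests in the diff below:
# hash=u'902fbdd2b1df0c4f70b4a5d23525e932', length=3, lg=2.0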
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 45ecd826bd..4e2be88e9e 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -39,12 +39,15 @@ __all__ = [
     'coalesce',
     'countDistinct',
     'explode',
+    'log2',
+    'md5',
     'monotonicallyIncreasingId',
     'rand',
     'randn',
     'sha1',
     'sha2',
     'sparkPartitionId',
+    'strlen',
     'struct',
     'udf',
     'when']
@@ -320,6 +323,19 @@ def explode(col):
     return Column(jc)


+@ignore_unicode_prefix
+@since(1.5)
+def md5(col):
+    """Calculates the MD5 digest and returns the value as a 32 character hex string.
+
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect()
+    [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.md5(_to_java_column(col))
+    return Column(jc)
+
+
 @since(1.4)
 def monotonicallyIncreasingId():
     """A column that generates monotonically increasing 64-bit integers.
@@ -367,6 +383,19 @@ def randn(seed=None):

 @ignore_unicode_prefix
 @since(1.5)
+def sha1(col):
+    """Returns the hex string result of SHA-1.
+
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
+    [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.sha1(_to_java_column(col))
+    return Column(jc)
+
+
+@ignore_unicode_prefix
+@since(1.5)
 def sha2(col, numBits):
     """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384,
     and SHA-512). The numBits indicates the desired bit length of the result, which must have a
@@ -383,19 +412,6 @@ def sha2(col, numBits):
     return Column(jc)


-@ignore_unicode_prefix
-@since(1.5)
-def sha1(col):
-    """Returns the hex string result of SHA-1.
-
-    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
-    [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
-    """
-    sc = SparkContext._active_spark_context
-    jc = sc._jvm.functions.sha1(_to_java_column(col))
-    return Column(jc)
-
-
 @since(1.4)
 def sparkPartitionId():
     """A column for partition ID of the Spark task.
@@ -410,6 +426,18 @@ def sparkPartitionId():


 @ignore_unicode_prefix
+@since(1.5)
+def strlen(col):
+    """Calculates the length of a string expression.
+
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(strlen('a').alias('length')).collect()
+    [Row(length=3)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.strlen(_to_java_column(col)))
+
+
+@ignore_unicode_prefix
 @since(1.4)
 def struct(*cols):
     """Creates a new struct column.
@@ -471,6 +499,17 @@ def log(arg1, arg2=None):
     return Column(jc)


+@since(1.5)
+def log2(col):
+    """Returns the base-2 logarithm of the argument.
+
+    >>> sqlContext.createDataFrame([(4,)], ['a']).select(log2('a').alias('log2')).collect()
+    [Row(log2=2.0)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.log2(_to_java_column(col)))
+
+
 @since(1.4)
 def lag(col, count=1, default=None):
     """