From ccdb05222a223187199183fd48e3a3313d536965 Mon Sep 17 00:00:00 2001
From: Tarek Auel
Date: Tue, 30 Jun 2015 16:59:44 -0700
Subject: [SPARK-8727] [SQL] Missing python api; md5, log2

Jira: https://issues.apache.org/jira/browse/SPARK-8727

Author: Tarek Auel
Author: Tarek Auel

Closes #7114 from tarekauel/missing-python and squashes the following commits:

ef4c61b [Tarek Auel] [SPARK-8727] revert dataframe change
4029d4d [Tarek Auel] removed dataframe pi and e unit test
66f0d2b [Tarek Auel] removed pi and e from python api and dataframe api; added _to_java_column(col) for strlen
4d07318 [Tarek Auel] fixed python unit test
45f2bee [Tarek Auel] fixed result of pi and e
c39f47b [Tarek Auel] add python api
bd50a3a [Tarek Auel] add missing python functions
---
 python/pyspark/sql/functions.py | 65 ++++++++++++++++++++++++++++++++---------
 1 file changed, 52 insertions(+), 13 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 45ecd826bd..4e2be88e9e 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -39,12 +39,15 @@ __all__ = [
     'coalesce',
     'countDistinct',
     'explode',
+    'log2',
+    'md5',
     'monotonicallyIncreasingId',
     'rand',
     'randn',
     'sha1',
     'sha2',
     'sparkPartitionId',
+    'strlen',
     'struct',
     'udf',
     'when']
@@ -320,6 +323,19 @@ def explode(col):
     return Column(jc)
 
 
+@ignore_unicode_prefix
+@since(1.5)
+def md5(col):
+    """Calculates the MD5 digest and returns the value as a 32 character hex string.
+
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect()
+    [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.md5(_to_java_column(col))
+    return Column(jc)
+
+
 @since(1.4)
 def monotonicallyIncreasingId():
     """A column that generates monotonically increasing 64-bit integers.
@@ -365,6 +381,19 @@ def randn(seed=None):
     return Column(jc)
 
 
+@ignore_unicode_prefix
+@since(1.5)
+def sha1(col):
+    """Returns the hex string result of SHA-1.
+
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
+    [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.sha1(_to_java_column(col))
+    return Column(jc)
+
+
 @ignore_unicode_prefix
 @since(1.5)
 def sha2(col, numBits):
@@ -383,19 +412,6 @@ def sha2(col, numBits):
     return Column(jc)
 
 
-@ignore_unicode_prefix
-@since(1.5)
-def sha1(col):
-    """Returns the hex string result of SHA-1.
-
-    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
-    [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
-    """
-    sc = SparkContext._active_spark_context
-    jc = sc._jvm.functions.sha1(_to_java_column(col))
-    return Column(jc)
-
-
 @since(1.4)
 def sparkPartitionId():
     """A column for partition ID of the Spark task.
@@ -409,6 +425,18 @@ def sparkPartitionId():
     return Column(sc._jvm.functions.sparkPartitionId())
 
 
+@ignore_unicode_prefix
+@since(1.5)
+def strlen(col):
+    """Calculates the length of a string expression.
+
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(strlen('a').alias('length')).collect()
+    [Row(length=3)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.strlen(_to_java_column(col)))
+
+
 @ignore_unicode_prefix
 @since(1.4)
 def struct(*cols):
@@ -471,6 +499,17 @@ def log(arg1, arg2=None):
     return Column(jc)
 
 
+@since(1.5)
+def log2(col):
+    """Returns the base-2 logarithm of the argument.
+
+    >>> sqlContext.createDataFrame([(4,)], ['a']).select(log2('a').alias('log2')).collect()
+    [Row(log2=2.0)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.log2(_to_java_column(col)))
+
+
 @since(1.4)
 def lag(col, count=1, default=None):
     """
--
cgit v1.2.3
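Usage note: below is a minimal, hypothetical sketch of the three functions this patch adds to the Python API (md5, strlen, log2), assuming a Spark 1.5-era standalone script where strlen still carries its name from this commit. The app name, sample data, and column names are illustrative only, not part of the patch.

    # Hypothetical demo script; assumes Spark at or near this commit.
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    from pyspark.sql.functions import log2, md5, strlen

    sc = SparkContext(appName="spark-8727-demo")  # illustrative app name
    sqlContext = SQLContext(sc)

    df = sqlContext.createDataFrame([('ABC', 4)], ['s', 'n'])
    df.select(
        md5('s').alias('hash'),       # 32-character hex digest
        strlen('s').alias('length'),  # string length, 3 here
        log2('n').alias('log2'),      # base-2 logarithm, 2.0 for n = 4
    ).show()

    sc.stop()

Each new wrapper follows the same pattern as the existing sha1/sha2 wrappers: pass the column through _to_java_column to the JVM-side function and wrap the result in a Column.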