diff options
author | Reynold Xin <rxin@databricks.com> | 2015-08-02 11:36:11 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-08-02 11:36:11 -0700 |
commit | 8eafa2aeb6c1b465cfdb99f04c2137fc3eac0c01 (patch) | |
tree | c7235322a7fb4d1b4d7adb45ba0ace2dbe42b748 | |
parent | 244016a95c43ce6db422378e85a9d527bfe59bf1 (diff) | |
download | spark-8eafa2aeb6c1b465cfdb99f04c2137fc3eac0c01.tar.gz spark-8eafa2aeb6c1b465cfdb99f04c2137fc3eac0c01.tar.bz2 spark-8eafa2aeb6c1b465cfdb99f04c2137fc3eac0c01.zip |
[SPARK-9208][SQL] Sort DataFrame functions alphabetically.
Author: Reynold Xin <rxin@databricks.com>
Closes #7861 from rxin/api-audit and squashes the following commits:
7200256 [Reynold Xin] [SPARK-9208][SQL] Sort DataFrame functions alphabetically.
-rw-r--r-- | sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 634 | ||||
-rw-r--r-- | sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala | 20 |
2 files changed, 291 insertions, 363 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 197cd3de61..3595829907 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -70,15 +70,6 @@ object functions { def column(colName: String): Column = Column(colName) /** - * Convert a number in string format from one base to another. - * - * @group math_funcs - * @since 1.5.0 - */ - def conv(num: Column, fromBase: Int, toBase: Int): Column = - Conv(num.expr, lit(fromBase).expr, lit(toBase).expr) - - /** * Creates a [[Column]] of literal value. * * The passed in object is returned directly if it is already a [[Column]]. @@ -132,36 +123,54 @@ object functions { ////////////////////////////////////////////////////////////////////////////////////////////// /** - * Aggregate function: returns the sum of all values in the expression. + * Aggregate function: returns the approximate number of distinct items in a group. * * @group agg_funcs * @since 1.3.0 */ - def sum(e: Column): Column = Sum(e.expr) + def approxCountDistinct(e: Column): Column = ApproxCountDistinct(e.expr) /** - * Aggregate function: returns the sum of all values in the given column. + * Aggregate function: returns the approximate number of distinct items in a group. * * @group agg_funcs * @since 1.3.0 */ - def sum(columnName: String): Column = sum(Column(columnName)) + def approxCountDistinct(columnName: String): Column = approxCountDistinct(column(columnName)) /** - * Aggregate function: returns the sum of distinct values in the expression. + * Aggregate function: returns the approximate number of distinct items in a group. * * @group agg_funcs * @since 1.3.0 */ - def sumDistinct(e: Column): Column = SumDistinct(e.expr) + def approxCountDistinct(e: Column, rsd: Double): Column = ApproxCountDistinct(e.expr, rsd) /** - * Aggregate function: returns the sum of distinct values in the expression. + * Aggregate function: returns the approximate number of distinct items in a group. * * @group agg_funcs * @since 1.3.0 */ - def sumDistinct(columnName: String): Column = sumDistinct(Column(columnName)) + def approxCountDistinct(columnName: String, rsd: Double): Column = { + approxCountDistinct(Column(columnName), rsd) + } + + /** + * Aggregate function: returns the average of the values in a group. + * + * @group agg_funcs + * @since 1.3.0 + */ + def avg(e: Column): Column = Average(e.expr) + + /** + * Aggregate function: returns the average of the values in a group. + * + * @group agg_funcs + * @since 1.3.0 + */ + def avg(columnName: String): Column = avg(Column(columnName)) /** * Aggregate function: returns the number of items in a group. @@ -204,140 +213,158 @@ object functions { countDistinct(Column(columnName), columnNames.map(Column.apply) : _*) /** - * Aggregate function: returns the approximate number of distinct items in a group. + * Aggregate function: returns the first value in a group. * * @group agg_funcs * @since 1.3.0 */ - def approxCountDistinct(e: Column): Column = ApproxCountDistinct(e.expr) + def first(e: Column): Column = First(e.expr) /** - * Aggregate function: returns the approximate number of distinct items in a group. + * Aggregate function: returns the first value of a column in a group. * * @group agg_funcs * @since 1.3.0 */ - def approxCountDistinct(columnName: String): Column = approxCountDistinct(column(columnName)) + def first(columnName: String): Column = first(Column(columnName)) /** - * Aggregate function: returns the approximate number of distinct items in a group. + * Aggregate function: returns the last value in a group. * * @group agg_funcs * @since 1.3.0 */ - def approxCountDistinct(e: Column, rsd: Double): Column = ApproxCountDistinct(e.expr, rsd) + def last(e: Column): Column = Last(e.expr) /** - * Aggregate function: returns the approximate number of distinct items in a group. + * Aggregate function: returns the last value of the column in a group. * * @group agg_funcs * @since 1.3.0 */ - def approxCountDistinct(columnName: String, rsd: Double): Column = { - approxCountDistinct(Column(columnName), rsd) - } + def last(columnName: String): Column = last(Column(columnName)) /** - * Aggregate function: returns the average of the values in a group. + * Aggregate function: returns the maximum value of the expression in a group. * * @group agg_funcs * @since 1.3.0 */ - def avg(e: Column): Column = Average(e.expr) + def max(e: Column): Column = Max(e.expr) /** - * Aggregate function: returns the average of the values in a group. + * Aggregate function: returns the maximum value of the column in a group. * * @group agg_funcs * @since 1.3.0 */ - def avg(columnName: String): Column = avg(Column(columnName)) + def max(columnName: String): Column = max(Column(columnName)) /** - * Aggregate function: returns the first value in a group. + * Aggregate function: returns the average of the values in a group. + * Alias for avg. * * @group agg_funcs - * @since 1.3.0 + * @since 1.4.0 */ - def first(e: Column): Column = First(e.expr) + def mean(e: Column): Column = avg(e) /** - * Aggregate function: returns the first value of a column in a group. + * Aggregate function: returns the average of the values in a group. + * Alias for avg. * * @group agg_funcs - * @since 1.3.0 + * @since 1.4.0 */ - def first(columnName: String): Column = first(Column(columnName)) + def mean(columnName: String): Column = avg(columnName) /** - * Aggregate function: returns the last value in a group. + * Aggregate function: returns the minimum value of the expression in a group. * * @group agg_funcs * @since 1.3.0 */ - def last(e: Column): Column = Last(e.expr) + def min(e: Column): Column = Min(e.expr) /** - * Aggregate function: returns the last value of the column in a group. + * Aggregate function: returns the minimum value of the column in a group. * * @group agg_funcs * @since 1.3.0 */ - def last(columnName: String): Column = last(Column(columnName)) + def min(columnName: String): Column = min(Column(columnName)) /** - * Aggregate function: returns the average of the values in a group. - * Alias for avg. + * Aggregate function: returns the sum of all values in the expression. * * @group agg_funcs - * @since 1.4.0 + * @since 1.3.0 */ - def mean(e: Column): Column = avg(e) + def sum(e: Column): Column = Sum(e.expr) /** - * Aggregate function: returns the average of the values in a group. - * Alias for avg. + * Aggregate function: returns the sum of all values in the given column. * * @group agg_funcs - * @since 1.4.0 + * @since 1.3.0 */ - def mean(columnName: String): Column = avg(columnName) + def sum(columnName: String): Column = sum(Column(columnName)) /** - * Aggregate function: returns the minimum value of the expression in a group. + * Aggregate function: returns the sum of distinct values in the expression. * * @group agg_funcs * @since 1.3.0 */ - def min(e: Column): Column = Min(e.expr) + def sumDistinct(e: Column): Column = SumDistinct(e.expr) /** - * Aggregate function: returns the minimum value of the column in a group. + * Aggregate function: returns the sum of distinct values in the expression. * * @group agg_funcs * @since 1.3.0 */ - def min(columnName: String): Column = min(Column(columnName)) + def sumDistinct(columnName: String): Column = sumDistinct(Column(columnName)) + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Window functions + ////////////////////////////////////////////////////////////////////////////////////////////// /** - * Aggregate function: returns the maximum value of the expression in a group. + * Window function: returns the cumulative distribution of values within a window partition, + * i.e. the fraction of rows that are below the current row. * - * @group agg_funcs - * @since 1.3.0 + * {{{ + * N = total number of rows in the partition + * cumeDist(x) = number of values before (and including) x / N + * }}} + * + * + * This is equivalent to the CUME_DIST function in SQL. + * + * @group window_funcs + * @since 1.4.0 */ - def max(e: Column): Column = Max(e.expr) + def cumeDist(): Column = { + UnresolvedWindowFunction("cume_dist", Nil) + } /** - * Aggregate function: returns the maximum value of the column in a group. + * Window function: returns the rank of rows within a window partition, without any gaps. * - * @group agg_funcs - * @since 1.3.0 + * The difference between rank and denseRank is that denseRank leaves no gaps in ranking + * sequence when there are ties. That is, if you were ranking a competition using denseRank + * and had three people tie for second place, you would say that all three were in second + * place and that the next person came in third. + * + * This is equivalent to the DENSE_RANK function in SQL. + * + * @group window_funcs + * @since 1.4.0 */ - def max(columnName: String): Column = max(Column(columnName)) - - ////////////////////////////////////////////////////////////////////////////////////////////// - // Window functions - ////////////////////////////////////////////////////////////////////////////////////////////// + def denseRank(): Column = { + UnresolvedWindowFunction("dense_rank", Nil) + } /** * Window function: returns the value that is `offset` rows before the current row, and @@ -466,32 +493,20 @@ object functions { } /** - * Window function: returns a sequential number starting at 1 within a window partition. - * - * This is equivalent to the ROW_NUMBER function in SQL. - * - * @group window_funcs - * @since 1.4.0 - */ - def rowNumber(): Column = { - UnresolvedWindowFunction("row_number", Nil) - } - - /** - * Window function: returns the rank of rows within a window partition, without any gaps. + * Window function: returns the relative rank (i.e. percentile) of rows within a window partition. * - * The difference between rank and denseRank is that denseRank leaves no gaps in ranking - * sequence when there are ties. That is, if you were ranking a competition using denseRank - * and had three people tie for second place, you would say that all three were in second - * place and that the next person came in third. + * This is computed by: + * {{{ + * (rank of row in its partition - 1) / (number of rows in the partition - 1) + * }}} * - * This is equivalent to the DENSE_RANK function in SQL. + * This is equivalent to the PERCENT_RANK function in SQL. * * @group window_funcs * @since 1.4.0 */ - def denseRank(): Column = { - UnresolvedWindowFunction("dense_rank", Nil) + def percentRank(): Column = { + UnresolvedWindowFunction("percent_rank", Nil) } /** @@ -512,39 +527,15 @@ object functions { } /** - * Window function: returns the cumulative distribution of values within a window partition, - * i.e. the fraction of rows that are below the current row. - * - * {{{ - * N = total number of rows in the partition - * cumeDist(x) = number of values before (and including) x / N - * }}} - * - * - * This is equivalent to the CUME_DIST function in SQL. - * - * @group window_funcs - * @since 1.4.0 - */ - def cumeDist(): Column = { - UnresolvedWindowFunction("cume_dist", Nil) - } - - /** - * Window function: returns the relative rank (i.e. percentile) of rows within a window partition. - * - * This is computed by: - * {{{ - * (rank of row in its partition - 1) / (number of rows in the partition - 1) - * }}} + * Window function: returns a sequential number starting at 1 within a window partition. * - * This is equivalent to the PERCENT_RANK function in SQL. + * This is equivalent to the ROW_NUMBER function in SQL. * * @group window_funcs * @since 1.4.0 */ - def percentRank(): Column = { - UnresolvedWindowFunction("percent_rank", Nil) + def rowNumber(): Column = { + UnresolvedWindowFunction("row_number", Nil) } ////////////////////////////////////////////////////////////////////////////////////////////// @@ -595,10 +586,10 @@ object functions { } /** - * Returns the first column that is not null and not NaN. - * {{{ - * df.select(coalesce(df("a"), df("b"))) - * }}} + * Returns the first column that is not null, or null if all inputs are null. + * + * For example, `coalesce(a, b, c)` will return a if a is not null, + * or b if a is null and b is not null, or c if both a and b are null but c is not null. * * @group normal_funcs * @since 1.3.0 @@ -607,9 +598,11 @@ object functions { def coalesce(e: Column*): Column = Coalesce(e.map(_.expr)) /** - * Creates a new row for each element in the given array or map column. + * Creates a string column for the file name of the current Spark task. + * + * @group normal_funcs */ - def explode(e: Column): Column = Explode(e.expr) + def inputFileName(): Column = InputFileName() /** * Return true iff the column is NaN. @@ -637,13 +630,14 @@ object functions { def monotonicallyIncreasingId(): Column = MonotonicallyIncreasingID() /** - * Return an alternative value `r` if `l` is NaN. - * This function is useful for mapping NaN values to null. + * Returns col1 if it is not NaN, or col2 if col1 is NaN. + * + * Both inputs should be floating point columns (DoubleType or FloatType). * * @group normal_funcs * @since 1.5.0 */ - def nanvl(l: Column, r: Column): Column = NaNvl(l.expr, r.expr) + def nanvl(col1: Column, col2: Column): Column = NaNvl(col1.expr, col2.expr) /** * Unary minus, i.e. negate the expression. @@ -677,31 +671,6 @@ object functions { def not(e: Column): Column = !e /** - * Evaluates a list of conditions and returns one of multiple possible result expressions. - * If otherwise is not defined at the end, null is returned for unmatched conditions. - * - * {{{ - * // Example: encoding gender string column into integer. - * - * // Scala: - * people.select(when(people("gender") === "male", 0) - * .when(people("gender") === "female", 1) - * .otherwise(2)) - * - * // Java: - * people.select(when(col("gender").equalTo("male"), 0) - * .when(col("gender").equalTo("female"), 1) - * .otherwise(2)) - * }}} - * - * @group normal_funcs - * @since 1.4.0 - */ - def when(condition: Column, value: Any): Column = { - CaseWhen(Seq(condition.expr, lit(value).expr)) - } - - /** * Generate a random column with i.i.d. samples from U[0.0, 1.0]. * * @group normal_funcs @@ -744,15 +713,6 @@ object functions { def sparkPartitionId(): Column = SparkPartitionID() /** - * The file name of the current Spark task - * - * Note that this is indeterministic becuase it depends on what is currently being read in. - * - * @group normal_funcs - */ - def inputFileName(): Column = InputFileName() - - /** * Computes the square root of the specified float value. * * @group math_funcs @@ -794,6 +754,31 @@ object functions { } /** + * Evaluates a list of conditions and returns one of multiple possible result expressions. + * If otherwise is not defined at the end, null is returned for unmatched conditions. + * + * {{{ + * // Example: encoding gender string column into integer. + * + * // Scala: + * people.select(when(people("gender") === "male", 0) + * .when(people("gender") === "female", 1) + * .otherwise(2)) + * + * // Java: + * people.select(when(col("gender").equalTo("male"), 0) + * .when(col("gender").equalTo("female"), 1) + * .otherwise(2)) + * }}} + * + * @group normal_funcs + * @since 1.4.0 + */ + def when(condition: Column, value: Any): Column = { + CaseWhen(Seq(condition.expr, lit(value).expr)) + } + + /** * Computes bitwise NOT. * * @group normal_funcs @@ -993,6 +978,15 @@ object functions { def ceil(columnName: String): Column = ceil(Column(columnName)) /** + * Convert a number in a string column from one base to another. + * + * @group math_funcs + * @since 1.5.0 + */ + def conv(num: Column, fromBase: Int, toBase: Int): Column = + Conv(num.expr, lit(fromBase).expr, lit(toBase).expr) + + /** * Computes the cosine of the given value. * * @group math_funcs @@ -1025,22 +1019,6 @@ object functions { def cosh(columnName: String): Column = cosh(Column(columnName)) /** - * Returns the current date. - * - * @group datetime_funcs - * @since 1.5.0 - */ - def current_date(): Column = CurrentDate() - - /** - * Returns the current timestamp. - * - * @group datetime_funcs - * @since 1.5.0 - */ - def current_timestamp(): Column = CurrentTimestamp() - - /** * Computes the exponential of the given value. * * @group math_funcs @@ -1671,109 +1649,75 @@ object functions { ////////////////////////////////////////////////////////////////////////////////////////////// /** - * Concatenates input strings together into a single string. - * - * @group string_funcs - * @since 1.5.0 - */ - @scala.annotation.varargs - def concat(exprs: Column*): Column = Concat(exprs.map(_.expr)) - - /** - * Concatenates input strings together into a single string, using the given separator. + * Computes the numeric value of the first character of the string column, and returns the + * result as a int column. * * @group string_funcs * @since 1.5.0 */ - @scala.annotation.varargs - def concat_ws(sep: String, exprs: Column*): Column = { - ConcatWs(Literal.create(sep, StringType) +: exprs.map(_.expr)) - } + def ascii(e: Column): Column = Ascii(e.expr) /** - * Computes the length of a given string / binary value. + * Computes the BASE64 encoding of a binary column and returns it as a string column. + * This is the reverse of unbase64. * * @group string_funcs * @since 1.5.0 */ - def length(e: Column): Column = Length(e.expr) - - /** - * Converts a string expression to lower case. - * - * @group string_funcs - * @since 1.3.0 - */ - def lower(e: Column): Column = Lower(e.expr) - - /** - * Converts a string expression to upper case. - * - * @group string_funcs - * @since 1.3.0 - */ - def upper(e: Column): Column = Upper(e.expr) + def base64(e: Column): Column = Base64(e.expr) /** - * Formats the number X to a format like '#,###,###.##', rounded to d decimal places, - * and returns the result as a string. - * If d is 0, the result has no decimal point or fractional part. - * If d < 0, the result will be null. + * Concatenates multiple input string columns together into a single string column. * * @group string_funcs * @since 1.5.0 */ - def format_number(x: Column, d: Int): Column = FormatNumber(x.expr, lit(d).expr) + @scala.annotation.varargs + def concat(exprs: Column*): Column = Concat(exprs.map(_.expr)) /** - * Substring starts at `pos` and is of length `len` when str is String type or - * returns the slice of byte array that starts at `pos` in byte and is of length `len` - * when str is Binary type + * Concatenates multiple input string columns together into a single string column, + * using the given separator. * * @group string_funcs * @since 1.5.0 */ - def substring(str: Column, pos: Int, len: Int): Column = - Substring(str.expr, lit(pos).expr, lit(len).expr) - - /** - * Computes the Levenshtein distance of the two given string columns. - * @group string_funcs - * @since 1.5.0 - */ - def levenshtein(l: Column, r: Column): Column = Levenshtein(l.expr, r.expr) + @scala.annotation.varargs + def concat_ws(sep: String, exprs: Column*): Column = { + ConcatWs(Literal.create(sep, StringType) +: exprs.map(_.expr)) + } /** - * Computes the numeric value of the first character of the specified string column. + * Computes the first argument into a string from a binary using the provided character set + * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + * If either argument is null, the result will also be null. * * @group string_funcs * @since 1.5.0 */ - def ascii(e: Column): Column = Ascii(e.expr) + def decode(value: Column, charset: String): Column = Decode(value.expr, lit(charset).expr) /** - * Trim the spaces from both ends for the specified string column. + * Computes the first argument into a binary from a string using the provided character set + * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + * If either argument is null, the result will also be null. * * @group string_funcs * @since 1.5.0 */ - def trim(e: Column): Column = StringTrim(e.expr) + def encode(value: Column, charset: String): Column = Encode(value.expr, lit(charset).expr) /** - * Trim the spaces from left end for the specified string value. + * Formats numeric column x to a format like '#,###,###.##', rounded to d decimal places, + * and returns the result as a string column. * - * @group string_funcs - * @since 1.5.0 - */ - def ltrim(e: Column): Column = StringTrimLeft(e.expr) - - /** - * Trim the spaces from right end for the specified string value. + * If d is 0, the result has no decimal point or fractional part. + * If d < 0, the result will be null. * * @group string_funcs * @since 1.5.0 */ - def rtrim(e: Column): Column = StringTrimRight(e.expr) + def format_number(x: Column, d: Int): Column = FormatNumber(x.expr, lit(d).expr) /** * Formats the arguments in printf-style and returns the result as a string column. @@ -1787,9 +1731,11 @@ object functions { } /** - * Returns string, with the first letter of each word in uppercase. + * Returns a new string column by converting the first letter of each word to uppercase. * Words are delimited by whitespace. * + * For example, "hello world" will become "Hello World". + * * @group string_funcs * @since 1.5.0 */ @@ -1808,15 +1754,27 @@ object functions { def instr(str: Column, substring: String): Column = StringInstr(str.expr, lit(substring).expr) /** - * Returns the substring from string str before count occurrences of the delimiter delim. - * If count is positive, everything the left of the final delimiter (counting from left) is - * returned. If count is negative, every to the right of the final delimiter (counting from the - * right) is returned. substring_index performs a case-sensitive match when searching for delim. + * Computes the length of a given string or binary column. * * @group string_funcs + * @since 1.5.0 */ - def substring_index(str: Column, delim: String, count: Int): Column = - SubstringIndex(str.expr, lit(delim).expr, lit(count).expr) + def length(e: Column): Column = Length(e.expr) + + /** + * Converts a string column to lower case. + * + * @group string_funcs + * @since 1.3.0 + */ + def lower(e: Column): Column = Lower(e.expr) + + /** + * Computes the Levenshtein distance of the two given string columns. + * @group string_funcs + * @since 1.5.0 + */ + def levenshtein(l: Column, r: Column): Column = Levenshtein(l.expr, r.expr) /** * Locate the position of the first occurrence of substr. @@ -1831,6 +1789,14 @@ object functions { } /** + * Trim the spaces from left end for the specified string value. + * + * @group string_funcs + * @since 1.5.0 + */ + def ltrim(e: Column): Column = StringTrimLeft(e.expr) + + /** * Locate the position of the first occurrence of substr in a string column, after position pos. * * NOTE: The position is not zero based, but 1 based index. returns 0 if substr @@ -1843,6 +1809,15 @@ object functions { StringLocate(lit(substr).expr, str.expr, lit(pos).expr) } + /** + * Left-pad the string column with + * + * @group string_funcs + * @since 1.5.0 + */ + def lpad(str: Column, len: Int, pad: String): Column = { + StringLPad(str.expr, lit(len).expr, lit(pad).expr) + } /** * Extract a specific(idx) group identified by a java regex, from the specified string column. @@ -1865,101 +1840,108 @@ object functions { } /** - * Computes the BASE64 encoding of a binary column and returns it as a string column. - * This is the reverse of unbase64. + * Decodes a BASE64 encoded string column and returns it as a binary column. + * This is the reverse of base64. * * @group string_funcs * @since 1.5.0 */ - def base64(e: Column): Column = Base64(e.expr) + def unbase64(e: Column): Column = UnBase64(e.expr) /** - * Decodes a BASE64 encoded string column and returns it as a binary column. - * This is the reverse of base64. + * Right-padded with pad to a length of len. * * @group string_funcs * @since 1.5.0 */ - def unbase64(e: Column): Column = UnBase64(e.expr) + def rpad(str: Column, len: Int, pad: String): Column = { + StringRPad(str.expr, lit(len).expr, lit(pad).expr) + } /** - * Left-padded with pad to a length of len. + * Repeats a string column n times, and returns it as a new string column. * * @group string_funcs * @since 1.5.0 */ - def lpad(str: Column, len: Int, pad: String): Column = { - StringLPad(str.expr, lit(len).expr, lit(pad).expr) + def repeat(str: Column, n: Int): Column = { + StringRepeat(str.expr, lit(n).expr) } /** - * Computes the first argument into a binary from a string using the provided character set - * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). - * If either argument is null, the result will also be null. + * Reverses the string column and returns it as a new string column. * * @group string_funcs * @since 1.5.0 */ - def encode(value: Column, charset: String): Column = Encode(value.expr, lit(charset).expr) + def reverse(str: Column): Column = { + StringReverse(str.expr) + } /** - * Computes the first argument into a string from a binary using the provided character set - * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). - * If either argument is null, the result will also be null. + * Trim the spaces from right end for the specified string value. * * @group string_funcs * @since 1.5.0 */ - def decode(value: Column, charset: String): Column = Decode(value.expr, lit(charset).expr) + def rtrim(e: Column): Column = StringTrimRight(e.expr) /** - * Right-padded with pad to a length of len. + * * Return the soundex code for the specified expression. * * @group string_funcs * @since 1.5.0 */ - def rpad(str: Column, len: Int, pad: String): Column = { - StringRPad(str.expr, lit(len).expr, lit(pad).expr) - } + def soundex(e: Column): Column = SoundEx(e.expr) /** - * Repeats a string column n times, and returns it as a new string column. + * Splits str around pattern (pattern is a regular expression). + * NOTE: pattern is a string represent the regular expression. * * @group string_funcs * @since 1.5.0 */ - def repeat(str: Column, n: Int): Column = { - StringRepeat(str.expr, lit(n).expr) + def split(str: Column, pattern: String): Column = { + StringSplit(str.expr, lit(pattern).expr) } /** - * * Return the soundex code for the specified expression. + * Substring starts at `pos` and is of length `len` when str is String type or + * returns the slice of byte array that starts at `pos` in byte and is of length `len` + * when str is Binary type * * @group string_funcs * @since 1.5.0 */ - def soundex(e: Column): Column = SoundEx(e.expr) + def substring(str: Column, pos: Int, len: Int): Column = + Substring(str.expr, lit(pos).expr, lit(len).expr) /** - * Splits str around pattern (pattern is a regular expression). - * NOTE: pattern is a string represent the regular expression. + * Returns the substring from string str before count occurrences of the delimiter delim. + * If count is positive, everything the left of the final delimiter (counting from left) is + * returned. If count is negative, every to the right of the final delimiter (counting from the + * right) is returned. substring_index performs a case-sensitive match when searching for delim. * * @group string_funcs - * @since 1.5.0 */ - def split(str: Column, pattern: String): Column = { - StringSplit(str.expr, lit(pattern).expr) - } + def substring_index(str: Column, delim: String, count: Int): Column = + SubstringIndex(str.expr, lit(delim).expr, lit(count).expr) /** - * Reversed the string for the specified value. + * Trim the spaces from both ends for the specified string column. * * @group string_funcs * @since 1.5.0 */ - def reverse(str: Column): Column = { - StringReverse(str.expr) - } + def trim(e: Column): Column = StringTrim(e.expr) + + /** + * Converts a string column to upper case. + * + * @group string_funcs + * @since 1.3.0 + */ + def upper(e: Column): Column = Upper(e.expr) ////////////////////////////////////////////////////////////////////////////////////////////// // DateTime functions @@ -1967,6 +1949,7 @@ object functions { /** * Returns the date that is numMonths after startDate. + * * @group datetime_funcs * @since 1.5.0 */ @@ -1974,20 +1957,20 @@ object functions { AddMonths(startDate.expr, Literal(numMonths)) /** - * Converts a date/timestamp/string to a value of string in the format specified by the date - * format given by the second argument. + * Returns the current date as a date column. * - * A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All - * pattern letters of [[java.text.SimpleDateFormat]] can be used. - * - * NOTE: Use when ever possible specialized functions like [[year]]. These benefit from a - * specialized implementation. + * @group datetime_funcs + * @since 1.5.0 + */ + def current_date(): Column = CurrentDate() + + /** + * Returns the current timestamp as a timestamp column. * * @group datetime_funcs * @since 1.5.0 */ - def date_format(dateExpr: Column, format: String): Column = - DateFormatClass(dateExpr.expr, Literal(format)) + def current_timestamp(): Column = CurrentTimestamp() /** * Converts a date/timestamp/string to a value of string in the format specified by the date @@ -2002,8 +1985,8 @@ object functions { * @group datetime_funcs * @since 1.5.0 */ - def date_format(dateColumnName: String, format: String): Column = - date_format(Column(dateColumnName), format) + def date_format(dateExpr: Column, format: String): Column = + DateFormatClass(dateExpr.expr, Literal(format)) /** * Returns the date that is `days` days after `start` @@ -2034,13 +2017,6 @@ object functions { def year(e: Column): Column = Year(e.expr) /** - * Extracts the year as an integer from a given date/timestamp/string. - * @group datetime_funcs - * @since 1.5.0 - */ - def year(columnName: String): Column = year(Column(columnName)) - - /** * Extracts the quarter as an integer from a given date/timestamp/string. * @group datetime_funcs * @since 1.5.0 @@ -2048,13 +2024,6 @@ object functions { def quarter(e: Column): Column = Quarter(e.expr) /** - * Extracts the quarter as an integer from a given date/timestamp/string. - * @group datetime_funcs - * @since 1.5.0 - */ - def quarter(columnName: String): Column = quarter(Column(columnName)) - - /** * Extracts the month as an integer from a given date/timestamp/string. * @group datetime_funcs * @since 1.5.0 @@ -2062,13 +2031,6 @@ object functions { def month(e: Column): Column = Month(e.expr) /** - * Extracts the month as an integer from a given date/timestamp/string. - * @group datetime_funcs - * @since 1.5.0 - */ - def month(columnName: String): Column = month(Column(columnName)) - - /** * Extracts the day of the month as an integer from a given date/timestamp/string. * @group datetime_funcs * @since 1.5.0 @@ -2076,13 +2038,6 @@ object functions { def dayofmonth(e: Column): Column = DayOfMonth(e.expr) /** - * Extracts the day of the month as an integer from a given date/timestamp/string. - * @group datetime_funcs - * @since 1.5.0 - */ - def dayofmonth(columnName: String): Column = dayofmonth(Column(columnName)) - - /** * Extracts the day of the year as an integer from a given date/timestamp/string. * @group datetime_funcs * @since 1.5.0 @@ -2090,13 +2045,6 @@ object functions { def dayofyear(e: Column): Column = DayOfYear(e.expr) /** - * Extracts the day of the year as an integer from a given date/timestamp/string. - * @group datetime_funcs - * @since 1.5.0 - */ - def dayofyear(columnName: String): Column = dayofyear(Column(columnName)) - - /** * Extracts the hours as an integer from a given date/timestamp/string. * @group datetime_funcs * @since 1.5.0 @@ -2104,13 +2052,6 @@ object functions { def hour(e: Column): Column = Hour(e.expr) /** - * Extracts the hours as an integer from a given date/timestamp/string. - * @group datetime_funcs - * @since 1.5.0 - */ - def hour(columnName: String): Column = hour(Column(columnName)) - - /** * Given a date column, returns the last day of the month which the given date belongs to. * For example, input "2015-07-27" returns "2015-07-31" since July 31 is the last day of the * month in July 2015. @@ -2127,13 +2068,6 @@ object functions { */ def minute(e: Column): Column = Minute(e.expr) - /** - * Extracts the minutes as an integer from a given date/timestamp/string. - * @group datetime_funcs - * @since 1.5.0 - */ - def minute(columnName: String): Column = minute(Column(columnName)) - /* * Returns number of months between dates `date1` and `date2`. * @group datetime_funcs @@ -2164,13 +2098,6 @@ object functions { def second(e: Column): Column = Second(e.expr) /** - * Extracts the seconds as an integer from a given date/timestamp/string. - * @group datetime_funcs - * @since 1.5.0 - */ - def second(columnName: String): Column = second(Column(columnName)) - - /** * Extracts the week number as an integer from a given date/timestamp/string. * @group datetime_funcs * @since 1.5.0 @@ -2178,13 +2105,6 @@ object functions { def weekofyear(e: Column): Column = WeekOfYear(e.expr) /** - * Extracts the week number as an integer from a given date/timestamp/string. - * @group datetime_funcs - * @since 1.5.0 - */ - def weekofyear(columnName: String): Column = weekofyear(Column(columnName)) - - /** * Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string * representing the timestamp of that moment in the current system time zone in the given * format. @@ -2226,7 +2146,7 @@ object functions { */ def unix_timestamp(s: Column, p: String): Column = UnixTimestamp(s.expr, Literal(p)) - /* + /** * Converts the column into DateType. * * @group datetime_funcs @@ -2265,6 +2185,14 @@ object functions { ////////////////////////////////////////////////////////////////////////////////////////////// /** + * Creates a new row for each element in the given array or map column. + * + * @group collection_funcs + * @since 1.3.0 + */ + def explode(e: Column): Column = Explode(e.expr) + + /** * Returns length of array or map. * * @group collection_funcs @@ -2279,7 +2207,7 @@ object functions { * @group collection_funcs * @since 1.5.0 */ - def sort_array(e: Column): Column = sort_array(e, true) + def sort_array(e: Column): Column = sort_array(e, asc = true) /** * Sorts the input array for the given column in ascending / descending order, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 0850f5cf77..17897caf95 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -89,7 +89,7 @@ class DateFunctionsSuite extends QueryTest { val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") checkAnswer( - df.select(date_format("a", "y"), date_format("b", "y"), date_format("c", "y")), + df.select(date_format($"a", "y"), date_format($"b", "y"), date_format($"c", "y")), Row("2015", "2015", "2013")) checkAnswer( @@ -101,7 +101,7 @@ class DateFunctionsSuite extends QueryTest { val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c") checkAnswer( - df.select(year("a"), year("b"), year("c")), + df.select(year($"a"), year($"b"), year($"c")), Row(2015, 2015, 2013)) checkAnswer( @@ -115,7 +115,7 @@ class DateFunctionsSuite extends QueryTest { val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c") checkAnswer( - df.select(quarter("a"), quarter("b"), quarter("c")), + df.select(quarter($"a"), quarter($"b"), quarter($"c")), Row(2, 2, 4)) checkAnswer( @@ -127,7 +127,7 @@ class DateFunctionsSuite extends QueryTest { val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c") checkAnswer( - df.select(month("a"), month("b"), month("c")), + df.select(month($"a"), month($"b"), month($"c")), Row(4, 4, 4)) checkAnswer( @@ -139,7 +139,7 @@ class DateFunctionsSuite extends QueryTest { val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c") checkAnswer( - df.select(dayofmonth("a"), dayofmonth("b"), dayofmonth("c")), + df.select(dayofmonth($"a"), dayofmonth($"b"), dayofmonth($"c")), Row(8, 8, 8)) checkAnswer( @@ -151,7 +151,7 @@ class DateFunctionsSuite extends QueryTest { val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c") checkAnswer( - df.select(dayofyear("a"), dayofyear("b"), dayofyear("c")), + df.select(dayofyear($"a"), dayofyear($"b"), dayofyear($"c")), Row(98, 98, 98)) checkAnswer( @@ -163,7 +163,7 @@ class DateFunctionsSuite extends QueryTest { val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") checkAnswer( - df.select(hour("a"), hour("b"), hour("c")), + df.select(hour($"a"), hour($"b"), hour($"c")), Row(0, 13, 13)) checkAnswer( @@ -175,7 +175,7 @@ class DateFunctionsSuite extends QueryTest { val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") checkAnswer( - df.select(minute("a"), minute("b"), minute("c")), + df.select(minute($"a"), minute($"b"), minute($"c")), Row(0, 10, 10)) checkAnswer( @@ -187,7 +187,7 @@ class DateFunctionsSuite extends QueryTest { val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") checkAnswer( - df.select(second("a"), second("b"), second("c")), + df.select(second($"a"), second($"b"), second($"c")), Row(0, 15, 15)) checkAnswer( @@ -199,7 +199,7 @@ class DateFunctionsSuite extends QueryTest { val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c") checkAnswer( - df.select(weekofyear("a"), weekofyear("b"), weekofyear("c")), + df.select(weekofyear($"a"), weekofyear($"b"), weekofyear($"c")), Row(15, 15, 15)) checkAnswer( |