diff options
author | Davies Liu <davies@databricks.com> | 2015-02-09 11:42:52 -0800 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-02-09 11:42:52 -0800 |
commit | afb131637d96e1e5e07eb8abf24e32e7f3b2304d (patch) | |
tree | e99ca6a3cb84cc402c2f8bdd1b560d51f84dc19d /python | |
parent | de7806048ac49a8bfdf44d8f87bc11cea1dfb242 (diff) | |
download | spark-afb131637d96e1e5e07eb8abf24e32e7f3b2304d.tar.gz spark-afb131637d96e1e5e07eb8abf24e32e7f3b2304d.tar.bz2 spark-afb131637d96e1e5e07eb8abf24e32e7f3b2304d.zip |
[SPARK-5678] Convert DataFrame to pandas.DataFrame and Series
```
pyspark.sql.DataFrame.to_pandas = to_pandas(self) unbound pyspark.sql.DataFrame method
Collect all the rows and return a `pandas.DataFrame`.
>>> df.to_pandas() # doctest: +SKIP
age name
0 2 Alice
1 5 Bob
pyspark.sql.Column.to_pandas = to_pandas(self) unbound pyspark.sql.Column method
Return a pandas.Series from the column
>>> df.age.to_pandas() # doctest: +SKIP
0 2
1 5
dtype: int64
```
Not tests by jenkins (they depends on pandas)
Author: Davies Liu <davies@databricks.com>
Closes #4476 from davies/to_pandas and squashes the following commits:
6276fb6 [Davies Liu] Convert DataFrame to pandas.DataFrame and Series
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/sql.py | 25 |
1 files changed, 25 insertions, 0 deletions
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index e55f285a77..6a6dfbc585 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -2284,6 +2284,18 @@ class DataFrame(object): """ return self.select('*', col.alias(colName)) + def to_pandas(self): + """ + Collect all the rows and return a `pandas.DataFrame`. + + >>> df.to_pandas() # doctest: +SKIP + age name + 0 2 Alice + 1 5 Bob + """ + import pandas as pd + return pd.DataFrame.from_records(self.collect(), columns=self.columns) + # Having SchemaRDD for backward compatibility (for docs) class SchemaRDD(DataFrame): @@ -2551,6 +2563,19 @@ class Column(DataFrame): jc = self._jc.cast(jdt) return Column(jc, self.sql_ctx) + def to_pandas(self): + """ + Return a pandas.Series from the column + + >>> df.age.to_pandas() # doctest: +SKIP + 0 2 + 1 5 + dtype: int64 + """ + import pandas as pd + data = [c for c, in self.collect()] + return pd.Series(data) + def _aggregate_func(name, doc=""): """ Create a function for aggregator by name""" |