[SPARK-5678] Convert DataFrame to pandas.DataFrame and Series

```
pyspark.sql.DataFrame.to_pandas = to_pandas(self) unbound pyspark.sql.DataFrame method
    Collect all the rows and return a `pandas.DataFrame`.

    >>> df.to_pandas()  # doctest: +SKIP
       age   name
    0    2  Alice
    1    5    Bob

pyspark.sql.Column.to_pandas = to_pandas(self) unbound pyspark.sql.Column method
    Return a pandas.Series from the column

    >>> df.age.to_pandas()  # doctest: +SKIP
    0    2
    1    5
    dtype: int64
```

Not tested by Jenkins (the doctests depend on pandas).

Author: Davies Liu <davies@databricks.com>

Closes #4476 from davies/to_pandas and squashes the following commits:

6276fb6 [Davies Liu] Convert DataFrame to pandas.DataFrame and Series
author: Davies Liu <davies@databricks.com> 2015-02-09 11:42:52 -0800
committer: Reynold Xin <rxin@databricks.com> 2015-02-09 11:42:52 -0800
commit: afb131637d96e1e5e07eb8abf24e32e7f3b2304d (patch)
tree: e99ca6a3cb84cc402c2f8bdd1b560d51f84dc19d /python
parent: de7806048ac49a8bfdf44d8f87bc11cea1dfb242 (diff)
download: spark-afb131637d96e1e5e07eb8abf24e32e7f3b2304d.tar.gz
spark-afb131637d96e1e5e07eb8abf24e32e7f3b2304d.tar.bz2
spark-afb131637d96e1e5e07eb8abf24e32e7f3b2304d.zip
1 files changed, 25 insertions, 0 deletions
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index e55f285a77..6a6dfbc585 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -2284,6 +2284,18 @@ class DataFrame(object):
         """
         return self.select('*', col.alias(colName))
 
+    def to_pandas(self):
+        """
+        Collect all the rows and return a `pandas.DataFrame`.
+
+        >>> df.to_pandas()  # doctest: +SKIP
+           age   name
+        0    2  Alice
+        1    5    Bob
+        """
+        import pandas as pd
+        return pd.DataFrame.from_records(self.collect(), columns=self.columns)
+
 
 # Having SchemaRDD for backward compatibility (for docs)
 class SchemaRDD(DataFrame):
@@ -2551,6 +2563,19 @@ class Column(DataFrame):
             jc = self._jc.cast(jdt)
         return Column(jc, self.sql_ctx)
 
+    def to_pandas(self):
+        """
+        Return a pandas.Series from the column
+
+        >>> df.age.to_pandas()  # doctest: +SKIP
+        0    2
+        1    5
+        dtype: int64
+        """
+        import pandas as pd
+        data = [c for c, in self.collect()]
+        return pd.Series(data)
+
 
 def _aggregate_func(name, doc=""):
     """ Create a function for aggregator by name"""
author	Davies Liu <davies@databricks.com>	2015-02-09 11:42:52 -0800
committer	Reynold Xin <rxin@databricks.com>	2015-02-09 11:42:52 -0800
commit	afb131637d96e1e5e07eb8abf24e32e7f3b2304d (patch)
tree	e99ca6a3cb84cc402c2f8bdd1b560d51f84dc19d /python
parent	de7806048ac49a8bfdf44d8f87bc11cea1dfb242 (diff)
download	spark-afb131637d96e1e5e07eb8abf24e32e7f3b2304d.tar.gz spark-afb131637d96e1e5e07eb8abf24e32e7f3b2304d.tar.bz2 spark-afb131637d96e1e5e07eb8abf24e32e7f3b2304d.zip