From fae4e2d6094de57a438ee4188ce47fc5b01b96fe Mon Sep 17 00:00:00 2001 From: ksonj Date: Thu, 7 May 2015 01:02:00 -0700 Subject: [SPARK-7035] Encourage __getitem__ over __getattr__ on column access in the Python DataFrame API Author: ksonj Closes #5971 from ksonj/doc and squashes the following commits: dadfebb [ksonj] __getitem__ is cleaner than __getattr__ --- docs/sql-programming-guide.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'docs/sql-programming-guide.md') diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index b8233ae06f..df4c123bdd 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -139,7 +139,6 @@ DataFrames provide a domain-specific language for structured data manipulation i Here we include some basic examples of structured data processing using DataFrames: -
{% highlight scala %} @@ -242,6 +241,12 @@ df.groupBy("age").count().show();
+In Python it's possible to access a DataFrame's columns either by attribute +(`df.age`) or by indexing (`df['age']`). While the former is convenient for +interactive data exploration, users are highly encouraged to use the +latter form, which is future-proof and won't break with column names that +are also attributes on the DataFrame class. + {% highlight python %} from pyspark.sql import SQLContext sqlContext = SQLContext(sc) @@ -270,14 +275,14 @@ df.select("name").show() ## Justin # Select everybody, but increment the age by 1 -df.select(df.name, df.age + 1).show() +df.select(df['name'], df['age'] + 1).show() ## name (age + 1) ## Michael null ## Andy 31 ## Justin 20 # Select people older than 21 -df.filter(df.age > 21).show() +df.filter(df['age'] > 21).show() ## age name ## 30 Andy -- cgit v1.2.3