author    Davies Liu <davies@databricks.com>    2015-02-10 19:40:12 -0800
committer Michael Armbrust <michael@databricks.com>    2015-02-10 19:40:12 -0800
commit    ea60284095cad43aa7ac98256576375d0e91a52a (patch)
tree      35ac6e3935e1e7c731f7b9a850f2daa9640387d1 /docs
parent    a60aea86b4d4b716b5ec3bff776b509fe0831342 (diff)
[SPARK-5704] [SQL] [PySpark] createDataFrame from RDD with columns
Deprecate inferSchema() and applySchema(); use createDataFrame() instead, which can take an optional `schema` to create a DataFrame from an RDD. The `schema` can be a StructType or a list of column names.

Author: Davies Liu <davies@databricks.com>

Closes #4498 from davies/create and squashes the following commits:

08469c1 [Davies Liu] remove Scala/Java API for now
c80a7a9 [Davies Liu] fix hive test
d1bd8f2 [Davies Liu] cleanup applySchema
9526e97 [Davies Liu] createDataFrame from RDD with columns
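For context, a minimal PySpark sketch of the API change described above: `createDataFrame` accepts an RDD together with either an explicit `StructType` or simply a list of column names. The local data, context setup, and column names below are illustrative assumptions, not code from this patch.

{% highlight python %}
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

sc = SparkContext("local", "createDataFrame example")
sqlContext = SQLContext(sc)

# An illustrative RDD of tuples.
rdd = sc.parallelize([("Alice", 1), ("Bob", 2)])

# Schema given as a plain list of column names (the convenience this patch adds);
# the field types are inferred from the data.
df1 = sqlContext.createDataFrame(rdd, ["name", "age"])

# Schema given as an explicit StructType.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
df2 = sqlContext.createDataFrame(rdd, schema)
{% endhighlight %}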
Diffstat (limited to 'docs')
-rw-r--r--  docs/ml-guide.md               | 12
-rw-r--r--  docs/sql-programming-guide.md  | 16
2 files changed, 14 insertions, 14 deletions
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index be178d7689..4bf14fba34 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -260,7 +260,7 @@ List<LabeledPoint> localTraining = Lists.newArrayList(
new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
-JavaSchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledPoint.class);
+JavaSchemaRDD training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class);
// Create a LogisticRegression instance. This instance is an Estimator.
LogisticRegression lr = new LogisticRegression();
@@ -300,7 +300,7 @@ List<LabeledPoint> localTest = Lists.newArrayList(
new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)));
-JavaSchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), LabeledPoint.class);
+JavaSchemaRDD test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class);
// Make predictions on test documents using the Transformer.transform() method.
// LogisticRegression.transform will only use the 'features' column.
@@ -443,7 +443,7 @@ List<LabeledDocument> localTraining = Lists.newArrayList(
new LabeledDocument(2L, "spark f g h", 1.0),
new LabeledDocument(3L, "hadoop mapreduce", 0.0));
JavaSchemaRDD training =
- jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class);
+ jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class);
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
Tokenizer tokenizer = new Tokenizer()
@@ -469,7 +469,7 @@ List<Document> localTest = Lists.newArrayList(
new Document(6L, "mapreduce spark"),
new Document(7L, "apache hadoop"));
JavaSchemaRDD test =
- jsql.applySchema(jsc.parallelize(localTest), Document.class);
+ jsql.createDataFrame(jsc.parallelize(localTest), Document.class);
// Make predictions on test documents.
model.transform(test).registerAsTable("prediction");
@@ -626,7 +626,7 @@ List<LabeledDocument> localTraining = Lists.newArrayList(
new LabeledDocument(10L, "spark compile", 1.0),
new LabeledDocument(11L, "hadoop software", 0.0));
JavaSchemaRDD training =
- jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class);
+ jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class);
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
Tokenizer tokenizer = new Tokenizer()
@@ -669,7 +669,7 @@ List<Document> localTest = Lists.newArrayList(
new Document(5L, "l m n"),
new Document(6L, "mapreduce spark"),
new Document(7L, "apache hadoop"));
-JavaSchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), Document.class);
+JavaSchemaRDD test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class);
// Make predictions on test documents. cvModel uses the best model found (lrModel).
cvModel.transform(test).registerAsTable("prediction");
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 38f617d0c8..b2b007509c 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -225,7 +225,7 @@ public static class Person implements Serializable {
{% endhighlight %}
-A schema can be applied to an existing RDD by calling `applySchema` and providing the Class object
+A schema can be applied to an existing RDD by calling `createDataFrame` and providing the Class object
for the JavaBean.
{% highlight java %}
@@ -247,7 +247,7 @@ JavaRDD<Person> people = sc.textFile("examples/src/main/resources/people.txt").m
});
// Apply a schema to an RDD of JavaBeans and register it as a table.
-JavaSchemaRDD schemaPeople = sqlContext.applySchema(people, Person.class);
+JavaSchemaRDD schemaPeople = sqlContext.createDataFrame(people, Person.class);
schemaPeople.registerTempTable("people");
// SQL can be run over RDDs that have been registered as tables.
@@ -315,7 +315,7 @@ a `SchemaRDD` can be created programmatically with three steps.
1. Create an RDD of `Row`s from the original RDD;
2. Create the schema represented by a `StructType` matching the structure of
`Row`s in the RDD created in Step 1.
-3. Apply the schema to the RDD of `Row`s via `applySchema` method provided
+3. Apply the schema to the RDD of `Row`s via `createDataFrame` method provided
by `SQLContext`.
For example:
@@ -341,7 +341,7 @@ val schema =
val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))
// Apply the schema to the RDD.
-val peopleSchemaRDD = sqlContext.applySchema(rowRDD, schema)
+val peopleSchemaRDD = sqlContext.createDataFrame(rowRDD, schema)
// Register the SchemaRDD as a table.
peopleSchemaRDD.registerTempTable("people")
@@ -367,7 +367,7 @@ a `SchemaRDD` can be created programmatically with three steps.
1. Create an RDD of `Row`s from the original RDD;
2. Create the schema represented by a `StructType` matching the structure of
`Row`s in the RDD created in Step 1.
-3. Apply the schema to the RDD of `Row`s via `applySchema` method provided
+3. Apply the schema to the RDD of `Row`s via `createDataFrame` method provided
by `JavaSQLContext`.
For example:
@@ -406,7 +406,7 @@ JavaRDD<Row> rowRDD = people.map(
});
// Apply the schema to the RDD.
-JavaSchemaRDD peopleSchemaRDD = sqlContext.applySchema(rowRDD, schema);
+JavaSchemaRDD peopleSchemaRDD = sqlContext.createDataFrame(rowRDD, schema);
// Register the SchemaRDD as a table.
peopleSchemaRDD.registerTempTable("people");
@@ -436,7 +436,7 @@ a `SchemaRDD` can be created programmatically with three steps.
1. Create an RDD of tuples or lists from the original RDD;
2. Create the schema represented by a `StructType` matching the structure of
tuples or lists in the RDD created in Step 1.
-3. Apply the schema to the RDD via `applySchema` method provided by `SQLContext`.
+3. Apply the schema to the RDD via `createDataFrame` method provided by `SQLContext`.
For example:
{% highlight python %}
@@ -458,7 +458,7 @@ fields = [StructField(field_name, StringType(), True) for field_name in schemaSt
schema = StructType(fields)
# Apply the schema to the RDD.
-schemaPeople = sqlContext.applySchema(people, schema)
+schemaPeople = sqlContext.createDataFrame(people, schema)
# Register the SchemaRDD as a table.
schemaPeople.registerTempTable("people")
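As a brief follow-up usage sketch (the query and column name continue the guide's example above and are not part of this diff), the registered temp table can then be queried with SQL:

{% highlight python %}
# Continuing the snippet above: run SQL over the registered "people" table.
results = sqlContext.sql("SELECT name FROM people")

# The result is a collection of Rows; fields are accessible by name.
names = results.map(lambda p: "Name: " + p.name)
for name in names.collect():
    print(name)
{% endhighlight %}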