aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorHolden Karau <holden@pigscanfly.ca>2015-09-17 09:17:43 -0700
committerXiangrui Meng <meng@databricks.com>2015-09-17 09:17:43 -0700
commite51345e1e04e439827a07c95887d14ba38333057 (patch)
treebdb8fc3d5d6188ce41b00851ec780c98eb2a1382 /mllib
parent268088b899e6e165e746aed87840d47bfaf50c43 (diff)
downloadspark-e51345e1e04e439827a07c95887d14ba38333057.tar.gz
spark-e51345e1e04e439827a07c95887d14ba38333057.tar.bz2
spark-e51345e1e04e439827a07c95887d14ba38333057.zip
[SPARK-10077] [DOCS] [ML] Add package info for java of ml/feature
Should be the same as SPARK-7808 but use Java for the code example. It would be great to add package doc for `spark.ml.feature`. Author: Holden Karau <holden@pigscanfly.ca> Closes #8740 from holdenk/SPARK-10077-JAVA-PACKAGE-DOC-FOR-SPARK.ML.FEATURE.
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java108
1 files changed, 108 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java
new file mode 100644
index 0000000000..c22d2e0cd2
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Feature transformers.
+ *
+ * The {@code ml.feature} package provides common feature transformers that help convert raw data or
+ * features into more suitable forms for model fitting.
+ * Most feature transformers are implemented as {@link org.apache.spark.ml.Transformer}s, which
+ * transform one {@link org.apache.spark.sql.DataFrame} into another, e.g.,
+ * {@link org.apache.spark.ml.feature.HashingTF}.
+ * Some feature transformers are implemented as {@link org.apache.spark.ml.Estimator}s, because the
+ * transformation requires some aggregated information of the dataset, e.g., document
+ * frequencies in {@link org.apache.spark.ml.feature.IDF}.
+ * For those feature transformers, calling {@link org.apache.spark.ml.Estimator#fit} is required to
+ * obtain the model first, e.g., {@link org.apache.spark.ml.feature.IDFModel}, in order to apply
+ * transformation.
+ * The transformation is usually done by appending new columns to the input
+ * {@link org.apache.spark.sql.DataFrame}, so all input columns are carried over.
+ *
+ * We try to make each transformer minimal, so it becomes flexible to assemble feature
+ * transformation pipelines.
+ * {@link org.apache.spark.ml.Pipeline} can be used to chain feature transformers, and
+ * {@link org.apache.spark.ml.feature.VectorAssembler} can be used to combine multiple feature
+ * transformations, for example:
+ *
+ * <pre>
+ * <code>
+ * import java.util.Arrays;
+ *
+ * import org.apache.spark.api.java.JavaRDD;
+ * import static org.apache.spark.sql.types.DataTypes.*;
+ * import org.apache.spark.sql.types.StructType;
+ * import org.apache.spark.sql.DataFrame;
+ * import org.apache.spark.sql.RowFactory;
+ * import org.apache.spark.sql.Row;
+ *
+ * import org.apache.spark.ml.feature.*;
+ * import org.apache.spark.ml.Pipeline;
+ * import org.apache.spark.ml.PipelineStage;
+ * import org.apache.spark.ml.PipelineModel;
+ *
+ * // a DataFrame with three columns: id (integer), text (string), and rating (double).
+ * StructType schema = createStructType(
+ * Arrays.asList(
+ * createStructField("id", IntegerType, false),
+ * createStructField("text", StringType, false),
+ * createStructField("rating", DoubleType, false)));
+ * JavaRDD&lt;Row&gt; rowRDD = jsc.parallelize(
+ * Arrays.asList(
+ * RowFactory.create(0, "Hi I heard about Spark", 3.0),
+ * RowFactory.create(1, "I wish Java could use case classes", 4.0),
+ * RowFactory.create(2, "Logistic regression models are neat", 4.0)));
+ * DataFrame df = jsql.createDataFrame(rowRDD, schema);
+ * // define feature transformers
+ * RegexTokenizer tok = new RegexTokenizer()
+ * .setInputCol("text")
+ * .setOutputCol("words");
+ * StopWordsRemover sw = new StopWordsRemover()
+ * .setInputCol("words")
+ * .setOutputCol("filtered_words");
+ * HashingTF tf = new HashingTF()
+ * .setInputCol("filtered_words")
+ * .setOutputCol("tf")
+ * .setNumFeatures(10000);
+ * IDF idf = new IDF()
+ * .setInputCol("tf")
+ * .setOutputCol("tf_idf");
+ * VectorAssembler assembler = new VectorAssembler()
+ * .setInputCols(new String[] {"tf_idf", "rating"})
+ * .setOutputCol("features");
+ *
+ * // assemble and fit the feature transformation pipeline
+ * Pipeline pipeline = new Pipeline()
+ * .setStages(new PipelineStage[] {tok, sw, tf, idf, assembler});
+ * PipelineModel model = pipeline.fit(df);
+ *
+ * // save transformed features with raw data
+ * model.transform(df)
+ * .select("id", "text", "rating", "features")
+ * .write().format("parquet").save("/output/path");
+ * </code>
+ * </pre>
+ *
+ * Some feature transformers implemented in MLlib are inspired by those implemented in scikit-learn.
+ * The major difference is that most scikit-learn feature transformers operate eagerly on the entire
+ * input dataset, while MLlib's feature transformers operate lazily on individual columns,
+ * which is more efficient and flexible for handling large and complex datasets.
+ *
+ * @see <a href="http://scikit-learn.org/stable/modules/preprocessing.html" target="_blank">
+ * scikit-learn.preprocessing</a>
+ */
+package org.apache.spark.ml.feature;