From 8f11c6116bf8c7246682cbb2d6f27bf0f1531c6d Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 21 May 2015 22:57:33 -0700
Subject: [SPARK-7535] [.0] [MLLIB] Audit the pipeline APIs for 1.4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some changes to the pipeline APIs:

1. Estimator/Transformer don’t need to extend Params since PipelineStage already does.
1. Move Evaluator to ml.evaluation.
1. Mention larger metric values are better.
1. PipelineModel doc. “compiled” -> “fitted”
1. Hide object PolynomialExpansion.
1. Hide object VectorAssembler.
1. Word2Vec.minCount (and others) -> group param
1. ParamValidators -> DeveloperApi
1. Hide MetadataUtils/SchemaUtils.

jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6322 from mengxr/SPARK-7535.0 and squashes the following commits:

9e9c7da [Xiangrui Meng] move JavaEvaluator to ml.evaluation as well
e179480 [Xiangrui Meng] move Evaluation to ml.evaluation in PySpark
08ef61f [Xiangrui Meng] update pipieline APIs
---
 .../main/scala/org/apache/spark/ml/Estimator.scala |  2 +-
 .../main/scala/org/apache/spark/ml/Evaluator.scala | 52 ----------------------
 .../main/scala/org/apache/spark/ml/Pipeline.scala  |  2 +-
 .../scala/org/apache/spark/ml/Transformer.scala    |  2 +-
 .../evaluation/BinaryClassificationEvaluator.scala |  2 +-
 .../org/apache/spark/ml/evaluation/Evaluator.scala | 52 ++++++++++++++++++++++
 .../spark/ml/feature/PolynomialExpansion.scala     |  2 +-
 .../apache/spark/ml/feature/VectorAssembler.scala  |  3 +-
 .../org/apache/spark/ml/feature/Word2Vec.scala     |  3 ++
 .../scala/org/apache/spark/ml/param/params.scala   |  6 ++-
 .../apache/spark/ml/tuning/CrossValidator.scala    |  1 +
 .../org/apache/spark/ml/util/MetadataUtils.scala   |  6 +--
 .../org/apache/spark/ml/util/SchemaUtils.scala     |  6 +--
 13 files changed, 70 insertions(+), 69 deletions(-)
 delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala

(limited to 'mllib')

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
index 7f3f3262a6..9e16e60270 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.DataFrame
  * Abstract class for estimators that fit models to data.
  */
 @AlphaComponent
-abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
+abstract class Estimator[M <: Model[M]] extends PipelineStage {
 
   /**
    * Fits a single model to the input data with optional parameters.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala
deleted file mode 100644
index 5f2f8c94e9..0000000000
--- a/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.ml
-
-import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.param.{ParamMap, Params}
-import org.apache.spark.sql.DataFrame
-
-/**
- * :: AlphaComponent ::
- * Abstract class for evaluators that compute metrics from predictions.
- */
-@AlphaComponent
-abstract class Evaluator extends Params {
-
-  /**
-   * Evaluates the output.
-   *
-   * @param dataset a dataset that contains labels/observations and predictions.
-   * @param paramMap parameter map that specifies the input columns and output metrics
-   * @return metric
-   */
-  def evaluate(dataset: DataFrame, paramMap: ParamMap): Double = {
-    this.copy(paramMap).evaluate(dataset)
-  }
-
-  /**
-   * Evaluates the output.
-   * @param dataset a dataset that contains labels/observations and predictions.
-   * @return metric
-   */
-  def evaluate(dataset: DataFrame): Double
-
-  override def copy(extra: ParamMap): Evaluator = {
-    super.copy(extra).asInstanceOf[Evaluator]
-  }
-}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index fac54188f9..43bee1b770 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -170,7 +170,7 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] {
 
 /**
  * :: AlphaComponent ::
- * Represents a compiled pipeline.
+ * Represents a fitted pipeline.
  */
 @AlphaComponent
 class PipelineModel private[ml] (
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
index d96b54e511..38bb6a5a53 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.types._
  * Abstract class for transformers that transform one dataset into another.
  */
 @AlphaComponent
-abstract class Transformer extends PipelineStage with Params {
+abstract class Transformer extends PipelineStage {
 
   /**
    * Transforms the dataset with optional parameters
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index c1af09c969..ddbdd00ceb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.evaluation
 
 import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.Evaluator
+import org.apache.spark.ml.evaluation.Evaluator
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
new file mode 100644
index 0000000000..cabd1c97c0
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.evaluation
+
+import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml.param.{ParamMap, Params}
+import org.apache.spark.sql.DataFrame
+
+/**
+ * :: AlphaComponent ::
+ * Abstract class for evaluators that compute metrics from predictions.
+ */
+@AlphaComponent
+abstract class Evaluator extends Params {
+
+  /**
+   * Evaluates model output and returns a scalar metric (larger is better).
+   *
+   * @param dataset a dataset that contains labels/observations and predictions.
+   * @param paramMap parameter map that specifies the input columns and output metrics
+   * @return metric
+   */
+  def evaluate(dataset: DataFrame, paramMap: ParamMap): Double = {
+    this.copy(paramMap).evaluate(dataset)
+  }
+
+  /**
+   * Evaluates model output and returns a scalar metric (larger is better).
+   * @param dataset a dataset that contains labels/observations and predictions.
+   * @return metric
+   */
+  def evaluate(dataset: DataFrame): Double
+
+  override def copy(extra: ParamMap): Evaluator = {
+    super.copy(extra).asInstanceOf[Evaluator]
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index 41564410e4..8ddf9d6a1e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -75,7 +75,7 @@ class PolynomialExpansion(override val uid: String)
  * To handle sparsity, if c is zero, we can skip all monomials that contain it. We remember the
  * current index and increment it properly for sparse input.
  */
-object PolynomialExpansion {
+private[feature] object PolynomialExpansion {
 
   private def choose(n: Int, k: Int): Int = {
     Range(n, n - k, -1).product / Range(k, 1, -1).product
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 1c00094769..181b62f46f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -78,8 +78,7 @@ class VectorAssembler(override val uid: String)
   }
 }
 
-@AlphaComponent
-object VectorAssembler {
+private object VectorAssembler {
 
   private[feature] def assemble(vv: Any*): Vector = {
     val indices = ArrayBuilder.make[Int]
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 90f0be76df..ed03266922 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -37,6 +37,7 @@ private[feature] trait Word2VecBase extends Params
 
   /**
    * The dimension of the code that you want to transform from words.
+   * @group param
    */
   final val vectorSize = new IntParam(
     this, "vectorSize", "the dimension of codes after transforming from words")
@@ -47,6 +48,7 @@ private[feature] trait Word2VecBase extends Params
 
   /**
    * Number of partitions for sentences of words.
+   * @group param
    */
   final val numPartitions = new IntParam(
     this, "numPartitions", "number of partitions for sentences of words")
@@ -58,6 +60,7 @@ private[feature] trait Word2VecBase extends Params
   /**
    * The minimum number of times a token must appear to be included in the word2vec model's
    * vocabulary.
+   * @group param
    */
   final val minCount = new IntParam(this, "minCount", "the minimum number of times a token must " +
     "appear to be included in the word2vec model's vocabulary")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 94abfcda5c..12fc5b561f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -24,7 +24,7 @@ import scala.annotation.varargs
 import scala.collection.mutable
 import scala.collection.JavaConverters._
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
 import org.apache.spark.ml.util.Identifiable
 
 /**
@@ -92,9 +92,11 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali
 }
 
 /**
+ * :: DeveloperApi ::
  * Factory methods for common validation functions for [[Param.isValid]].
  * The numerical methods only support Int, Long, Float, and Double.
  */
+@DeveloperApi
 object ParamValidators {
 
   /** (private[param]) Default validation always return true */
@@ -529,11 +531,13 @@ trait Params extends Identifiable with Serializable {
 }
 
 /**
+ * :: DeveloperApi ::
  * Java-friendly wrapper for [[Params]].
  * Java developers who need to extend [[Params]] should use this class instead.
  * If you need to extend a abstract class which already extends [[Params]], then that abstract
  * class should be Java-friendly as well.
  */
+@DeveloperApi
 abstract class JavaParams extends Params
 
 /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index 5c6ff2dda3..e21ff94a20 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -22,6 +22,7 @@ import com.github.fommil.netlib.F2jBLAS
 import org.apache.spark.Logging
 import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml._
+import org.apache.spark.ml.evaluation.Evaluator
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.util.MLUtils
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
index 56075c9a6b..2a1db90f2c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
@@ -19,18 +19,14 @@ package org.apache.spark.ml.util
 
 import scala.collection.immutable.HashMap
 
-import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.attribute._
 import org.apache.spark.sql.types.StructField
 
 
 /**
- * :: Experimental ::
- *
  * Helper utilities for tree-based algorithms
  */
-@Experimental
-object MetadataUtils {
+private[spark] object MetadataUtils {
 
   /**
    * Examine a schema to identify the number of classes in a label column.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
index 11592b77eb..7cd53c6d7e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
@@ -17,15 +17,13 @@
 
 package org.apache.spark.ml.util
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
 
+
 /**
- * :: DeveloperApi ::
  * Utils for handling schemas.
  */
-@DeveloperApi
-object SchemaUtils {
+private[spark] object SchemaUtils {
 
   // TODO: Move the utility methods to SQL.
 
-- 
cgit v1.2.3