author     Xiangrui Meng <meng@databricks.com>  2015-05-21 22:57:33 -0700
committer  Xiangrui Meng <meng@databricks.com>  2015-05-21 22:57:33 -0700
commit     8f11c6116bf8c7246682cbb2d6f27bf0f1531c6d (patch)
tree       144b7d5b9ec1215e88d05539f51e042a6d39470c /mllib
parent     e4136ea6c457bc74cee312aa14974498ab4633eb (diff)
[SPARK-7535] [.0] [MLLIB] Audit the pipeline APIs for 1.4
Some changes to the pipeline APIs:
1. Estimator/Transformer/ doesn’t need to extend Params since PipelineStage already does.
2. Move Evaluator to ml.evaluation.
3. Mention that larger metric values are better.
4. PipelineModel doc: “compiled” -> “fitted”.
5. Hide object PolynomialExpansion.
6. Hide object VectorAssembler.
7. Word2Vec.minCount (and others) -> group param.
8. ParamValidators -> DeveloperApi.
9. Hide MetadataUtils/SchemaUtils.

jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6322 from mengxr/SPARK-7535.0 and squashes the following commits:

9e9c7da [Xiangrui Meng] move JavaEvaluator to ml.evaluation as well
e179480 [Xiangrui Meng] move Evaluator to ml.evaluation in PySpark
08ef61f [Xiangrui Meng] update pipeline APIs
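Taken together, the list above mostly tightens what user code already does. A minimal sketch of how the audited pieces fit together after this change (column names, stage choices, and the `training` DataFrame are hypothetical, not part of the commit):

    import org.apache.spark.ml.Pipeline
    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

    // Transformers and estimators are both PipelineStages (and now get Params
    // through PipelineStage alone, per item 1), so they mix freely in a pipeline:
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
    val lr = new LogisticRegression().setMaxIter(10)
    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
    // fit() returns a PipelineModel, the "fitted" pipeline of item 4:
    // val model = pipeline.fit(training)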
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/Estimator.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/Transformer.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala (renamed from mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala) | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala | 3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala | 3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/param/params.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala | 1
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala | 6
12 files changed, 20 insertions, 19 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
index 7f3f3262a6..9e16e60270 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.DataFrame
* Abstract class for estimators that fit models to data.
*/
@AlphaComponent
-abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
+abstract class Estimator[M <: Model[M]] extends PipelineStage {
/**
* Fits a single model to the input data with optional parameters.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index fac54188f9..43bee1b770 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -170,7 +170,7 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] {
/**
* :: AlphaComponent ::
- * Represents a compiled pipeline.
+ * Represents a fitted pipeline.
*/
@AlphaComponent
class PipelineModel private[ml] (
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
index d96b54e511..38bb6a5a53 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.types._
* Abstract class for transformers that transform one dataset into another.
*/
@AlphaComponent
-abstract class Transformer extends PipelineStage with Params {
+abstract class Transformer extends PipelineStage {
/**
* Transforms the dataset with optional parameters
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index c1af09c969..ddbdd00ceb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -18,7 +18,7 @@
package org.apache.spark.ml.evaluation
import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.Evaluator
+import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
index 5f2f8c94e9..cabd1c97c0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.spark.ml
+package org.apache.spark.ml.evaluation
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.param.{ParamMap, Params}
@@ -29,7 +29,7 @@ import org.apache.spark.sql.DataFrame
abstract class Evaluator extends Params {
/**
- * Evaluates the output.
+ * Evaluates model output and returns a scalar metric (larger is better).
*
* @param dataset a dataset that contains labels/observations and predictions.
* @param paramMap parameter map that specifies the input columns and output metrics
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index 41564410e4..8ddf9d6a1e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -75,7 +75,7 @@ class PolynomialExpansion(override val uid: String)
* To handle sparsity, if c is zero, we can skip all monomials that contain it. We remember the
* current index and increment it properly for sparse input.
*/
-object PolynomialExpansion {
+private[feature] object PolynomialExpansion {
private def choose(n: Int, k: Int): Int = {
Range(n, n - k, -1).product / Range(k, 1, -1).product
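Hiding the companion object removes only the expansion internals (the private choose(n, k) above computes the binomial coefficient) from the public API; the transformer itself is unchanged. For intuition: expanding a 2-dimensional vector (x, y) with degree 2 yields (x, x·x, y, x·y, y·y). A usage sketch with hypothetical column names:

    import org.apache.spark.ml.feature.PolynomialExpansion

    val poly = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(2)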
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 1c00094769..181b62f46f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -78,8 +78,7 @@ class VectorAssembler(override val uid: String)
}
}
-@AlphaComponent
-object VectorAssembler {
+private object VectorAssembler {
private[feature] def assemble(vv: Any*): Vector = {
val indices = ArrayBuilder.make[Int]
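Similarly, assemble() is now reachable only within ml.feature; user code combines columns through the transformer. A sketch, assuming hypothetical numeric columns "age" and "income":

    import org.apache.spark.ml.feature.VectorAssembler

    val assembler = new VectorAssembler()
      .setInputCols(Array("age", "income"))
      .setOutputCol("features")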
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 90f0be76df..ed03266922 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -37,6 +37,7 @@ private[feature] trait Word2VecBase extends Params
/**
* The dimension of the code that you want to transform from words.
+ * @group param
*/
final val vectorSize = new IntParam(
this, "vectorSize", "the dimension of codes after transforming from words")
@@ -47,6 +48,7 @@ private[feature] trait Word2VecBase extends Params
/**
* Number of partitions for sentences of words.
+ * @group param
*/
final val numPartitions = new IntParam(
this, "numPartitions", "number of partitions for sentences of words")
@@ -58,6 +60,7 @@ private[feature] trait Word2VecBase extends Params
/**
* The minimum number of times a token must appear to be included in the word2vec model's
* vocabulary.
+ * @group param
*/
final val minCount = new IntParam(this, "minCount", "the minimum number of times a token must " +
"appear to be included in the word2vec model's vocabulary")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 94abfcda5c..12fc5b561f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -24,7 +24,7 @@ import scala.annotation.varargs
import scala.collection.mutable
import scala.collection.JavaConverters._
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
import org.apache.spark.ml.util.Identifiable
/**
@@ -92,9 +92,11 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali
}
/**
+ * :: DeveloperApi ::
* Factory methods for common validation functions for [[Param.isValid]].
* The numerical methods only support Int, Long, Float, and Double.
*/
+@DeveloperApi
object ParamValidators {
/** (private[param]) Default validation always returns true */
@@ -529,11 +531,13 @@ trait Params extends Identifiable with Serializable {
}
/**
+ * :: DeveloperApi ::
* Java-friendly wrapper for [[Params]].
* Java developers who need to extend [[Params]] should use this class instead.
* If you need to extend an abstract class which already extends [[Params]], then that abstract
* class should be Java-friendly as well.
*/
+@DeveloperApi
abstract class JavaParams extends Params
/**
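With ParamValidators promoted to a DeveloperApi, third-party pipeline stages can attach the same validation factories to their own params. A minimal sketch; the HasMaxBins trait and its bound are hypothetical, only the ParamValidators call is from this commit's API:

    import org.apache.spark.ml.param.{IntParam, Params, ParamValidators}

    trait HasMaxBins extends Params {
      // gtEq(2) rejects values below 2 when the param is set
      final val maxBins: IntParam = new IntParam(this, "maxBins",
        "maximum number of bins (>= 2)", ParamValidators.gtEq(2))
    }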
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index 5c6ff2dda3..e21ff94a20 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -22,6 +22,7 @@ import com.github.fommil.netlib.F2jBLAS
import org.apache.spark.Logging
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml._
+import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.util.MLUtils
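CrossValidator is the main consumer of the relocated Evaluator, so only its import changes. A wiring sketch reusing the hypothetical pipeline from the first example:

    import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
    import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

    val cv = new CrossValidator()
      .setEstimator(pipeline)  // the hypothetical pipeline sketched earlier
      .setEvaluator(new BinaryClassificationEvaluator())  // from ml.evaluation
      .setEstimatorParamMaps(new ParamGridBuilder().build())  // default params only
      .setNumFolds(3)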
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
index 56075c9a6b..2a1db90f2c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
@@ -19,18 +19,14 @@ package org.apache.spark.ml.util
import scala.collection.immutable.HashMap
-import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.attribute._
import org.apache.spark.sql.types.StructField
/**
- * :: Experimental ::
- *
* Helper utilities for tree-based algorithms
*/
-@Experimental
-object MetadataUtils {
+private[spark] object MetadataUtils {
/**
* Examine a schema to identify the number of classes in a label column.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
index 11592b77eb..7cd53c6d7e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
@@ -17,15 +17,13 @@
package org.apache.spark.ml.util
-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.types.{DataType, StructField, StructType}
+
/**
- * :: DeveloperApi ::
* Utils for handling schemas.
*/
-@DeveloperApi
-object SchemaUtils {
+private[spark] object SchemaUtils {
// TODO: Move the utility methods to SQL.