author    Joseph K. Bradley <joseph@databricks.com>  2016-07-13 15:40:44 -0700
committer Joseph K. Bradley <joseph@databricks.com>  2016-07-13 15:40:44 -0700
commit    a5f51e21627c1bcfc62829a3a962707abf41a452 (patch)
tree      20eb56c9d315942de3a14906adf46b7cea75afe5 /mllib
parent    c5ec879828369ec1d21acd7f18a792306634ff74 (diff)
download  spark-a5f51e21627c1bcfc62829a3a962707abf41a452.tar.gz
          spark-a5f51e21627c1bcfc62829a3a962707abf41a452.tar.bz2
          spark-a5f51e21627c1bcfc62829a3a962707abf41a452.zip
[SPARK-16485][ML][DOC] Fix privacy of GLM members, rename sqlDataTypes for ML, doc fixes
## What changes were proposed in this pull request?

Fixing issues found during 2.0 API checks:
* GeneralizedLinearRegressionModel: linkObj, familyObj, familyAndLink should not be exposed
* sqlDataTypes: name does not follow conventions. Do we need to expose it?
* Evaluator: inconsistent doc between evaluate and isLargerBetter
* MinMaxScaler: math rendering --> hard to make it great, but I'll change it a little
* GeneralizedLinearRegressionSummary: aic doc is incorrect --> will change to use more common name

## How was this patch tested?

Existing unit tests. Docs generated locally. (MinMaxScaler is improved a tiny bit.)

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #14187 from jkbradley/final-api-check-2.0.
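The most visible change for code written against the 2.0 previews is the `sqlDataTypes` --> `SQLDataTypes` rename. A minimal before/after sketch (only the import paths come from this patch; the variable name is illustrative):

```scala
// Before this patch (2.0 previews) -- illustrative:
//   import org.apache.spark.ml.linalg.sqlDataTypes
//   val vectorType = sqlDataTypes.VectorType

// After this patch:
import org.apache.spark.ml.linalg.SQLDataTypes

val vectorType = SQLDataTypes.VectorType
```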
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala                  |  7
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala                  |  4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/linalg/SQLDataTypes.scala (renamed from mllib/src/main/scala/org/apache/spark/ml/linalg/dataTypes.scala) |  5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala | 10
-rw-r--r--  mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java            |  2
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala              |  4
6 files changed, 18 insertions, 14 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
index 5f765c071b..dfbc3e5222 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
@@ -30,7 +30,8 @@ import org.apache.spark.sql.Dataset
abstract class Evaluator extends Params {
/**
- * Evaluates model output and returns a scalar metric (larger is better).
+ * Evaluates model output and returns a scalar metric.
+ * The value of [[isLargerBetter]] specifies whether larger values are better.
*
* @param dataset a dataset that contains labels/observations and predictions.
* @param paramMap parameter map that specifies the input columns and output metrics
@@ -42,7 +43,9 @@ abstract class Evaluator extends Params {
}
/**
- * Evaluates the output.
+ * Evaluates model output and returns a scalar metric.
+ * The value of [[isLargerBetter]] specifies whether larger values are better.
+ *
* @param dataset a dataset that contains labels/observations and predictions.
* @return metric
*/
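To illustrate why both `evaluate` overloads now carry the same caveat, here is a minimal sketch of a custom Evaluator whose metric is better when smaller; the class, metric, and column names are hypothetical, not part of this patch:

```scala
import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions.{abs, col, mean}

class MaeEvaluator(override val uid: String) extends Evaluator {
  def this() = this(Identifiable.randomUID("maeEval"))

  // Scalar metric: mean absolute error over the prediction/label columns.
  override def evaluate(dataset: Dataset[_]): Double =
    dataset.select(mean(abs(col("prediction") - col("label"))))
      .head().getDouble(0)

  // Smaller MAE means a better model, so tuning code (e.g. CrossValidator)
  // must consult this flag rather than assume "larger is better".
  override def isLargerBetter: Boolean = false

  override def copy(extra: ParamMap): MaeEvaluator = defaultCopy(extra)
}
```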
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
index 7b03f0c0f3..9ed8d83324 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
@@ -78,9 +78,9 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H
* statistics, which is also known as min-max normalization or Rescaling. The rescaled value for
* feature E is calculated as,
*
- * Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
+ * `Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min`
*
- * For the case E_{max} == E_{min}, Rescaled(e_i) = 0.5 * (max + min)
+ * For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)`.
* Note that since zero values will probably be transformed to non-zero values, output of the
* transformer will be DenseVector even for sparse input.
*/
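As a standalone sketch of the formula in the scaladoc above (plain Scala, not the MinMaxScaler implementation itself):

```scala
// Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min
def rescale(e: Double, eMin: Double, eMax: Double,
            min: Double = 0.0, max: Double = 1.0): Double = {
  if (eMax == eMin) {
    // Degenerate case: a constant feature maps to the midpoint of [min, max].
    0.5 * (max + min)
  } else {
    (e - eMin) / (eMax - eMin) * (max - min) + min
  }
}

// E.g. a feature observed over [2.0, 10.0], rescaled to the default [0, 1]:
// rescale(6.0, 2.0, 10.0) == 0.5
```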
diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/dataTypes.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/SQLDataTypes.scala
index 52a6fd25e2..a66ba27a7b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/linalg/dataTypes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/SQLDataTypes.scala
@@ -17,15 +17,16 @@
package org.apache.spark.ml.linalg
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.sql.types.DataType
/**
* :: DeveloperApi ::
* SQL data types for vectors and matrices.
*/
+@Since("2.0.0")
@DeveloperApi
-object sqlDataTypes {
+object SQLDataTypes {
/** Data type for [[Vector]]. */
val VectorType: DataType = new VectorUDT
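A small sketch of how downstream code would use the renamed object to declare a DataFrame schema with a vector column (the field names here are illustrative):

```scala
import org.apache.spark.ml.linalg.SQLDataTypes
import org.apache.spark.sql.types.{LongType, StructField, StructType}

val schema = StructType(Seq(
  StructField("id", LongType, nullable = false),
  StructField("features", SQLDataTypes.VectorType, nullable = false)
))
```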
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index a23e90d9e1..2bdc09e1db 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -376,7 +376,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
def deviance(y: Double, mu: Double, weight: Double): Double
/**
- * Akaike's 'An Information Criterion'(AIC) value of the family for a given dataset.
+ * Akaike Information Criterion (AIC) value of the family for a given dataset.
*
* @param predictions an RDD of (y, mu, weight) of instances in evaluation dataset
* @param deviance the deviance for the fitted model in evaluation dataset
@@ -702,13 +702,13 @@ class GeneralizedLinearRegressionModel private[ml] (
import GeneralizedLinearRegression._
- lazy val familyObj = Family.fromName($(family))
- lazy val linkObj = if (isDefined(link)) {
+ private lazy val familyObj = Family.fromName($(family))
+ private lazy val linkObj = if (isDefined(link)) {
Link.fromName($(link))
} else {
familyObj.defaultLink
}
- lazy val familyAndLink = new FamilyAndLink(familyObj, linkObj)
+ private lazy val familyAndLink = new FamilyAndLink(familyObj, linkObj)
override protected def predict(features: Vector): Double = {
val eta = predictLink(features)
@@ -1021,7 +1021,7 @@ class GeneralizedLinearRegressionSummary private[regression] (
rss / degreesOfFreedom
}
- /** Akaike's "An Information Criterion"(AIC) for the fitted model. */
+ /** Akaike Information Criterion (AIC) for the fitted model. */
@Since("2.0.0")
lazy val aic: Double = {
val w = weightCol
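For reference, the criterion behind the renamed doc is the standard definition AIC = -2 * logLikelihood + 2 * k, where k is the number of estimated parameters; a generic sketch, not the GLM-specific computation in this file:

```scala
// Lower AIC indicates a better trade-off between fit and model complexity.
def aic(logLikelihood: Double, numParameters: Int): Double =
  -2.0 * logLikelihood + 2.0 * numParameters
```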
diff --git a/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java b/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java
index b09e13112f..bd64a7186e 100644
--- a/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java
@@ -20,7 +20,7 @@ package org.apache.spark.ml.linalg;
import org.junit.Assert;
import org.junit.Test;
-import static org.apache.spark.ml.linalg.sqlDataTypes.*;
+import static org.apache.spark.ml.linalg.SQLDataTypes.*;
public class JavaSQLDataTypesSuite {
@Test
diff --git a/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala
index 13bf3d3015..0bd0c32f19 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala
@@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite
class SQLDataTypesSuite extends SparkFunSuite {
test("sqlDataTypes") {
- assert(sqlDataTypes.VectorType === new VectorUDT)
- assert(sqlDataTypes.MatrixType === new MatrixUDT)
+ assert(SQLDataTypes.VectorType === new VectorUDT)
+ assert(SQLDataTypes.MatrixType === new MatrixUDT)
}
}