From a5f51e21627c1bcfc62829a3a962707abf41a452 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Wed, 13 Jul 2016 15:40:44 -0700 Subject: [SPARK-16485][ML][DOC] Fix privacy of GLM members, rename sqlDataTypes for ML, doc fixes ## What changes were proposed in this pull request? Fixing issues found during 2.0 API checks: * GeneralizedLinearRegressionModel: linkObj, familyObj, familyAndLink should not be exposed * sqlDataTypes: name does not follow conventions. Do we need to expose it? * Evaluator: inconsistent doc between evaluate and isLargerBetter * MinMaxScaler: math rendering --> hard to make it great, but I'll change it a little * GeneralizedLinearRegressionSummary: aic doc is incorrect --> will change to use more common name ## How was this patch tested? Existing unit tests. Docs generated locally. (MinMaxScaler is improved a tiny bit.) Author: Joseph K. Bradley Closes #14187 from jkbradley/final-api-check-2.0. --- .../org/apache/spark/ml/evaluation/Evaluator.scala | 7 +++-- .../org/apache/spark/ml/feature/MinMaxScaler.scala | 4 +-- .../org/apache/spark/ml/linalg/SQLDataTypes.scala | 36 ++++++++++++++++++++++ .../org/apache/spark/ml/linalg/dataTypes.scala | 35 --------------------- .../regression/GeneralizedLinearRegression.scala | 10 +++--- .../spark/ml/linalg/JavaSQLDataTypesSuite.java | 2 +- .../apache/spark/ml/linalg/SQLDataTypesSuite.scala | 4 +-- 7 files changed, 51 insertions(+), 47 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/linalg/SQLDataTypes.scala delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/linalg/dataTypes.scala (limited to 'mllib') diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala index 5f765c071b..dfbc3e5222 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala @@ -30,7 +30,8 @@ import org.apache.spark.sql.Dataset abstract class Evaluator extends Params { /** - * Evaluates model output and returns a scalar metric (larger is better). + * Evaluates model output and returns a scalar metric. + * The value of [[isLargerBetter]] specifies whether larger values are better. * * @param dataset a dataset that contains labels/observations and predictions. * @param paramMap parameter map that specifies the input columns and output metrics @@ -42,7 +43,9 @@ abstract class Evaluator extends Params { } /** - * Evaluates the output. + * Evaluates model output and returns a scalar metric. + * The value of [[isLargerBetter]] specifies whether larger values are better. + * * @param dataset a dataset that contains labels/observations and predictions. * @return metric */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala index 7b03f0c0f3..9ed8d83324 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala @@ -78,9 +78,9 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H * statistics, which is also known as min-max normalization or Rescaling. The rescaled value for * feature E is calculated as, * - * Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min + * `Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min` * - * For the case E_{max} == E_{min}, Rescaled(e_i) = 0.5 * (max + min) + * For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)`. * Note that since zero values will probably be transformed to non-zero values, output of the * transformer will be DenseVector even for sparse input. */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/SQLDataTypes.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/SQLDataTypes.scala new file mode 100644 index 0000000000..a66ba27a7b --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/SQLDataTypes.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.linalg + +import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.sql.types.DataType + +/** + * :: DeveloperApi :: + * SQL data types for vectors and matrices. + */ +@Since("2.0.0") +@DeveloperApi +object SQLDataTypes { + + /** Data type for [[Vector]]. */ + val VectorType: DataType = new VectorUDT + + /** Data type for [[Matrix]]. */ + val MatrixType: DataType = new MatrixUDT +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/dataTypes.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/dataTypes.scala deleted file mode 100644 index 52a6fd25e2..0000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/linalg/dataTypes.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.linalg - -import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.types.DataType - -/** - * :: DeveloperApi :: - * SQL data types for vectors and matrices. - */ -@DeveloperApi -object sqlDataTypes { - - /** Data type for [[Vector]]. */ - val VectorType: DataType = new VectorUDT - - /** Data type for [[Matrix]]. */ - val MatrixType: DataType = new MatrixUDT -} diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index a23e90d9e1..2bdc09e1db 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -376,7 +376,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine def deviance(y: Double, mu: Double, weight: Double): Double /** - * Akaike's 'An Information Criterion'(AIC) value of the family for a given dataset. + * Akaike Information Criterion (AIC) value of the family for a given dataset. * * @param predictions an RDD of (y, mu, weight) of instances in evaluation dataset * @param deviance the deviance for the fitted model in evaluation dataset @@ -702,13 +702,13 @@ class GeneralizedLinearRegressionModel private[ml] ( import GeneralizedLinearRegression._ - lazy val familyObj = Family.fromName($(family)) - lazy val linkObj = if (isDefined(link)) { + private lazy val familyObj = Family.fromName($(family)) + private lazy val linkObj = if (isDefined(link)) { Link.fromName($(link)) } else { familyObj.defaultLink } - lazy val familyAndLink = new FamilyAndLink(familyObj, linkObj) + private lazy val familyAndLink = new FamilyAndLink(familyObj, linkObj) override protected def predict(features: Vector): Double = { val eta = predictLink(features) @@ -1021,7 +1021,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( rss / degreesOfFreedom } - /** Akaike's "An Information Criterion"(AIC) for the fitted model. */ + /** Akaike Information Criterion (AIC) for the fitted model. */ @Since("2.0.0") lazy val aic: Double = { val w = weightCol diff --git a/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java b/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java index b09e13112f..bd64a7186e 100644 --- a/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/linalg/JavaSQLDataTypesSuite.java @@ -20,7 +20,7 @@ package org.apache.spark.ml.linalg; import org.junit.Assert; import org.junit.Test; -import static org.apache.spark.ml.linalg.sqlDataTypes.*; +import static org.apache.spark.ml.linalg.SQLDataTypes.*; public class JavaSQLDataTypesSuite { @Test diff --git a/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala index 13bf3d3015..0bd0c32f19 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/linalg/SQLDataTypesSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite class SQLDataTypesSuite extends SparkFunSuite { test("sqlDataTypes") { - assert(sqlDataTypes.VectorType === new VectorUDT) - assert(sqlDataTypes.MatrixType === new MatrixUDT) + assert(SQLDataTypes.VectorType === new VectorUDT) + assert(SQLDataTypes.MatrixType === new MatrixUDT) } } -- cgit v1.2.3