path: root/mllib
author     Joseph K. Bradley <joseph@databricks.com>  2015-02-20 02:31:32 -0800
committer  Xiangrui Meng <meng@databricks.com>  2015-02-20 02:31:32 -0800
commit     4a17eedb16343413e5b6f8bb58c6da8952ee7ab6 (patch)
tree       ac17fd9eed4f42ba3095b148c68d4e78b6afc875 /mllib
parent     d3dfebebce9f76e4433e16d4d6d29fb8fa4d4193 (diff)
[SPARK-5867] [SPARK-5892] [doc] [ml] [mllib] Doc cleanups for 1.3 release
For SPARK-5867:
* The spark.ml programming guide needs to be updated to use the new SQL DataFrame API instead of the old SchemaRDD API.
* It should also include Python examples now.

For SPARK-5892:
* Fix Python docs
* Various other cleanups

BTW, I accidentally merged this with master. If you want to compile it on your own, use this branch, which is based on spark/branch-1.3 and cherry-picks the commits from this PR: [https://github.com/jkbradley/spark/tree/doc-review-1.3-check]

CC: mengxr (ML), davies (Python docs)

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #4675 from jkbradley/doc-review-1.3 and squashes the following commits:

f191bb0 [Joseph K. Bradley] small cleanups
e786efa [Joseph K. Bradley] small doc corrections
6b1ab4a [Joseph K. Bradley] fixed python lint test
946affa [Joseph K. Bradley] Added sample data for ml.MovieLensALS example. Changed spark.ml Java examples to use DataFrames API instead of sql()
da81558 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into doc-review-1.3
629dbf5 [Joseph K. Bradley] Updated based on code review: * made new page for old migration guides * small fixes * moved inherit_doc in python
b9df7c4 [Joseph K. Bradley] Small cleanups: toDF to toDF(), adding s for string interpolation
34b067f [Joseph K. Bradley] small doc correction
da16aef [Joseph K. Bradley] Fixed python mllib docs
8cce91c [Joseph K. Bradley] GMM: removed old imports, added some doc
695f3f6 [Joseph K. Bradley] partly done trying to fix inherit_doc for class hierarchies in python docs
a72c018 [Joseph K. Bradley] made ChiSqTestResult appear in python docs
b05a80d [Joseph K. Bradley] organize imports. doc cleanups
e572827 [Joseph K. Bradley] updated programming guide for ml and mllib
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala  12
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala  6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala  14
4 files changed, 23 insertions, 11 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index 80584ef5e5..568b653056 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -19,12 +19,10 @@ package org.apache.spark.mllib.clustering
import scala.collection.mutable.IndexedSeq
-import breeze.linalg.{diag, DenseMatrix => BreezeMatrix, DenseVector => BDV, SparseVector => BSV,
- Transpose, Vector => BV}
+import breeze.linalg.{diag, DenseMatrix => BreezeMatrix, DenseVector => BDV, Vector => BV}
import org.apache.spark.annotation.Experimental
-import org.apache.spark.mllib.linalg.{BLAS, DenseVector, DenseMatrix, Matrices,
- SparseVector, Vector, Vectors}
+import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, Matrices, Vector, Vectors}
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
@@ -43,7 +41,11 @@ import org.apache.spark.util.Utils
* less than convergenceTol, or until it has reached the max number of iterations.
* While this process is generally guaranteed to converge, it is not guaranteed
* to find a global optimum.
- *
+ *
+ * Note: For high-dimensional data (with many features), this algorithm may perform poorly.
+ * This is due to high-dimensional data (a) making it difficult to cluster at all (based
+ * on statistical/theoretical arguments) and (b) causing numerical issues with Gaussian
+ * distributions.
+ *
* @param k The number of independent Gaussians in the mixture model
* @param convergenceTol The maximum change in log-likelihood at which convergence
* is considered to have occurred.
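As a quick illustration of the documented EM behavior, a minimal usage sketch (not part of this patch; the SparkContext `sc` and the sample points are assumed):

{{{
// Minimal usage sketch (assumes an existing SparkContext `sc`; the data
// values are illustrative). Fits a 2-component Gaussian mixture with EM.
import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors

val data = sc.parallelize(Seq(
  Vectors.dense(0.1, 0.2), Vectors.dense(0.2, 0.1),   // points near the origin
  Vectors.dense(9.0, 8.5), Vectors.dense(8.7, 9.2)))  // points near (9, 9)

val model = new GaussianMixture()
  .setK(2)                  // number of independent Gaussians
  .setConvergenceTol(0.01)  // stop once the log-likelihood gain drops below this
  .setMaxIterations(100)
  .run(data)

model.gaussians.zip(model.weights).foreach { case (g, w) =>
  println(s"weight=$w mu=${g.mu}\nsigma=\n${g.sigma}")
}
}}}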
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 89b38679b7..0e4a4d0085 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -706,7 +706,7 @@ object Matrices {
}
/**
- * Generate a `DenseMatrix` consisting of zeros.
+ * Generate a `Matrix` consisting of zeros.
* @param numRows number of rows of the matrix
* @param numCols number of columns of the matrix
* @return `Matrix` with size `numRows` x `numCols` and values of zeros
@@ -778,8 +778,8 @@ object Matrices {
SparseMatrix.sprandn(numRows, numCols, density, rng)
/**
- * Generate a diagonal matrix in `DenseMatrix` format from the supplied values.
- * @param vector a `Vector` tat will form the values on the diagonal of the matrix
+ * Generate a diagonal matrix in `Matrix` format from the supplied values.
+ * @param vector a `Vector` that will form the values on the diagonal of the matrix
* @return Square `Matrix` with size `values.length` x `values.length` and `values`
* on the diagonal
*/
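Both doc fixes in this file are about the declared return type: these factory methods in `Matrices` are typed as the `Matrix` interface, not `DenseMatrix`. A small sketch (values illustrative):

{{{
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vectors}

// Both factories return the `Matrix` interface, as the corrected docs say.
val zeros: Matrix = Matrices.zeros(2, 3)                    // 2 x 3, all 0.0
val diag: Matrix = Matrices.diag(Vectors.dense(1.0, 2.0))   // diagonal 2 x 2
}}}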
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 480bbfb5fe..4bdcb283da 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -247,7 +247,7 @@ object Vectors {
}
/**
- * Creates a dense vector of all zeros.
+ * Creates a vector of all zeros.
*
* @param size vector size
* @return a zero vector
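Same pattern on the vector side; a one-line sketch:

{{{
import org.apache.spark.mllib.linalg.{Vector, Vectors}

// Typed as the `Vector` interface, matching the corrected doc.
val zero: Vector = Vectors.zeros(3)
println(zero)  // [0.0,0.0,0.0]
}}}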
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
index 0acdab797e..8bfa0d2b64 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
@@ -63,10 +63,12 @@ abstract class Gradient extends Serializable {
* http://statweb.stanford.edu/~tibs/ElemStatLearn/ , Eq. (4.17) on page 119 gives the formula for
* the multinomial logistic regression model. A simple calculation shows that
*
+ * {{{
* P(y=0|x, w) = 1 / (1 + \sum_i^{K-1} \exp(x w_i))
* P(y=1|x, w) = \exp(x w_1) / (1 + \sum_i^{K-1} \exp(x w_i))
* ...
* P(y=K-1|x, w) = \exp(x w_{K-1}) / (1 + \sum_i^{K-1} \exp(x w_i))
+ * }}}
*
* for the multiclass classification problem with K classes.
*
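To make the probabilities above concrete, a hypothetical helper (not part of this patch): given precomputed margins, margins(i) = x w_{i+1}, with class 0 as the pivot, it returns the K class probabilities. The naive exp here is exactly what overflows for large margins, which the later part of this doc addresses:

{{{
// Naive P(y=k | x, w) for k = 0 .. K-1, transcribing the formulas above.
// margins(i) holds x w_{i+1}; class 0 has an implicit margin of 0.
def classProbabilities(margins: Array[Double]): Array[Double] = {
  val expMargins = margins.map(math.exp)
  val denom = 1.0 + expMargins.sum      // 1 + \sum_i^{K-1} \exp(margins_i)
  (1.0 +: expMargins).map(_ / denom)    // index 0 is P(y=0 | x, w)
}
}}}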
@@ -75,9 +77,11 @@ abstract class Gradient extends Serializable {
* will be (K-1) * N.
*
* As a result, the loss of the objective function for a single instance of data can be written as
+ * {{{
* l(w, x) = -log P(y|x, w) = -\alpha(y) log P(y=0|x, w) - (1-\alpha(y)) log P(y|x, w)
* = log(1 + \sum_i^{K-1}\exp(x w_i)) - (1-\alpha(y)) x w_{y-1}
* = log(1 + \sum_i^{K-1}\exp(margins_i)) - (1-\alpha(y)) margins_{y-1}
+ * }}}
*
* where \alpha(i) = 1 if i == 0, and
* \alpha(i) = 0 if i != 0,
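A hypothetical direct transcription of this loss (again naive and overflow-prone; the stable form follows below), using the doc's 0-based margins array so class y > 0 subtracts margins(y - 1):

{{{
// Naive loss l(w, x) from the formula above; label 0 is the pivot class,
// so \alpha(y) = 1 iff y == 0 and the margins term drops out for label 0.
def naiveLoss(margins: Array[Double], label: Int): Double = {
  val logSum = math.log(1.0 + margins.map(math.exp).sum)
  if (label == 0) logSum              // (1 - \alpha(y)) = 0
  else logSum - margins(label - 1)    // (1 - \alpha(y)) = 1
}
}}}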
@@ -86,14 +90,16 @@ abstract class Gradient extends Serializable {
* For optimization, we have to calculate the first derivative of the loss function, and
* a simple calculation shows that
*
+ * {{{
* \frac{\partial l(w, x)}{\partial w_{ij}}
* = (\exp(x w_i) / (1 + \sum_k^{K-1} \exp(x w_k)) - (1-\alpha(y)) \delta_{y, i+1}) * x_j
* = multiplier_i * x_j
+ * }}}
*
* where \delta_{i, j} = 1 if i == j,
* \delta_{i, j} = 0 if i != j, and
- * multiplier
- * = \exp(margins_i) / (1 + \sum_k^{K-1} \exp(margins_i)) - (1-\alpha(y)\delta_{y, i+1})
+ * multiplier =
+ \exp(margins_i) / (1 + \sum_k^{K-1} \exp(margins_k)) - (1-\alpha(y)) \delta_{y, i+1}
*
* If any margin is larger than 709.78, the numerical computation of the multiplier and the loss
* function will suffer from arithmetic overflow. This issue occurs when there are outliers
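The 709.78 threshold is simply ln(Double.MaxValue) ≈ 709.7827, beyond which exp overflows:

{{{
println(math.exp(709.78))  // still finite, close to Double.MaxValue (~1.8e308)
println(math.exp(710.0))   // Infinity: the arithmetic overflow described above
}}}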
@@ -103,10 +109,12 @@ abstract class Gradient extends Serializable {
* Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can be
* easily rewritten into the following equivalent numerically stable formula.
*
+ * {{{
* l(w, x) = log(1 + \sum_i^{K-1}\exp(margins_i)) - (1-\alpha(y)) margins_{y-1}
* = log(\exp(-maxMargin) + \sum_i^{K-1}\exp(margins_i - maxMargin)) + maxMargin
* - (1-\alpha(y)) margins_{y-1}
* = log(1 + sum) + maxMargin - (1-\alpha(y)) margins_{y-1}
+ * }}}
*
* where sum = \exp(-maxMargin) + \sum_i^{K-1}\exp(margins_i - maxMargin) - 1.
*
@@ -115,8 +123,10 @@ abstract class Gradient extends Serializable {
*
* For the multiplier, a similar trick can be applied as follows:
*
+ * {{{
* multiplier = \exp(margins_i) / (1 + \sum_k^{K-1} \exp(margins_k)) - (1-\alpha(y)) \delta_{y, i+1}
* = \exp(margins_i - maxMargin) / (1 + sum) - (1-\alpha(y)) \delta_{y, i+1}
+ * }}}
*
* where each argument of \exp is at most zero, so overflow is not a concern.
*
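And the corresponding stable multiplier, in the same hypothetical style, reusing maxMargin and sum from the loss sketch:

{{{
// Numerically stable multiplier_i for each of the K-1 non-pivot classes.
// Every exponent (margins(i) - maxMargin and -maxMargin) is <= 0: no overflow.
def stableMultipliers(margins: Array[Double], label: Int): Array[Double] = {
  val maxMargin = math.max(margins.max, 0.0)
  val sum = margins.map(m => math.exp(m - maxMargin)).sum + math.exp(-maxMargin) - 1.0
  margins.zipWithIndex.map { case (m, i) =>
    val delta = if (label == i + 1) 1.0 else 0.0  // (1 - \alpha(y)) \delta_{y, i+1}
    math.exp(m - maxMargin) / (1.0 + sum) - delta
  }
}
}}}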