author    Xiangrui Meng <meng@databricks.com>  2015-08-25 14:11:38 -0700
committer DB Tsai <dbt@netflix.com>            2015-08-25 14:11:38 -0700
commit    00ae4be97f7b205432db2967ba6d506286ef2ca6 (patch)
tree      5fb2dd4e9b882fab93fc6f17f587ab505d680580 /mllib
parent    9205907876cf65695e56c2a94bedd83df3675c03 (diff)
[SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util
Same as #8421 but for `mllib.pmml` and `mllib.util`.

cc dbtsai

Author: Xiangrui Meng <meng@databricks.com>

Closes #8430 from mengxr/SPARK-10239 and squashes the following commits:

a189acf [Xiangrui Meng] update since versions in mllib.pmml and mllib.util
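For readers unfamiliar with the annotation added throughout this patch: `@Since` records the Spark version in which a public API first appeared, and is attached both to the enclosing type and to each public member. A minimal sketch of the resulting pattern (the trait and method names below are illustrative, not from this commit):

    import org.apache.spark.annotation.{DeveloperApi, Since}

    @DeveloperApi
    @Since("1.4.0")
    trait ExampleExportable {  // hypothetical trait, for illustration only
      @Since("1.4.0")
      def export(path: String): Unit
    }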
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala                   |  7
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala                   |  7
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala              |  5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala              | 10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala  |  5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala                  |  4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala                          |  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala                 |  6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala                    |  6
9 files changed, 41 insertions, 11 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
index 5e882d4ebb..274ac7c995 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
@@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult
import org.jpmml.model.JAXBUtil
import org.apache.spark.SparkContext
-import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.annotation.{DeveloperApi, Experimental, Since}
import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
/**
@@ -33,6 +33,7 @@ import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
* developed by the Data Mining Group (www.dmg.org).
*/
@DeveloperApi
+@Since("1.4.0")
trait PMMLExportable {
/**
@@ -48,6 +49,7 @@ trait PMMLExportable {
* Export the model to a local file in PMML format
*/
@Experimental
+ @Since("1.4.0")
def toPMML(localPath: String): Unit = {
toPMML(new StreamResult(new File(localPath)))
}
@@ -57,6 +59,7 @@ trait PMMLExportable {
* Export the model to a directory on a distributed file system in PMML format
*/
@Experimental
+ @Since("1.4.0")
def toPMML(sc: SparkContext, path: String): Unit = {
val pmml = toPMML()
sc.parallelize(Array(pmml), 1).saveAsTextFile(path)
@@ -67,6 +70,7 @@ trait PMMLExportable {
* Export the model to the OutputStream in PMML format
*/
@Experimental
+ @Since("1.4.0")
def toPMML(outputStream: OutputStream): Unit = {
toPMML(new StreamResult(outputStream))
}
@@ -76,6 +80,7 @@ trait PMMLExportable {
* Export the model to a String in PMML format
*/
@Experimental
+ @Since("1.4.0")
def toPMML(): String = {
val writer = new StringWriter
toPMML(new StreamResult(writer))
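As a usage reference for the trait annotated above, a minimal sketch of the PMMLExportable export paths, assuming an existing SparkContext `sc` (KMeansModel mixes in PMMLExportable as of Spark 1.4):

    import org.apache.spark.mllib.clustering.KMeans
    import org.apache.spark.mllib.linalg.Vectors

    // Train a small model to export; any PMMLExportable model works the same way.
    val vectors = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(1.0, 1.0),
      Vectors.dense(9.0, 9.0), Vectors.dense(10.0, 10.0)))
    val model = KMeans.train(vectors, k = 2, maxIterations = 10)

    model.toPMML("/tmp/kmeans.pmml")        // local file
    model.toPMML(sc, "/tmp/kmeans-pmml")    // distributed file system
    val asString: String = model.toPMML()   // in-memory String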
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
index be335a1aca..dffe6e7893 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
@@ -17,16 +17,17 @@
package org.apache.spark.mllib.util
-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.Logging
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
/**
* :: DeveloperApi ::
* A collection of methods used to validate data before applying ML algorithms.
*/
@DeveloperApi
+@Since("0.8.0")
object DataValidators extends Logging {
/**
@@ -34,6 +35,7 @@ object DataValidators extends Logging {
*
* @return True if labels are all zero or one, false otherwise.
*/
+ @Since("1.0.0")
val binaryLabelValidator: RDD[LabeledPoint] => Boolean = { data =>
val numInvalid = data.filter(x => x.label != 1.0 && x.label != 0.0).count()
if (numInvalid != 0) {
@@ -48,6 +50,7 @@ object DataValidators extends Logging {
*
* @return True if labels are all in the range of {0, 1, ..., k-1}, false otherwise.
*/
+ @Since("1.3.0")
def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
val numInvalid = data.filter(x =>
x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
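A short sketch of applying these validators, assuming an existing SparkContext `sc`:

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.util.DataValidators

    val labeled = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
      LabeledPoint(1.0, Vectors.dense(3.0, 4.0))))

    DataValidators.binaryLabelValidator(labeled)    // true: labels are all 0 or 1
    DataValidators.multiLabelValidator(3)(labeled)  // true: labels fall in {0, 1, 2}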
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala
index e6bcff48b0..00fd1606a3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala
@@ -19,8 +19,8 @@ package org.apache.spark.mllib.util
import scala.util.Random
-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.rdd.RDD
/**
@@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD
* cluster with scale 1 around each center.
*/
@DeveloperApi
+@Since("0.8.0")
object KMeansDataGenerator {
/**
@@ -42,6 +43,7 @@ object KMeansDataGenerator {
* @param r Scaling factor for the distribution of the initial centers
* @param numPartitions Number of partitions of the generated RDD; default 2
*/
+ @Since("0.8.0")
def generateKMeansRDD(
sc: SparkContext,
numPoints: Int,
@@ -62,6 +64,7 @@ object KMeansDataGenerator {
}
}
+ @Since("0.8.0")
def main(args: Array[String]) {
if (args.length < 6) {
// scalastyle:off println
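A minimal sketch of calling the generator programmatically, assuming an existing SparkContext `sc`; the parameter values are arbitrary:

    import org.apache.spark.mllib.util.KMeansDataGenerator

    // 1000 points in 10 dimensions, drawn around 5 centers scaled by r = 10.0
    val kmeansData = KMeansDataGenerator.generateKMeansRDD(
      sc, numPoints = 1000, k = 5, d = 10, r = 10.0, numPartitions = 2)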
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
index 7a1c779606..d0ba454f37 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
@@ -22,11 +22,11 @@ import scala.util.Random
import com.github.fommil.netlib.BLAS.{getInstance => blas}
-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
/**
* :: DeveloperApi ::
@@ -35,6 +35,7 @@ import org.apache.spark.mllib.regression.LabeledPoint
* response variable `Y`.
*/
@DeveloperApi
+@Since("0.8.0")
object LinearDataGenerator {
/**
@@ -46,6 +47,7 @@ object LinearDataGenerator {
* @param seed Random seed
* @return Java List of input.
*/
+ @Since("0.8.0")
def generateLinearInputAsList(
intercept: Double,
weights: Array[Double],
@@ -68,6 +70,7 @@ object LinearDataGenerator {
* @param eps Epsilon scaling factor.
* @return Seq of input.
*/
+ @Since("0.8.0")
def generateLinearInput(
intercept: Double,
weights: Array[Double],
@@ -92,6 +95,7 @@ object LinearDataGenerator {
* @param eps Epsilon scaling factor.
* @return Seq of input.
*/
+ @Since("0.8.0")
def generateLinearInput(
intercept: Double,
weights: Array[Double],
@@ -132,6 +136,7 @@ object LinearDataGenerator {
*
* @return RDD of LabeledPoint containing sample data.
*/
+ @Since("0.8.0")
def generateLinearRDD(
sc: SparkContext,
nexamples: Int,
@@ -151,6 +156,7 @@ object LinearDataGenerator {
data
}
+ @Since("0.8.0")
def main(args: Array[String]) {
if (args.length < 2) {
// scalastyle:off println
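A corresponding sketch for the RDD entry point annotated above, again assuming an existing SparkContext `sc`:

    import org.apache.spark.mllib.util.LinearDataGenerator

    // 10000 examples, 20 features, noise scale eps = 0.1, intercept 0.5
    val linearData = LinearDataGenerator.generateLinearRDD(
      sc, nexamples = 10000, nfeatures = 20, eps = 0.1, nparts = 2, intercept = 0.5)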
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala
index c09cbe69bb..33477ee20e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala
@@ -19,7 +19,7 @@ package org.apache.spark.mllib.util
import scala.util.Random
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
@@ -31,6 +31,7 @@ import org.apache.spark.mllib.linalg.Vectors
* with probability `probOne` and scales features for positive examples by `eps`.
*/
@DeveloperApi
+@Since("0.8.0")
object LogisticRegressionDataGenerator {
/**
@@ -43,6 +44,7 @@ object LogisticRegressionDataGenerator {
* @param nparts Number of partitions of the generated RDD. Default value is 2.
* @param probOne Probability that a label is 1 (and not 0). Default value is 0.5.
*/
+ @Since("0.8.0")
def generateLogisticRDD(
sc: SparkContext,
nexamples: Int,
@@ -62,6 +64,7 @@ object LogisticRegressionDataGenerator {
data
}
+ @Since("0.8.0")
def main(args: Array[String]) {
if (args.length != 5) {
// scalastyle:off println
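And the analogous sketch for the logistic-regression generator, assuming an existing SparkContext `sc`:

    import org.apache.spark.mllib.util.LogisticRegressionDataGenerator

    // Half the labels are 1 (probOne = 0.5); positive examples are scaled by eps
    val logisticData = LogisticRegressionDataGenerator.generateLogisticRDD(
      sc, nexamples = 10000, nfeatures = 20, eps = 3.0, nparts = 2, probOne = 0.5)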
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
index 16f430599a..906bd30563 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
@@ -23,7 +23,7 @@ import scala.language.postfixOps
import scala.util.Random
import org.apache.spark.SparkContext
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix}
import org.apache.spark.rdd.RDD
@@ -52,7 +52,9 @@ import org.apache.spark.rdd.RDD
* testSampFact (Double) Percentage of training data to use as test data.
*/
@DeveloperApi
+@Since("0.8.0")
object MFDataGenerator {
+ @Since("0.8.0")
def main(args: Array[String]) {
if (args.length < 2) {
// scalastyle:off println
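MFDataGenerator is driven through `main` rather than a builder method. A hedged invocation sketch follows; only the two required positional arguments are shown, and their meaning as master URL and output directory is an assumption based on the generator's usage message, not spelled out in this diff:

    import org.apache.spark.mllib.util.MFDataGenerator

    // Assumed argument order: <master> <outputDir>, optional sizing args after.
    MFDataGenerator.main(Array("local[2]", "/tmp/mf-data"))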
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 4940974bf4..81c2f0ce6e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -36,6 +36,7 @@ import org.apache.spark.streaming.dstream.DStream
/**
* Helper methods to load, save and pre-process data used in ML Lib.
*/
+@Since("0.8.0")
object MLUtils {
private[mllib] lazy val EPSILON = {
@@ -168,6 +169,7 @@ object MLUtils {
*
* @see [[org.apache.spark.mllib.util.MLUtils#loadLibSVMFile]]
*/
+ @Since("1.0.0")
def saveAsLibSVMFile(data: RDD[LabeledPoint], dir: String) {
// TODO: allow to specify label precision and feature precision.
val dataStr = data.map { case LabeledPoint(label, features) =>
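A minimal round-trip sketch for the method annotated above, assuming an existing SparkContext `sc`:

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.util.MLUtils

    val examples = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))))

    MLUtils.saveAsLibSVMFile(examples, "/tmp/libsvm-data")
    val reloaded = MLUtils.loadLibSVMFile(sc, "/tmp/libsvm-data")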
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
index ad20b7694a..cde5979396 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
@@ -21,11 +21,11 @@ import scala.util.Random
import com.github.fommil.netlib.BLAS.{getInstance => blas}
-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
/**
* :: DeveloperApi ::
@@ -33,8 +33,10 @@ import org.apache.spark.mllib.regression.LabeledPoint
* for the features and adds Gaussian noise with weight 0.1 to generate labels.
*/
@DeveloperApi
+@Since("0.8.0")
object SVMDataGenerator {
+ @Since("0.8.0")
def main(args: Array[String]) {
if (args.length < 2) {
// scalastyle:off println
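Like MFDataGenerator, SVMDataGenerator exposes only `main`. A hedged sketch, with the argument meanings assumed from the `args.length < 2` check rather than documented in this diff:

    import org.apache.spark.mllib.util.SVMDataGenerator

    // Assumed argument order: <master> <output_dir>.
    SVMDataGenerator.main(Array("local[2]", "/tmp/svm-data"))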
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala
index 30d642c754..4d71d534a0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala
@@ -24,7 +24,7 @@ import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.apache.spark.SparkContext
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.types.{DataType, StructField, StructType}
@@ -35,6 +35,7 @@ import org.apache.spark.sql.types.{DataType, StructField, StructType}
* This should be inherited by the class which implements model instances.
*/
@DeveloperApi
+@Since("1.3.0")
trait Saveable {
/**
@@ -50,6 +51,7 @@ trait Saveable {
* @param path Path specifying the directory in which to save this model.
* If the directory already exists, this method throws an exception.
*/
+ @Since("1.3.0")
def save(sc: SparkContext, path: String): Unit
/** Current version of model save/load format. */
@@ -64,6 +66,7 @@ trait Saveable {
* This should be inherited by an object paired with the model class.
*/
@DeveloperApi
+@Since("1.3.0")
trait Loader[M <: Saveable] {
/**
@@ -75,6 +78,7 @@ trait Loader[M <: Saveable] {
* @param path Path specifying the directory to which the model was saved.
* @return Model instance
*/
+ @Since("1.3.0")
def load(sc: SparkContext, path: String): M
}
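A usage sketch pairing the two traits annotated above, assuming a trained LogisticRegressionModel `model` and an existing SparkContext `sc` (LogisticRegressionModel implements Saveable, and its companion object implements Loader):

    import org.apache.spark.mllib.classification.LogisticRegressionModel

    // Saveable: persist the model to a fresh directory
    model.save(sc, "/tmp/lr-model")
    // Loader: restore it via the companion object
    val restored: LogisticRegressionModel =
      LogisticRegressionModel.load(sc, "/tmp/lr-model")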