author     Sean Owen <sowen@cloudera.com>        2014-07-13 19:27:43 -0700
committer  Xiangrui Meng <meng@databricks.com>   2014-07-13 19:27:43 -0700
commit     635888cbed0e3f4127252fb84db449f0cc9ed659 (patch)
tree       43433e3393c889f25a8ef4898099664a1a5ce0a7
parent     4c8be64e768fe71643b37f1e82f619c8aeac6eff (diff)
SPARK-2363. Clean MLlib's sample data files
(Just made a PR for this; mengxr was the reporter.)

MLlib has sample data under several folders: 1) data/mllib, 2) data/, and 3) mllib/data/*. Per previous discussion with Matei Zaharia, we want to put it all under `data/mllib` and clean out outdated files.

Author: Sean Owen <sowen@cloudera.com>

Closes #1394 from srowen/SPARK-2363 and squashes the following commits:

54313dd [Sean Owen] Move ML example data from /mllib/data/ and /data/ into /data/mllib/
-rw-r--r--  data/mllib/als/test.data                (renamed from mllib/data/als/test.data)                0
-rw-r--r--  data/mllib/kmeans_data.txt              (renamed from data/kmeans_data.txt)                    0
-rwxr-xr-x  data/mllib/lr-data/random.data          (renamed from mllib/data/lr-data/random.data)          0
-rw-r--r--  data/mllib/lr_data.txt                  (renamed from data/lr_data.txt)                        0
-rw-r--r--  data/mllib/pagerank_data.txt            (renamed from data/pagerank_data.txt)                  0
-rw-r--r--  data/mllib/ridge-data/lpsa.data         (renamed from mllib/data/ridge-data/lpsa.data)         0
-rw-r--r--  data/mllib/sample_libsvm_data.txt       (renamed from mllib/data/sample_libsvm_data.txt)       0
-rw-r--r--  data/mllib/sample_naive_bayes_data.txt  (renamed from mllib/data/sample_naive_bayes_data.txt)  0
-rw-r--r--  data/mllib/sample_svm_data.txt          (renamed from mllib/data/sample_svm_data.txt)          0
-rw-r--r--  data/mllib/sample_tree_data.csv         (renamed from mllib/data/sample_tree_data.csv)         0
-rw-r--r--  docs/bagel-programming-guide.md         2
-rw-r--r--  docs/mllib-basics.md                    6
-rw-r--r--  docs/mllib-clustering.md                4
-rw-r--r--  docs/mllib-collaborative-filtering.md   4
-rw-r--r--  docs/mllib-decision-tree.md             4
-rw-r--r--  docs/mllib-linear-methods.md            8
-rw-r--r--  docs/mllib-naive-bayes.md               2
-rw-r--r--  docs/mllib-optimization.md              2
18 files changed, 16 insertions, 16 deletions
diff --git a/mllib/data/als/test.data b/data/mllib/als/test.data
index e476cc23e0..e476cc23e0 100644
--- a/mllib/data/als/test.data
+++ b/data/mllib/als/test.data
diff --git a/data/kmeans_data.txt b/data/mllib/kmeans_data.txt
index 338664f78d..338664f78d 100644
--- a/data/kmeans_data.txt
+++ b/data/mllib/kmeans_data.txt
diff --git a/mllib/data/lr-data/random.data b/data/mllib/lr-data/random.data
index 29bcb8acba..29bcb8acba 100755
--- a/mllib/data/lr-data/random.data
+++ b/data/mllib/lr-data/random.data
diff --git a/data/lr_data.txt b/data/mllib/lr_data.txt
index d4df0634e0..d4df0634e0 100644
--- a/data/lr_data.txt
+++ b/data/mllib/lr_data.txt
diff --git a/data/pagerank_data.txt b/data/mllib/pagerank_data.txt
index 95755ab8f5..95755ab8f5 100644
--- a/data/pagerank_data.txt
+++ b/data/mllib/pagerank_data.txt
diff --git a/mllib/data/ridge-data/lpsa.data b/data/mllib/ridge-data/lpsa.data
index fdd16e36b4..fdd16e36b4 100644
--- a/mllib/data/ridge-data/lpsa.data
+++ b/data/mllib/ridge-data/lpsa.data
diff --git a/mllib/data/sample_libsvm_data.txt b/data/mllib/sample_libsvm_data.txt
index 861c70cde7..861c70cde7 100644
--- a/mllib/data/sample_libsvm_data.txt
+++ b/data/mllib/sample_libsvm_data.txt
diff --git a/mllib/data/sample_naive_bayes_data.txt b/data/mllib/sample_naive_bayes_data.txt
index 981da382d6..981da382d6 100644
--- a/mllib/data/sample_naive_bayes_data.txt
+++ b/data/mllib/sample_naive_bayes_data.txt
diff --git a/mllib/data/sample_svm_data.txt b/data/mllib/sample_svm_data.txt
index 7ab30bd93c..7ab30bd93c 100644
--- a/mllib/data/sample_svm_data.txt
+++ b/data/mllib/sample_svm_data.txt
diff --git a/mllib/data/sample_tree_data.csv b/data/mllib/sample_tree_data.csv
index bc97e2941a..bc97e2941a 100644
--- a/mllib/data/sample_tree_data.csv
+++ b/data/mllib/sample_tree_data.csv
diff --git a/docs/bagel-programming-guide.md b/docs/bagel-programming-guide.md
index b280df0c8e..7e55131754 100644
--- a/docs/bagel-programming-guide.md
+++ b/docs/bagel-programming-guide.md
@@ -46,7 +46,7 @@ import org.apache.spark.bagel.Bagel._
Next, we load a sample graph from a text file as a distributed dataset and package it into `PRVertex` objects. We also cache the distributed dataset because Bagel will use it multiple times and we'd like to avoid recomputing it.
{% highlight scala %}
-val input = sc.textFile("data/pagerank_data.txt")
+val input = sc.textFile("data/mllib/pagerank_data.txt")
val numVerts = input.count()
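For context (not part of this commit's diff): a minimal standalone sketch of the Bagel guide's loading step at the new path, assuming a running SparkContext `sc`. The whitespace-separated edge format is an assumption about the sample file, not something this commit verifies.
{% highlight scala %}
// Hypothetical self-contained version of the loading step above.
val input = sc.textFile("data/mllib/pagerank_data.txt")
// Assumed format: one "source target" edge per line, whitespace-separated.
val edges = input.map { line =>
  val fields = line.split("\\s+")
  (fields(0), fields(1))
}
val numVerts = input.count()
{% endhighlight %}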
diff --git a/docs/mllib-basics.md b/docs/mllib-basics.md
index 5796e16e8f..f9585251fa 100644
--- a/docs/mllib-basics.md
+++ b/docs/mllib-basics.md
@@ -193,7 +193,7 @@ import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
-val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt")
+val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
{% endhighlight %}
</div>
@@ -207,7 +207,7 @@ import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.api.java.JavaRDD;
JavaRDD<LabeledPoint> examples =
- MLUtils.loadLibSVMFile(jsc.sc(), "mllib/data/sample_libsvm_data.txt").toJavaRDD();
+ MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
{% endhighlight %}
</div>
@@ -218,7 +218,7 @@ examples stored in LIBSVM format.
{% highlight python %}
from pyspark.mllib.util import MLUtils
-examples = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt")
+examples = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
{% endhighlight %}
</div>
</div>
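For context (not part of this commit's diff): a self-contained sketch of the LIBSVM loading call at its new location, assuming a running SparkContext `sc` with MLlib on the classpath.
{% highlight scala %}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD

// Each line is "label index1:value1 index2:value2 ..."; loadLibSVMFile
// parses it into labeled (possibly sparse) feature vectors.
val examples: RDD[LabeledPoint] =
  MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
println("Loaded " + examples.count() + " examples")
{% endhighlight %}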
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index 429cdf8d40..c76ac010d3 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -51,7 +51,7 @@ import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
// Load and parse the data
-val data = sc.textFile("data/kmeans_data.txt")
+val data = sc.textFile("data/mllib/kmeans_data.txt")
val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
// Cluster the data into two classes using KMeans
@@ -86,7 +86,7 @@ from numpy import array
from math import sqrt
# Load and parse the data
-data = sc.textFile("data/kmeans_data.txt")
+data = sc.textFile("data/mllib/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
# Build the model (cluster the data)
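For context (not part of this commit's diff): a complete Scala sketch of the k-means example with the relocated path, assuming a running SparkContext `sc`; cluster and iteration counts are illustrative, not tuned.
{% highlight scala %}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Load and parse the relocated sample (space-separated doubles per line).
val data = sc.textFile("data/mllib/kmeans_data.txt")
val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))

// Cluster the data into two classes using KMeans.
val clusters = KMeans.train(parsedData, 2, 20)

// Within-set sum of squared errors, the metric the guide evaluates.
val WSSSE = clusters.computeCost(parsedData)
println("Within Set Sum of Squared Errors = " + WSSSE)
{% endhighlight %}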
diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md
index d51002f015..5cd7173872 100644
--- a/docs/mllib-collaborative-filtering.md
+++ b/docs/mllib-collaborative-filtering.md
@@ -58,7 +58,7 @@ import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
// Load and parse the data
-val data = sc.textFile("mllib/data/als/test.data")
+val data = sc.textFile("data/mllib/als/test.data")
val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
Rating(user.toInt, item.toInt, rate.toDouble)
})
@@ -112,7 +112,7 @@ from pyspark.mllib.recommendation import ALS
from numpy import array
# Load and parse the data
-data = sc.textFile("mllib/data/als/test.data")
+data = sc.textFile("data/mllib/als/test.data")
ratings = data.map(lambda line: array([float(x) for x in line.split(',')]))
# Build the recommendation model using Alternating Least Squares
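For context (not part of this commit's diff): the ALS example above, completed as a Scala sketch against the new path, assuming a running SparkContext `sc`; rank, iteration count, and lambda are illustrative values.
{% highlight scala %}
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

// Parse "user,item,rating" triples from the relocated test data.
val data = sc.textFile("data/mllib/als/test.data")
val ratings = data.map(_.split(',') match {
  case Array(user, item, rate) => Rating(user.toInt, item.toInt, rate.toDouble)
})

// Factorize the rating matrix with Alternating Least Squares.
val model = ALS.train(ratings, 10, 20, 0.01)

// Score every (user, product) pair that appears in the input.
val usersProducts = ratings.map { case Rating(user, product, _) => (user, product) }
val predictions = model.predict(usersProducts)
{% endhighlight %}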
diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md
index 3002a66a4f..9cd768599e 100644
--- a/docs/mllib-decision-tree.md
+++ b/docs/mllib-decision-tree.md
@@ -122,7 +122,7 @@ import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Gini
// Load and parse the data file
-val data = sc.textFile("mllib/data/sample_tree_data.csv")
+val data = sc.textFile("data/mllib/sample_tree_data.csv")
val parsedData = data.map { line =>
val parts = line.split(',').map(_.toDouble)
LabeledPoint(parts(0), Vectors.dense(parts.tail))
@@ -161,7 +161,7 @@ import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
// Load and parse the data file
-val data = sc.textFile("mllib/data/sample_tree_data.csv")
+val data = sc.textFile("data/mllib/sample_tree_data.csv")
val parsedData = data.map { line =>
val parts = line.split(',').map(_.toDouble)
LabeledPoint(parts(0), Vectors.dense(parts.tail))
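For context (not part of this commit's diff): a runnable Scala sketch of the classification variant with the relocated CSV, assuming a running SparkContext `sc`; maxDepth = 5 is an illustrative setting.
{% highlight scala %}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Gini

// Parse the CSV: first column is the label, the rest are features.
val data = sc.textFile("data/mllib/sample_tree_data.csv")
val parsedData = data.map { line =>
  val parts = line.split(',').map(_.toDouble)
  LabeledPoint(parts(0), Vectors.dense(parts.tail))
}

// Train a classification tree using Gini impurity.
val model = DecisionTree.train(parsedData, Classification, Gini, 5)

// Fraction of training points the tree misclassifies.
val trainErr = parsedData.filter(p => model.predict(p.features) != p.label)
  .count().toDouble / parsedData.count()
println("Training Error = " + trainErr)
{% endhighlight %}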
diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md
index 4dfbebbcd0..b4d22e0df5 100644
--- a/docs/mllib-linear-methods.md
+++ b/docs/mllib-linear-methods.md
@@ -187,7 +187,7 @@ import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
// Load training data in LIBSVM format.
-val data = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt")
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
// Split data into training (60%) and test (40%).
val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
@@ -259,7 +259,7 @@ def parsePoint(line):
values = [float(x) for x in line.split(' ')]
return LabeledPoint(values[0], values[1:])
-data = sc.textFile("mllib/data/sample_svm_data.txt")
+data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint)
# Build the model
@@ -309,7 +309,7 @@ import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
// Load and parse the data
-val data = sc.textFile("mllib/data/ridge-data/lpsa.data")
+val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
val parsedData = data.map { line =>
val parts = line.split(',')
LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
@@ -356,7 +356,7 @@ def parsePoint(line):
values = [float(x) for x in line.replace(',', ' ').split(' ')]
return LabeledPoint(values[0], values[1:])
-data = sc.textFile("mllib/data/ridge-data/lpsa.data")
+data = sc.textFile("data/mllib/ridge-data/lpsa.data")
parsedData = data.map(parsePoint)
# Build the model
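For context (not part of this commit's diff): the linear SVM example from the top of this guide, completed as a Scala sketch against the new path; assumes a running SparkContext `sc`, and the iteration count and seed are illustrative.
{% highlight scala %}
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.util.MLUtils

// Load training data in LIBSVM format from the new location.
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

// Split data into training (60%) and test (40%); the seed makes runs repeatable.
val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0).cache()
val test = splits(1)

// Train a linear SVM with stochastic gradient descent.
val model = SVMWithSGD.train(training, 100)

// Fraction of test points classified correctly.
val predictionAndLabels = test.map(p => (model.predict(p.features), p.label))
val accuracy = predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / test.count()
println("Test accuracy = " + accuracy)
{% endhighlight %}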
diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index 1d1d7dcf6f..b1650c83c9 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -40,7 +40,7 @@ import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
-val data = sc.textFile("mllib/data/sample_naive_bayes_data.txt")
+val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
val parsedData = data.map { line =>
val parts = line.split(',')
LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
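For context (not part of this commit's diff): the naive Bayes example completed as a Scala sketch against the new path, assuming a running SparkContext `sc`; the smoothing parameter is an illustrative value.
{% highlight scala %}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Parse "label, f1 f2 f3 ..." lines from the relocated sample file.
val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
val parsedData = data.map { line =>
  val parts = line.split(',')
  LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
}

// Train with additive (Laplace) smoothing, lambda = 1.0.
val model = NaiveBayes.train(parsedData, 1.0)
{% endhighlight %}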
diff --git a/docs/mllib-optimization.md b/docs/mllib-optimization.md
index ae9ede58e8..651958c781 100644
--- a/docs/mllib-optimization.md
+++ b/docs/mllib-optimization.md
@@ -214,7 +214,7 @@ import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.classification.LogisticRegressionModel
-val data = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt")
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
val numFeatures = data.take(1)(0).features.size
// Split data into training (60%) and test (40%).
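For context (not part of this commit's diff): a sketch of how the guide's L-BFGS example continues from the lines above, assuming a running SparkContext `sc`; the optimizer settings below are illustrative values in the spirit of the guide, not tuned.
{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{LBFGS, LogisticGradient, SquaredL2Updater}
import org.apache.spark.mllib.util.MLUtils

val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
val numFeatures = data.take(1)(0).features.size

// Append a bias term and convert to the (label, features) pairs runLBFGS expects.
val training = data.map(x => (x.label, MLUtils.appendBias(x.features))).cache()

// Minimize logistic loss with L2 regularization via L-BFGS.
val (weightsWithIntercept, loss) = LBFGS.runLBFGS(
  training,
  new LogisticGradient(),
  new SquaredL2Updater(),
  10,    // number of corrections used in the L-BFGS update
  1e-4,  // convergence tolerance
  20,    // maximum number of iterations
  0.1,   // regularization parameter
  Vectors.dense(new Array[Double](numFeatures + 1)))  // initial weights, incl. intercept
{% endhighlight %}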