diff options
author | Xusen Yin <yinxusen@gmail.com> | 2014-04-04 11:12:47 -0700 |
---|---|---|
committer | Matei Zaharia <matei@databricks.com> | 2014-04-04 11:12:47 -0700 |
commit | f1fa617023d30d8cdc5acef0274bad8cc3e89cea (patch) | |
tree | 5eaf3f485c66a74ea260afbc3a0b941ad1621579 /core/src/test/java | |
parent | 01cf4c402b9fda59680e56112bfaa2b748416d0e (diff) | |
download | spark-f1fa617023d30d8cdc5acef0274bad8cc3e89cea.tar.gz spark-f1fa617023d30d8cdc5acef0274bad8cc3e89cea.tar.bz2 spark-f1fa617023d30d8cdc5acef0274bad8cc3e89cea.zip |
[SPARK-1133] Add whole text files reader in MLlib
Here is a pointer to the former [PR164](https://github.com/apache/spark/pull/164).
I add the pull request for the JIRA issue [SPARK-1133](https://spark-project.atlassian.net/browse/SPARK-1133), which brings a new whole-text-files reader API into MLlib.
Author: Xusen Yin <yinxusen@gmail.com>
Closes #252 from yinxusen/whole-files-input and squashes the following commits:
7191be6 [Xusen Yin] refine comments
0af3faf [Xusen Yin] add JavaAPI test
01745ee [Xusen Yin] fix deletion error
cc97dca [Xusen Yin] move whole text file API to Spark core
d792cee [Xusen Yin] remove the typo character "+"
6bdf2c2 [Xusen Yin] test for small local file system block size
a1f1e7e [Xusen Yin] add two extra spaces
28cb0fe [Xusen Yin] add whole text files reader
Diffstat (limited to 'core/src/test/java')
-rw-r--r-- | core/src/test/java/org/apache/spark/JavaAPISuite.java | 30 |
1 files changed, 27 insertions, 3 deletions
diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index c6b65c7348..2372f2d992 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -17,9 +17,7 @@ package org.apache.spark; -import java.io.File; -import java.io.IOException; -import java.io.Serializable; +import java.io.*; import java.util.*; import scala.Tuple2; @@ -600,6 +598,32 @@ public class JavaAPISuite implements Serializable { } @Test + public void wholeTextFiles() throws IOException { + byte[] content1 = "spark is easy to use.\n".getBytes(); + byte[] content2 = "spark is also easy to use.\n".getBytes(); + + File tempDir = Files.createTempDir(); + String tempDirName = tempDir.getAbsolutePath(); + DataOutputStream ds = new DataOutputStream(new FileOutputStream(tempDirName + "/part-00000")); + ds.write(content1); + ds.close(); + ds = new DataOutputStream(new FileOutputStream(tempDirName + "/part-00001")); + ds.write(content2); + ds.close(); + + HashMap<String, String> container = new HashMap<String, String>(); + container.put(tempDirName+"/part-00000", new Text(content1).toString()); + container.put(tempDirName+"/part-00001", new Text(content2).toString()); + + JavaPairRDD<String, String> readRDD = sc.wholeTextFiles(tempDirName); + List<Tuple2<String, String>> result = readRDD.collect(); + + for (Tuple2<String, String> res : result) { + Assert.assertEquals(res._2(), container.get(res._1())); + } + } + + @Test public void textFilesCompressed() throws IOException { File tempDir = Files.createTempDir(); String outputDir = new File(tempDir, "output").getAbsolutePath(); |