From e5c4cd8a5e188592f8786a265c0cd073c69ac886 Mon Sep 17 00:00:00 2001
From: Matei Zaharia
Date: Tue, 1 Feb 2011 15:11:08 -0800
Subject: Made examples and core subprojects
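
This moves the Spark runtime into a core/ subproject and the example
programs into an examples/ subproject, with the sbt build definition in
project/build/SparkProject.scala updated to tie the two together. As a
rough sketch of what an sbt 0.7 multi-project definition of this shape
looks like (the class and project names below are illustrative
assumptions, not the actual contents of SparkProject.scala):

    import sbt._

    // Sketch of an sbt 0.7 parent project with two subprojects, matching
    // the new core/ and examples/ directory layout in this patch.
    class SparkProject(info: ProjectInfo) extends ParentProject(info) {
      // Each subproject lives in its own directory with its own src/ tree
      // and lib/ directory for unmanaged jars.
      lazy val core = project("core", "Spark Core", new CoreProject(_))

      // The examples subproject depends on core at compile time.
      lazy val examples =
        project("examples", "Spark Examples", new ExamplesProject(_), core)

      class CoreProject(info: ProjectInfo) extends DefaultProject(info)
      class ExamplesProject(info: ProjectInfo) extends DefaultProject(info)
    }

With a layout like this, `sbt compile` at the root builds both
subprojects in dependency order, and the unmanaged jars under
core/lib/ are picked up on the core classpath automatically.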
---
core/lib/apache-log4j-1.2.16/log4j-1.2.16.jar | Bin 0 -> 481534 bytes
core/lib/asm-3.2/.DS_Store | Bin 0 -> 6148 bytes
core/lib/asm-3.2/lib/all/README.txt | 3 +
core/lib/asm-3.2/lib/all/asm-all-3.2.jar | Bin 0 -> 207939 bytes
core/lib/asm-3.2/lib/all/asm-all-3.2.pom | 15 +
core/lib/asm-3.2/lib/all/asm-debug-all-3.2.jar | Bin 0 -> 305420 bytes
core/lib/asm-3.2/lib/all/asm-debug-all-3.2.pom | 15 +
core/lib/asm-3.2/lib/asm-3.2.jar | Bin 0 -> 43401 bytes
core/lib/asm-3.2/lib/asm-3.2.pom | 14 +
core/lib/asm-3.2/lib/asm-analysis-3.2.jar | Bin 0 -> 17988 bytes
core/lib/asm-3.2/lib/asm-analysis-3.2.pom | 21 +
core/lib/asm-3.2/lib/asm-commons-3.2.jar | Bin 0 -> 37619 bytes
core/lib/asm-3.2/lib/asm-commons-3.2.pom | 21 +
core/lib/asm-3.2/lib/asm-parent-3.2.pom | 136 +
core/lib/asm-3.2/lib/asm-tree-3.2.jar | Bin 0 -> 21881 bytes
core/lib/asm-3.2/lib/asm-tree-3.2.pom | 21 +
core/lib/asm-3.2/lib/asm-util-3.2.jar | Bin 0 -> 36552 bytes
core/lib/asm-3.2/lib/asm-util-3.2.pom | 21 +
core/lib/asm-3.2/lib/asm-xml-3.2.jar | Bin 0 -> 51856 bytes
core/lib/asm-3.2/lib/asm-xml-3.2.pom | 21 +
core/lib/colt.jar | Bin 0 -> 581945 bytes
core/lib/guava-r07/COPYING | 202 +
core/lib/guava-r07/README | 28 +
core/lib/guava-r07/guava-r07.jar | Bin 0 -> 1075964 bytes
core/lib/hadoop-0.20.0/.DS_Store | Bin 0 -> 6148 bytes
core/lib/hadoop-0.20.0/CHANGES.txt | 8288 +++
core/lib/hadoop-0.20.0/LICENSE.txt | 244 +
core/lib/hadoop-0.20.0/NOTICE.txt | 2 +
core/lib/hadoop-0.20.0/README.txt | 31 +
core/lib/hadoop-0.20.0/bin/hadoop | 289 +
core/lib/hadoop-0.20.0/bin/hadoop-config.sh | 68 +
core/lib/hadoop-0.20.0/bin/hadoop-daemon.sh | 143 +
core/lib/hadoop-0.20.0/bin/hadoop-daemons.sh | 34 +
core/lib/hadoop-0.20.0/bin/rcc | 99 +
core/lib/hadoop-0.20.0/bin/slaves.sh | 68 +
core/lib/hadoop-0.20.0/bin/start-all.sh | 30 +
core/lib/hadoop-0.20.0/bin/start-balancer.sh | 25 +
core/lib/hadoop-0.20.0/bin/start-dfs.sh | 52 +
core/lib/hadoop-0.20.0/bin/start-mapred.sh | 29 +
core/lib/hadoop-0.20.0/bin/stop-all.sh | 27 +
core/lib/hadoop-0.20.0/bin/stop-balancer.sh | 26 +
core/lib/hadoop-0.20.0/bin/stop-dfs.sh | 29 +
core/lib/hadoop-0.20.0/bin/stop-mapred.sh | 28 +
core/lib/hadoop-0.20.0/build.xml | 1796 +
.../c++/Linux-amd64-64/include/hadoop/Pipes.hh | 258 +
.../Linux-amd64-64/include/hadoop/SerialUtils.hh | 169 +
.../Linux-amd64-64/include/hadoop/StringUtils.hh | 81 +
.../include/hadoop/TemplateFactory.hh | 96 +
.../c++/Linux-amd64-64/lib/libhadooppipes.a | Bin 0 -> 318270 bytes
.../c++/Linux-amd64-64/lib/libhadooputils.a | Bin 0 -> 88620 bytes
.../c++/Linux-i386-32/include/hadoop/Pipes.hh | 258 +
.../Linux-i386-32/include/hadoop/SerialUtils.hh | 169 +
.../Linux-i386-32/include/hadoop/StringUtils.hh | 81 +
.../include/hadoop/TemplateFactory.hh | 96 +
.../c++/Linux-i386-32/lib/libhadooppipes.a | Bin 0 -> 226390 bytes
.../c++/Linux-i386-32/lib/libhadooputils.a | Bin 0 -> 62576 bytes
.../hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.la | 41 +
.../hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so | Bin 0 -> 41611 bytes
.../c++/Linux-i386-32/lib/libhdfs.so.0 | Bin 0 -> 41611 bytes
.../c++/Linux-i386-32/lib/libhdfs.so.0.0.0 | Bin 0 -> 41611 bytes
core/lib/hadoop-0.20.0/conf/capacity-scheduler.xml | 156 +
core/lib/hadoop-0.20.0/conf/configuration.xsl | 24 +
core/lib/hadoop-0.20.0/conf/core-site.xml | 8 +
core/lib/hadoop-0.20.0/conf/hadoop-env.sh | 54 +
.../hadoop-0.20.0/conf/hadoop-metrics.properties | 40 +
core/lib/hadoop-0.20.0/conf/hadoop-policy.xml | 97 +
core/lib/hadoop-0.20.0/conf/hdfs-site.xml | 8 +
core/lib/hadoop-0.20.0/conf/log4j.properties | 94 +
core/lib/hadoop-0.20.0/conf/mapred-site.xml | 8 +
core/lib/hadoop-0.20.0/conf/masters | 1 +
core/lib/hadoop-0.20.0/conf/slaves | 1 +
core/lib/hadoop-0.20.0/conf/ssl-client.xml.example | 57 +
core/lib/hadoop-0.20.0/conf/ssl-server.xml.example | 55 +
.../hadoop-0.20.0-capacity-scheduler.jar | Bin 0 -> 51224 bytes
.../contrib/datajoin/hadoop-0.20.0-datajoin.jar | Bin 0 -> 12667 bytes
.../hadoop-0.20.0-eclipse-plugin.jar | Bin 0 -> 3009728 bytes
.../fairscheduler/hadoop-0.20.0-fairscheduler.jar | Bin 0 -> 37087 bytes
core/lib/hadoop-0.20.0/contrib/hdfsproxy/README | 30 +
.../hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy | 170 +
.../contrib/hdfsproxy/bin/hdfsproxy-config.sh | 67 +
.../contrib/hdfsproxy/bin/hdfsproxy-daemon.sh | 141 +
.../contrib/hdfsproxy/bin/hdfsproxy-daemons.sh | 34 +
.../contrib/hdfsproxy/bin/hdfsproxy-slaves.sh | 68 +
.../contrib/hdfsproxy/bin/start-hdfsproxy.sh | 37 +
.../contrib/hdfsproxy/bin/stop-hdfsproxy.sh | 28 +
core/lib/hadoop-0.20.0/contrib/hdfsproxy/build.xml | 183 +
.../contrib/hdfsproxy/conf/configuration.xsl | 24 +
.../contrib/hdfsproxy/conf/hdfsproxy-default.xml | 59 +
.../contrib/hdfsproxy/conf/hdfsproxy-env.sh | 44 +
.../hdfsproxy/conf/hdfsproxy-env.sh.template | 44 +
.../contrib/hdfsproxy/conf/hdfsproxy-hosts | 1 +
.../contrib/hdfsproxy/conf/log4j.properties | 61 +
.../contrib/hdfsproxy/conf/user-certs.xml | 26 +
.../contrib/hdfsproxy/conf/user-permissions.xml | 28 +
.../contrib/hdfsproxy/hdfsproxy-1.0.jar | Bin 0 -> 21572 bytes
.../contrib/index/hadoop-0.20.0-index.jar | Bin 0 -> 63178 bytes
.../contrib/streaming/hadoop-0.20.0-streaming.jar | Bin 0 -> 68304 bytes
.../contrib/thriftfs/hadoop-0.20.0-thriftfs.jar | Bin 0 -> 10434 bytes
.../lib/hadoop-0.20.0/contrib/vaidya/bin/vaidya.sh | 47 +
.../contrib/vaidya/conf/postex_diagnosis_tests.xml | 104 +
.../contrib/vaidya/hadoop-0.20.0-vaidya.jar | Bin 0 -> 42201 bytes
core/lib/hadoop-0.20.0/hadoop-0.20.0-ant.jar | Bin 0 -> 6839 bytes
core/lib/hadoop-0.20.0/hadoop-0.20.0-core.jar | Bin 0 -> 2585066 bytes
core/lib/hadoop-0.20.0/hadoop-0.20.0-examples.jar | Bin 0 -> 142465 bytes
core/lib/hadoop-0.20.0/hadoop-0.20.0-test.jar | Bin 0 -> 1440518 bytes
core/lib/hadoop-0.20.0/hadoop-0.20.0-tools.jar | Bin 0 -> 69804 bytes
core/lib/hadoop-0.20.0/ivy.xml | 261 +
core/lib/hadoop-0.20.0/ivy/hadoop-core.pom | 257 +
core/lib/hadoop-0.20.0/ivy/ivy-2.0.0-rc2.jar | Bin 0 -> 893199 bytes
core/lib/hadoop-0.20.0/ivy/ivysettings.xml | 81 +
core/lib/hadoop-0.20.0/ivy/libraries.properties | 71 +
core/lib/hadoop-0.20.0/lib/.DS_Store | Bin 0 -> 6148 bytes
.../hadoop-0.20.0/lib/commons-cli-2.0-SNAPSHOT.jar | Bin 0 -> 258337 bytes
core/lib/hadoop-0.20.0/lib/commons-codec-1.3.jar | Bin 0 -> 46725 bytes
core/lib/hadoop-0.20.0/lib/commons-el-1.0.jar | Bin 0 -> 112341 bytes
.../hadoop-0.20.0/lib/commons-httpclient-3.0.1.jar | Bin 0 -> 279781 bytes
.../hadoop-0.20.0/lib/commons-logging-1.0.4.jar | Bin 0 -> 38015 bytes
.../lib/commons-logging-api-1.0.4.jar | Bin 0 -> 26202 bytes
core/lib/hadoop-0.20.0/lib/commons-net-1.4.1.jar | Bin 0 -> 180792 bytes
core/lib/hadoop-0.20.0/lib/core-3.1.1.jar | Bin 0 -> 3566844 bytes
.../hadoop-0.20.0/lib/hsqldb-1.8.0.10.LICENSE.txt | 66 +
core/lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.jar | Bin 0 -> 706710 bytes
.../hadoop-0.20.0/lib/jasper-compiler-5.5.12.jar | Bin 0 -> 405086 bytes
.../hadoop-0.20.0/lib/jasper-runtime-5.5.12.jar | Bin 0 -> 76698 bytes
core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.17.0.xml | 43272 +++++++++++++++
core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.1.xml | 44778 ++++++++++++++++
core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.2.xml | 38788 ++++++++++++++
core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.3.xml | 38826 ++++++++++++++
core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.0.xml | 43972 ++++++++++++++++
core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.1.xml | 44195 ++++++++++++++++
core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.20.0.xml | 52140 +++++++++++++++++++
core/lib/hadoop-0.20.0/lib/jets3t-0.6.1.jar | Bin 0 -> 321806 bytes
core/lib/hadoop-0.20.0/lib/jetty-6.1.14.jar | Bin 0 -> 516429 bytes
core/lib/hadoop-0.20.0/lib/jetty-util-6.1.14.jar | Bin 0 -> 163121 bytes
core/lib/hadoop-0.20.0/lib/jsp-2.1/jsp-2.1.jar | Bin 0 -> 1024681 bytes
core/lib/hadoop-0.20.0/lib/jsp-2.1/jsp-api-2.1.jar | Bin 0 -> 134910 bytes
core/lib/hadoop-0.20.0/lib/junit-3.8.1.jar | Bin 0 -> 121070 bytes
core/lib/hadoop-0.20.0/lib/kfs-0.2.2.jar | Bin 0 -> 11428 bytes
core/lib/hadoop-0.20.0/lib/kfs-0.2.LICENSE.txt | 202 +
core/lib/hadoop-0.20.0/lib/log4j-1.2.15.jar | Bin 0 -> 391834 bytes
core/lib/hadoop-0.20.0/lib/native/.DS_Store | Bin 0 -> 6148 bytes
.../lib/native/Linux-amd64-64/libhadoop.a | Bin 0 -> 101536 bytes
.../lib/native/Linux-amd64-64/libhadoop.la | 35 +
.../lib/native/Linux-amd64-64/libhadoop.so | Bin 0 -> 64941 bytes
.../lib/native/Linux-amd64-64/libhadoop.so.1 | Bin 0 -> 64941 bytes
.../lib/native/Linux-amd64-64/libhadoop.so.1.0.0 | Bin 0 -> 64941 bytes
.../lib/native/Linux-i386-32/libhadoop.a | Bin 0 -> 76446 bytes
.../lib/native/Linux-i386-32/libhadoop.la | 35 +
.../lib/native/Linux-i386-32/libhadoop.so | Bin 0 -> 59620 bytes
.../lib/native/Linux-i386-32/libhadoop.so.1 | Bin 0 -> 59620 bytes
.../lib/native/Linux-i386-32/libhadoop.so.1.0.0 | Bin 0 -> 59620 bytes
core/lib/hadoop-0.20.0/lib/oro-2.0.8.jar | Bin 0 -> 65261 bytes
.../hadoop-0.20.0/lib/servlet-api-2.5-6.1.14.jar | Bin 0 -> 132368 bytes
core/lib/hadoop-0.20.0/lib/xmlenc-0.52.jar | Bin 0 -> 15010 bytes
core/lib/hadoop-0.20.0/librecordio/librecordio.a | Bin 0 -> 2520838 bytes
.../hadoop-0.20.0/webapps/datanode/WEB-INF/web.xml | 40 +
.../lib/hadoop-0.20.0/webapps/hdfs/WEB-INF/web.xml | 40 +
core/lib/hadoop-0.20.0/webapps/hdfs/index.html | 20 +
core/lib/hadoop-0.20.0/webapps/job/WEB-INF/web.xml | 180 +
core/lib/hadoop-0.20.0/webapps/job/index.html | 20 +
.../hadoop-0.20.0/webapps/static/hadoop-logo.jpg | Bin 0 -> 9443 bytes
core/lib/hadoop-0.20.0/webapps/static/hadoop.css | 134 +
core/lib/hadoop-0.20.0/webapps/static/jobconf.xsl | 18 +
.../lib/hadoop-0.20.0/webapps/static/jobtracker.js | 151 +
.../lib/hadoop-0.20.0/webapps/task/WEB-INF/web.xml | 20 +
core/lib/hadoop-0.20.0/webapps/task/index.html | 1 +
.../jetty-server-7.1.6.v20100715.jar | Bin 0 -> 647178 bytes
core/lib/jetty-7.1.6.v20100715/servlet-api-2.5.jar | Bin 0 -> 105112 bytes
core/lib/jline.jar | Bin 0 -> 87543 bytes
core/lib/liblzf-3.5/Changes | 125 +
core/lib/liblzf-3.5/LICENSE | 27 +
core/lib/liblzf-3.5/Makefile | 66 +
core/lib/liblzf-3.5/Makefile.in | 66 +
core/lib/liblzf-3.5/README | 29 +
core/lib/liblzf-3.5/config.h | 17 +
core/lib/liblzf-3.5/config.h.in | 16 +
core/lib/liblzf-3.5/config.log | 515 +
core/lib/liblzf-3.5/config.status | 826 +
core/lib/liblzf-3.5/configure | 7871 +++
core/lib/liblzf-3.5/configure.ac | 25 +
core/lib/liblzf-3.5/crc32.h | 65 +
core/lib/liblzf-3.5/cs/CLZF.cs | 344 +
core/lib/liblzf-3.5/cs/README | 7 +
core/lib/liblzf-3.5/install-sh | 251 +
core/lib/liblzf-3.5/lzf.c | 537 +
core/lib/liblzf-3.5/lzf.h | 100 +
core/lib/liblzf-3.5/lzfP.h | 159 +
core/lib/liblzf-3.5/lzf_c.c | 296 +
core/lib/liblzf-3.5/lzf_d.c | 148 +
core/lib/mesos.jar | Bin 0 -> 33618 bytes
core/lib/scalacheck_2.8.0-1.7.jar | Bin 0 -> 745883 bytes
core/lib/scalatest-1.2/LICENSE | 202 +
core/lib/scalatest-1.2/NOTICE | 7 +
core/lib/scalatest-1.2/README.txt | 58 +
core/lib/scalatest-1.2/scalatest-1.2.jar | Bin 0 -> 1784096 bytes
core/lib/slf4j-1.6.1/slf4j-api-1.6.1.jar | Bin 0 -> 25496 bytes
core/lib/slf4j-1.6.1/slf4j-log4j12-1.6.1.jar | Bin 0 -> 9753 bytes
core/src/main/java/spark/compress/lzf/LZF.java | 27 +
.../java/spark/compress/lzf/LZFInputStream.java | 180 +
.../java/spark/compress/lzf/LZFOutputStream.java | 85 +
core/src/main/native/.gitignore | 3 +
core/src/main/native/Makefile | 40 +
core/src/main/native/spark_compress_lzf_LZF.c | 90 +
core/src/main/scala/spark/Accumulators.scala | 73 +
core/src/main/scala/spark/BoundedMemoryCache.scala | 69 +
core/src/main/scala/spark/Broadcast.scala | 799 +
core/src/main/scala/spark/Cache.scala | 63 +
core/src/main/scala/spark/ClosureCleaner.scala | 159 +
core/src/main/scala/spark/DfsShuffle.scala | 120 +
core/src/main/scala/spark/Executor.scala | 116 +
core/src/main/scala/spark/HadoopFile.scala | 118 +
core/src/main/scala/spark/HttpServer.scala | 67 +
core/src/main/scala/spark/Job.scala | 18 +
core/src/main/scala/spark/LocalFileShuffle.scala | 171 +
core/src/main/scala/spark/LocalScheduler.scala | 72 +
core/src/main/scala/spark/Logging.scala | 49 +
core/src/main/scala/spark/MesosScheduler.scala | 294 +
core/src/main/scala/spark/NumberedSplitRDD.scala | 42 +
core/src/main/scala/spark/ParallelArray.scala | 76 +
core/src/main/scala/spark/RDD.scala | 418 +
core/src/main/scala/spark/Scheduler.scala | 10 +
.../main/scala/spark/SerializableWritable.scala | 26 +
core/src/main/scala/spark/Shuffle.scala | 15 +
core/src/main/scala/spark/SimpleJob.scala | 272 +
core/src/main/scala/spark/SizeEstimator.scala | 160 +
core/src/main/scala/spark/SoftReferenceCache.scala | 13 +
core/src/main/scala/spark/SparkContext.scala | 175 +
core/src/main/scala/spark/SparkException.scala | 3 +
core/src/main/scala/spark/Split.scala | 13 +
core/src/main/scala/spark/Task.scala | 16 +
core/src/main/scala/spark/TaskResult.scala | 9 +
core/src/main/scala/spark/Utils.scala | 127 +
core/src/main/scala/spark/WeakReferenceCache.scala | 14 +
.../scala/spark/repl/ExecutorClassLoader.scala | 108 +
core/src/main/scala/spark/repl/Main.scala | 16 +
.../main/scala/spark/repl/SparkCompletion.scala | 353 +
.../scala/spark/repl/SparkCompletionOutput.scala | 92 +
.../scala/spark/repl/SparkInteractiveReader.scala | 60 +
.../main/scala/spark/repl/SparkInterpreter.scala | 1395 +
.../scala/spark/repl/SparkInterpreterLoop.scala | 659 +
.../spark/repl/SparkInterpreterSettings.scala | 112 +
.../main/scala/spark/repl/SparkJLineReader.scala | 38 +
.../main/scala/spark/repl/SparkSimpleReader.scala | 33 +
.../test/scala/spark/ParallelArraySplitSuite.scala | 161 +
core/src/test/scala/spark/ShuffleSuite.scala | 130 +
core/src/test/scala/spark/repl/ReplSuite.scala | 142 +
examples/src/main/scala/BroadcastTest.scala | 28 +
examples/src/main/scala/CpuHog.scala | 24 +
examples/src/main/scala/HdfsTest.scala | 16 +
examples/src/main/scala/LocalALS.scala | 119 +
examples/src/main/scala/LocalFileLR.scala | 36 +
examples/src/main/scala/LocalLR.scala | 41 +
examples/src/main/scala/LocalPi.scala | 15 +
examples/src/main/scala/SleepJob.scala | 19 +
examples/src/main/scala/SparkALS.scala | 139 +
examples/src/main/scala/SparkHdfsLR.scala | 51 +
examples/src/main/scala/SparkLR.scala | 49 +
examples/src/main/scala/SparkPi.scala | 21 +
examples/src/main/scala/Vector.scala | 63 +
lib/apache-log4j-1.2.16/log4j-1.2.16.jar | Bin 481534 -> 0 bytes
lib/asm-3.2/.DS_Store | Bin 6148 -> 0 bytes
lib/asm-3.2/lib/all/README.txt | 3 -
lib/asm-3.2/lib/all/asm-all-3.2.jar | Bin 207939 -> 0 bytes
lib/asm-3.2/lib/all/asm-all-3.2.pom | 15 -
lib/asm-3.2/lib/all/asm-debug-all-3.2.jar | Bin 305420 -> 0 bytes
lib/asm-3.2/lib/all/asm-debug-all-3.2.pom | 15 -
lib/asm-3.2/lib/asm-3.2.jar | Bin 43401 -> 0 bytes
lib/asm-3.2/lib/asm-3.2.pom | 14 -
lib/asm-3.2/lib/asm-analysis-3.2.jar | Bin 17988 -> 0 bytes
lib/asm-3.2/lib/asm-analysis-3.2.pom | 21 -
lib/asm-3.2/lib/asm-commons-3.2.jar | Bin 37619 -> 0 bytes
lib/asm-3.2/lib/asm-commons-3.2.pom | 21 -
lib/asm-3.2/lib/asm-parent-3.2.pom | 136 -
lib/asm-3.2/lib/asm-tree-3.2.jar | Bin 21881 -> 0 bytes
lib/asm-3.2/lib/asm-tree-3.2.pom | 21 -
lib/asm-3.2/lib/asm-util-3.2.jar | Bin 36552 -> 0 bytes
lib/asm-3.2/lib/asm-util-3.2.pom | 21 -
lib/asm-3.2/lib/asm-xml-3.2.jar | Bin 51856 -> 0 bytes
lib/asm-3.2/lib/asm-xml-3.2.pom | 21 -
lib/colt.jar | Bin 581945 -> 0 bytes
lib/guava-r07/COPYING | 202 -
lib/guava-r07/README | 28 -
lib/guava-r07/guava-r07.jar | Bin 1075964 -> 0 bytes
lib/hadoop-0.20.0/.DS_Store | Bin 6148 -> 0 bytes
lib/hadoop-0.20.0/CHANGES.txt | 8288 ---
lib/hadoop-0.20.0/LICENSE.txt | 244 -
lib/hadoop-0.20.0/NOTICE.txt | 2 -
lib/hadoop-0.20.0/README.txt | 31 -
lib/hadoop-0.20.0/bin/hadoop | 289 -
lib/hadoop-0.20.0/bin/hadoop-config.sh | 68 -
lib/hadoop-0.20.0/bin/hadoop-daemon.sh | 143 -
lib/hadoop-0.20.0/bin/hadoop-daemons.sh | 34 -
lib/hadoop-0.20.0/bin/rcc | 99 -
lib/hadoop-0.20.0/bin/slaves.sh | 68 -
lib/hadoop-0.20.0/bin/start-all.sh | 30 -
lib/hadoop-0.20.0/bin/start-balancer.sh | 25 -
lib/hadoop-0.20.0/bin/start-dfs.sh | 52 -
lib/hadoop-0.20.0/bin/start-mapred.sh | 29 -
lib/hadoop-0.20.0/bin/stop-all.sh | 27 -
lib/hadoop-0.20.0/bin/stop-balancer.sh | 26 -
lib/hadoop-0.20.0/bin/stop-dfs.sh | 29 -
lib/hadoop-0.20.0/bin/stop-mapred.sh | 28 -
lib/hadoop-0.20.0/build.xml | 1796 -
.../c++/Linux-amd64-64/include/hadoop/Pipes.hh | 258 -
.../Linux-amd64-64/include/hadoop/SerialUtils.hh | 169 -
.../Linux-amd64-64/include/hadoop/StringUtils.hh | 81 -
.../include/hadoop/TemplateFactory.hh | 96 -
.../c++/Linux-amd64-64/lib/libhadooppipes.a | Bin 318270 -> 0 bytes
.../c++/Linux-amd64-64/lib/libhadooputils.a | Bin 88620 -> 0 bytes
.../c++/Linux-i386-32/include/hadoop/Pipes.hh | 258 -
.../Linux-i386-32/include/hadoop/SerialUtils.hh | 169 -
.../Linux-i386-32/include/hadoop/StringUtils.hh | 81 -
.../include/hadoop/TemplateFactory.hh | 96 -
.../c++/Linux-i386-32/lib/libhadooppipes.a | Bin 226390 -> 0 bytes
.../c++/Linux-i386-32/lib/libhadooputils.a | Bin 62576 -> 0 bytes
lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.la | 41 -
lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so | Bin 41611 -> 0 bytes
.../c++/Linux-i386-32/lib/libhdfs.so.0 | Bin 41611 -> 0 bytes
.../c++/Linux-i386-32/lib/libhdfs.so.0.0.0 | Bin 41611 -> 0 bytes
lib/hadoop-0.20.0/conf/capacity-scheduler.xml | 156 -
lib/hadoop-0.20.0/conf/configuration.xsl | 24 -
lib/hadoop-0.20.0/conf/core-site.xml | 8 -
lib/hadoop-0.20.0/conf/hadoop-env.sh | 54 -
lib/hadoop-0.20.0/conf/hadoop-metrics.properties | 40 -
lib/hadoop-0.20.0/conf/hadoop-policy.xml | 97 -
lib/hadoop-0.20.0/conf/hdfs-site.xml | 8 -
lib/hadoop-0.20.0/conf/log4j.properties | 94 -
lib/hadoop-0.20.0/conf/mapred-site.xml | 8 -
lib/hadoop-0.20.0/conf/masters | 1 -
lib/hadoop-0.20.0/conf/slaves | 1 -
lib/hadoop-0.20.0/conf/ssl-client.xml.example | 57 -
lib/hadoop-0.20.0/conf/ssl-server.xml.example | 55 -
.../hadoop-0.20.0-capacity-scheduler.jar | Bin 51224 -> 0 bytes
.../contrib/datajoin/hadoop-0.20.0-datajoin.jar | Bin 12667 -> 0 bytes
.../hadoop-0.20.0-eclipse-plugin.jar | Bin 3009728 -> 0 bytes
.../fairscheduler/hadoop-0.20.0-fairscheduler.jar | Bin 37087 -> 0 bytes
lib/hadoop-0.20.0/contrib/hdfsproxy/README | 30 -
lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy | 170 -
.../contrib/hdfsproxy/bin/hdfsproxy-config.sh | 67 -
.../contrib/hdfsproxy/bin/hdfsproxy-daemon.sh | 141 -
.../contrib/hdfsproxy/bin/hdfsproxy-daemons.sh | 34 -
.../contrib/hdfsproxy/bin/hdfsproxy-slaves.sh | 68 -
.../contrib/hdfsproxy/bin/start-hdfsproxy.sh | 37 -
.../contrib/hdfsproxy/bin/stop-hdfsproxy.sh | 28 -
lib/hadoop-0.20.0/contrib/hdfsproxy/build.xml | 183 -
.../contrib/hdfsproxy/conf/configuration.xsl | 24 -
.../contrib/hdfsproxy/conf/hdfsproxy-default.xml | 59 -
.../contrib/hdfsproxy/conf/hdfsproxy-env.sh | 44 -
.../hdfsproxy/conf/hdfsproxy-env.sh.template | 44 -
.../contrib/hdfsproxy/conf/hdfsproxy-hosts | 1 -
.../contrib/hdfsproxy/conf/log4j.properties | 61 -
.../contrib/hdfsproxy/conf/user-certs.xml | 26 -
.../contrib/hdfsproxy/conf/user-permissions.xml | 28 -
.../contrib/hdfsproxy/hdfsproxy-1.0.jar | Bin 21572 -> 0 bytes
.../contrib/index/hadoop-0.20.0-index.jar | Bin 63178 -> 0 bytes
.../contrib/streaming/hadoop-0.20.0-streaming.jar | Bin 68304 -> 0 bytes
.../contrib/thriftfs/hadoop-0.20.0-thriftfs.jar | Bin 10434 -> 0 bytes
lib/hadoop-0.20.0/contrib/vaidya/bin/vaidya.sh | 47 -
.../contrib/vaidya/conf/postex_diagnosis_tests.xml | 104 -
.../contrib/vaidya/hadoop-0.20.0-vaidya.jar | Bin 42201 -> 0 bytes
lib/hadoop-0.20.0/hadoop-0.20.0-ant.jar | Bin 6839 -> 0 bytes
lib/hadoop-0.20.0/hadoop-0.20.0-core.jar | Bin 2585066 -> 0 bytes
lib/hadoop-0.20.0/hadoop-0.20.0-examples.jar | Bin 142465 -> 0 bytes
lib/hadoop-0.20.0/hadoop-0.20.0-test.jar | Bin 1440518 -> 0 bytes
lib/hadoop-0.20.0/hadoop-0.20.0-tools.jar | Bin 69804 -> 0 bytes
lib/hadoop-0.20.0/ivy.xml | 261 -
lib/hadoop-0.20.0/ivy/hadoop-core.pom | 257 -
lib/hadoop-0.20.0/ivy/ivy-2.0.0-rc2.jar | Bin 893199 -> 0 bytes
lib/hadoop-0.20.0/ivy/ivysettings.xml | 81 -
lib/hadoop-0.20.0/ivy/libraries.properties | 71 -
lib/hadoop-0.20.0/lib/.DS_Store | Bin 6148 -> 0 bytes
lib/hadoop-0.20.0/lib/commons-cli-2.0-SNAPSHOT.jar | Bin 258337 -> 0 bytes
lib/hadoop-0.20.0/lib/commons-codec-1.3.jar | Bin 46725 -> 0 bytes
lib/hadoop-0.20.0/lib/commons-el-1.0.jar | Bin 112341 -> 0 bytes
lib/hadoop-0.20.0/lib/commons-httpclient-3.0.1.jar | Bin 279781 -> 0 bytes
lib/hadoop-0.20.0/lib/commons-logging-1.0.4.jar | Bin 38015 -> 0 bytes
.../lib/commons-logging-api-1.0.4.jar | Bin 26202 -> 0 bytes
lib/hadoop-0.20.0/lib/commons-net-1.4.1.jar | Bin 180792 -> 0 bytes
lib/hadoop-0.20.0/lib/core-3.1.1.jar | Bin 3566844 -> 0 bytes
lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.LICENSE.txt | 66 -
lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.jar | Bin 706710 -> 0 bytes
lib/hadoop-0.20.0/lib/jasper-compiler-5.5.12.jar | Bin 405086 -> 0 bytes
lib/hadoop-0.20.0/lib/jasper-runtime-5.5.12.jar | Bin 76698 -> 0 bytes
lib/hadoop-0.20.0/lib/jdiff/hadoop_0.17.0.xml | 43272 ---------------
lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.1.xml | 44778 ----------------
lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.2.xml | 38788 --------------
lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.3.xml | 38826 --------------
lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.0.xml | 43972 ----------------
lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.1.xml | 44195 ----------------
lib/hadoop-0.20.0/lib/jdiff/hadoop_0.20.0.xml | 52140 -------------------
lib/hadoop-0.20.0/lib/jets3t-0.6.1.jar | Bin 321806 -> 0 bytes
lib/hadoop-0.20.0/lib/jetty-6.1.14.jar | Bin 516429 -> 0 bytes
lib/hadoop-0.20.0/lib/jetty-util-6.1.14.jar | Bin 163121 -> 0 bytes
lib/hadoop-0.20.0/lib/jsp-2.1/jsp-2.1.jar | Bin 1024681 -> 0 bytes
lib/hadoop-0.20.0/lib/jsp-2.1/jsp-api-2.1.jar | Bin 134910 -> 0 bytes
lib/hadoop-0.20.0/lib/junit-3.8.1.jar | Bin 121070 -> 0 bytes
lib/hadoop-0.20.0/lib/kfs-0.2.2.jar | Bin 11428 -> 0 bytes
lib/hadoop-0.20.0/lib/kfs-0.2.LICENSE.txt | 202 -
lib/hadoop-0.20.0/lib/log4j-1.2.15.jar | Bin 391834 -> 0 bytes
lib/hadoop-0.20.0/lib/native/.DS_Store | Bin 6148 -> 0 bytes
.../lib/native/Linux-amd64-64/libhadoop.a | Bin 101536 -> 0 bytes
.../lib/native/Linux-amd64-64/libhadoop.la | 35 -
.../lib/native/Linux-amd64-64/libhadoop.so | Bin 64941 -> 0 bytes
.../lib/native/Linux-amd64-64/libhadoop.so.1 | Bin 64941 -> 0 bytes
.../lib/native/Linux-amd64-64/libhadoop.so.1.0.0 | Bin 64941 -> 0 bytes
.../lib/native/Linux-i386-32/libhadoop.a | Bin 76446 -> 0 bytes
.../lib/native/Linux-i386-32/libhadoop.la | 35 -
.../lib/native/Linux-i386-32/libhadoop.so | Bin 59620 -> 0 bytes
.../lib/native/Linux-i386-32/libhadoop.so.1 | Bin 59620 -> 0 bytes
.../lib/native/Linux-i386-32/libhadoop.so.1.0.0 | Bin 59620 -> 0 bytes
lib/hadoop-0.20.0/lib/oro-2.0.8.jar | Bin 65261 -> 0 bytes
lib/hadoop-0.20.0/lib/servlet-api-2.5-6.1.14.jar | Bin 132368 -> 0 bytes
lib/hadoop-0.20.0/lib/xmlenc-0.52.jar | Bin 15010 -> 0 bytes
lib/hadoop-0.20.0/librecordio/librecordio.a | Bin 2520838 -> 0 bytes
lib/hadoop-0.20.0/webapps/datanode/WEB-INF/web.xml | 40 -
lib/hadoop-0.20.0/webapps/hdfs/WEB-INF/web.xml | 40 -
lib/hadoop-0.20.0/webapps/hdfs/index.html | 20 -
lib/hadoop-0.20.0/webapps/job/WEB-INF/web.xml | 180 -
lib/hadoop-0.20.0/webapps/job/index.html | 20 -
lib/hadoop-0.20.0/webapps/static/hadoop-logo.jpg | Bin 9443 -> 0 bytes
lib/hadoop-0.20.0/webapps/static/hadoop.css | 134 -
lib/hadoop-0.20.0/webapps/static/jobconf.xsl | 18 -
lib/hadoop-0.20.0/webapps/static/jobtracker.js | 151 -
lib/hadoop-0.20.0/webapps/task/WEB-INF/web.xml | 20 -
lib/hadoop-0.20.0/webapps/task/index.html | 1 -
.../jetty-server-7.1.6.v20100715.jar | Bin 647178 -> 0 bytes
lib/jetty-7.1.6.v20100715/servlet-api-2.5.jar | Bin 105112 -> 0 bytes
lib/jline.jar | Bin 87543 -> 0 bytes
lib/liblzf-3.5/Changes | 125 -
lib/liblzf-3.5/LICENSE | 27 -
lib/liblzf-3.5/Makefile | 66 -
lib/liblzf-3.5/Makefile.in | 66 -
lib/liblzf-3.5/README | 29 -
lib/liblzf-3.5/config.h | 17 -
lib/liblzf-3.5/config.h.in | 16 -
lib/liblzf-3.5/config.log | 515 -
lib/liblzf-3.5/config.status | 826 -
lib/liblzf-3.5/configure | 7871 ---
lib/liblzf-3.5/configure.ac | 25 -
lib/liblzf-3.5/crc32.h | 65 -
lib/liblzf-3.5/cs/CLZF.cs | 344 -
lib/liblzf-3.5/cs/README | 7 -
lib/liblzf-3.5/install-sh | 251 -
lib/liblzf-3.5/lzf.c | 537 -
lib/liblzf-3.5/lzf.h | 100 -
lib/liblzf-3.5/lzfP.h | 159 -
lib/liblzf-3.5/lzf_c.c | 296 -
lib/liblzf-3.5/lzf_d.c | 148 -
lib/mesos.jar | Bin 33618 -> 0 bytes
lib/scalacheck_2.8.0-1.7.jar | Bin 745883 -> 0 bytes
lib/scalatest-1.2/LICENSE | 202 -
lib/scalatest-1.2/NOTICE | 7 -
lib/scalatest-1.2/README.txt | 58 -
lib/scalatest-1.2/scalatest-1.2.jar | Bin 1784096 -> 0 bytes
lib/slf4j-1.6.1/slf4j-api-1.6.1.jar | Bin 25496 -> 0 bytes
lib/slf4j-1.6.1/slf4j-log4j12-1.6.1.jar | Bin 9753 -> 0 bytes
project/build/SparkProject.scala | 70 +-
project/plugins/project/build.properties | 2 +-
run | 34 +-
src/examples/BroadcastTest.scala | 28 -
src/examples/CpuHog.scala | 24 -
src/examples/HdfsTest.scala | 16 -
src/examples/LocalALS.scala | 119 -
src/examples/LocalFileLR.scala | 36 -
src/examples/LocalLR.scala | 41 -
src/examples/LocalPi.scala | 15 -
src/examples/SleepJob.scala | 19 -
src/examples/SparkALS.scala | 139 -
src/examples/SparkHdfsLR.scala | 51 -
src/examples/SparkLR.scala | 49 -
src/examples/SparkPi.scala | 21 -
src/examples/Vector.scala | 63 -
src/main/java/spark/compress/lzf/LZF.java | 27 -
.../java/spark/compress/lzf/LZFInputStream.java | 180 -
.../java/spark/compress/lzf/LZFOutputStream.java | 85 -
src/main/native/.gitignore | 3 -
src/main/native/Makefile | 40 -
src/main/native/spark_compress_lzf_LZF.c | 90 -
src/main/scala/spark/Accumulators.scala | 73 -
src/main/scala/spark/BoundedMemoryCache.scala | 69 -
src/main/scala/spark/Broadcast.scala | 799 -
src/main/scala/spark/Cache.scala | 63 -
src/main/scala/spark/ClosureCleaner.scala | 159 -
src/main/scala/spark/DfsShuffle.scala | 120 -
src/main/scala/spark/Executor.scala | 116 -
src/main/scala/spark/HadoopFile.scala | 118 -
src/main/scala/spark/HttpServer.scala | 67 -
src/main/scala/spark/Job.scala | 18 -
src/main/scala/spark/LocalFileShuffle.scala | 171 -
src/main/scala/spark/LocalScheduler.scala | 72 -
src/main/scala/spark/Logging.scala | 49 -
src/main/scala/spark/MesosScheduler.scala | 294 -
src/main/scala/spark/NumberedSplitRDD.scala | 42 -
src/main/scala/spark/ParallelArray.scala | 76 -
src/main/scala/spark/RDD.scala | 418 -
src/main/scala/spark/Scheduler.scala | 10 -
src/main/scala/spark/SerializableWritable.scala | 26 -
src/main/scala/spark/Shuffle.scala | 15 -
src/main/scala/spark/SimpleJob.scala | 272 -
src/main/scala/spark/SizeEstimator.scala | 160 -
src/main/scala/spark/SoftReferenceCache.scala | 13 -
src/main/scala/spark/SparkContext.scala | 175 -
src/main/scala/spark/SparkException.scala | 3 -
src/main/scala/spark/Split.scala | 13 -
src/main/scala/spark/Task.scala | 16 -
src/main/scala/spark/TaskResult.scala | 9 -
src/main/scala/spark/Utils.scala | 127 -
src/main/scala/spark/WeakReferenceCache.scala | 14 -
.../scala/spark/repl/ExecutorClassLoader.scala | 108 -
src/main/scala/spark/repl/Main.scala | 16 -
src/main/scala/spark/repl/SparkCompletion.scala | 353 -
.../scala/spark/repl/SparkCompletionOutput.scala | 92 -
.../scala/spark/repl/SparkInteractiveReader.scala | 60 -
src/main/scala/spark/repl/SparkInterpreter.scala | 1395 -
.../scala/spark/repl/SparkInterpreterLoop.scala | 659 -
.../spark/repl/SparkInterpreterSettings.scala | 112 -
src/main/scala/spark/repl/SparkJLineReader.scala | 38 -
src/main/scala/spark/repl/SparkSimpleReader.scala | 33 -
src/test/scala/spark/ParallelArraySplitSuite.scala | 161 -
src/test/scala/spark/ShuffleSuite.scala | 130 -
src/test/scala/spark/repl/ReplSuite.scala | 142 -
521 files changed, 342214 insertions(+), 342204 deletions(-)
create mode 100644 core/lib/apache-log4j-1.2.16/log4j-1.2.16.jar
create mode 100644 core/lib/asm-3.2/.DS_Store
create mode 100644 core/lib/asm-3.2/lib/all/README.txt
create mode 100644 core/lib/asm-3.2/lib/all/asm-all-3.2.jar
create mode 100644 core/lib/asm-3.2/lib/all/asm-all-3.2.pom
create mode 100644 core/lib/asm-3.2/lib/all/asm-debug-all-3.2.jar
create mode 100644 core/lib/asm-3.2/lib/all/asm-debug-all-3.2.pom
create mode 100644 core/lib/asm-3.2/lib/asm-3.2.jar
create mode 100644 core/lib/asm-3.2/lib/asm-3.2.pom
create mode 100644 core/lib/asm-3.2/lib/asm-analysis-3.2.jar
create mode 100644 core/lib/asm-3.2/lib/asm-analysis-3.2.pom
create mode 100644 core/lib/asm-3.2/lib/asm-commons-3.2.jar
create mode 100644 core/lib/asm-3.2/lib/asm-commons-3.2.pom
create mode 100644 core/lib/asm-3.2/lib/asm-parent-3.2.pom
create mode 100644 core/lib/asm-3.2/lib/asm-tree-3.2.jar
create mode 100644 core/lib/asm-3.2/lib/asm-tree-3.2.pom
create mode 100644 core/lib/asm-3.2/lib/asm-util-3.2.jar
create mode 100644 core/lib/asm-3.2/lib/asm-util-3.2.pom
create mode 100644 core/lib/asm-3.2/lib/asm-xml-3.2.jar
create mode 100644 core/lib/asm-3.2/lib/asm-xml-3.2.pom
create mode 100644 core/lib/colt.jar
create mode 100644 core/lib/guava-r07/COPYING
create mode 100644 core/lib/guava-r07/README
create mode 100644 core/lib/guava-r07/guava-r07.jar
create mode 100644 core/lib/hadoop-0.20.0/.DS_Store
create mode 100644 core/lib/hadoop-0.20.0/CHANGES.txt
create mode 100644 core/lib/hadoop-0.20.0/LICENSE.txt
create mode 100644 core/lib/hadoop-0.20.0/NOTICE.txt
create mode 100644 core/lib/hadoop-0.20.0/README.txt
create mode 100755 core/lib/hadoop-0.20.0/bin/hadoop
create mode 100755 core/lib/hadoop-0.20.0/bin/hadoop-config.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/hadoop-daemon.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/hadoop-daemons.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/rcc
create mode 100755 core/lib/hadoop-0.20.0/bin/slaves.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/start-all.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/start-balancer.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/start-dfs.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/start-mapred.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/stop-all.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/stop-balancer.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/stop-dfs.sh
create mode 100755 core/lib/hadoop-0.20.0/bin/stop-mapred.sh
create mode 100644 core/lib/hadoop-0.20.0/build.xml
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/Pipes.hh
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/SerialUtils.hh
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/StringUtils.hh
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/TemplateFactory.hh
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-amd64-64/lib/libhadooppipes.a
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-amd64-64/lib/libhadooputils.a
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/Pipes.hh
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/SerialUtils.hh
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/StringUtils.hh
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/TemplateFactory.hh
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhadooppipes.a
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhadooputils.a
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.la
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so.0
create mode 100644 core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so.0.0.0
create mode 100644 core/lib/hadoop-0.20.0/conf/capacity-scheduler.xml
create mode 100644 core/lib/hadoop-0.20.0/conf/configuration.xsl
create mode 100644 core/lib/hadoop-0.20.0/conf/core-site.xml
create mode 100644 core/lib/hadoop-0.20.0/conf/hadoop-env.sh
create mode 100644 core/lib/hadoop-0.20.0/conf/hadoop-metrics.properties
create mode 100644 core/lib/hadoop-0.20.0/conf/hadoop-policy.xml
create mode 100644 core/lib/hadoop-0.20.0/conf/hdfs-site.xml
create mode 100644 core/lib/hadoop-0.20.0/conf/log4j.properties
create mode 100644 core/lib/hadoop-0.20.0/conf/mapred-site.xml
create mode 100644 core/lib/hadoop-0.20.0/conf/masters
create mode 100644 core/lib/hadoop-0.20.0/conf/slaves
create mode 100644 core/lib/hadoop-0.20.0/conf/ssl-client.xml.example
create mode 100644 core/lib/hadoop-0.20.0/conf/ssl-server.xml.example
create mode 100644 core/lib/hadoop-0.20.0/contrib/capacity-scheduler/hadoop-0.20.0-capacity-scheduler.jar
create mode 100644 core/lib/hadoop-0.20.0/contrib/datajoin/hadoop-0.20.0-datajoin.jar
create mode 100644 core/lib/hadoop-0.20.0/contrib/eclipse-plugin/hadoop-0.20.0-eclipse-plugin.jar
create mode 100644 core/lib/hadoop-0.20.0/contrib/fairscheduler/hadoop-0.20.0-fairscheduler.jar
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/README
create mode 100755 core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy
create mode 100755 core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-config.sh
create mode 100755 core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-daemon.sh
create mode 100755 core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-daemons.sh
create mode 100755 core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-slaves.sh
create mode 100755 core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/start-hdfsproxy.sh
create mode 100755 core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/stop-hdfsproxy.sh
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/build.xml
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/configuration.xsl
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-default.xml
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-env.sh
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-env.sh.template
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-hosts
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/log4j.properties
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/user-certs.xml
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/user-permissions.xml
create mode 100644 core/lib/hadoop-0.20.0/contrib/hdfsproxy/hdfsproxy-1.0.jar
create mode 100644 core/lib/hadoop-0.20.0/contrib/index/hadoop-0.20.0-index.jar
create mode 100644 core/lib/hadoop-0.20.0/contrib/streaming/hadoop-0.20.0-streaming.jar
create mode 100644 core/lib/hadoop-0.20.0/contrib/thriftfs/hadoop-0.20.0-thriftfs.jar
create mode 100755 core/lib/hadoop-0.20.0/contrib/vaidya/bin/vaidya.sh
create mode 100644 core/lib/hadoop-0.20.0/contrib/vaidya/conf/postex_diagnosis_tests.xml
create mode 100644 core/lib/hadoop-0.20.0/contrib/vaidya/hadoop-0.20.0-vaidya.jar
create mode 100644 core/lib/hadoop-0.20.0/hadoop-0.20.0-ant.jar
create mode 100644 core/lib/hadoop-0.20.0/hadoop-0.20.0-core.jar
create mode 100644 core/lib/hadoop-0.20.0/hadoop-0.20.0-examples.jar
create mode 100644 core/lib/hadoop-0.20.0/hadoop-0.20.0-test.jar
create mode 100644 core/lib/hadoop-0.20.0/hadoop-0.20.0-tools.jar
create mode 100644 core/lib/hadoop-0.20.0/ivy.xml
create mode 100644 core/lib/hadoop-0.20.0/ivy/hadoop-core.pom
create mode 100644 core/lib/hadoop-0.20.0/ivy/ivy-2.0.0-rc2.jar
create mode 100644 core/lib/hadoop-0.20.0/ivy/ivysettings.xml
create mode 100644 core/lib/hadoop-0.20.0/ivy/libraries.properties
create mode 100644 core/lib/hadoop-0.20.0/lib/.DS_Store
create mode 100644 core/lib/hadoop-0.20.0/lib/commons-cli-2.0-SNAPSHOT.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/commons-codec-1.3.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/commons-el-1.0.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/commons-httpclient-3.0.1.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/commons-logging-1.0.4.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/commons-logging-api-1.0.4.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/commons-net-1.4.1.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/core-3.1.1.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.LICENSE.txt
create mode 100644 core/lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/jasper-compiler-5.5.12.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/jasper-runtime-5.5.12.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.17.0.xml
create mode 100644 core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.1.xml
create mode 100644 core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.2.xml
create mode 100644 core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.3.xml
create mode 100644 core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.0.xml
create mode 100644 core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.1.xml
create mode 100644 core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.20.0.xml
create mode 100644 core/lib/hadoop-0.20.0/lib/jets3t-0.6.1.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/jetty-6.1.14.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/jetty-util-6.1.14.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/jsp-2.1/jsp-2.1.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/jsp-2.1/jsp-api-2.1.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/junit-3.8.1.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/kfs-0.2.2.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/kfs-0.2.LICENSE.txt
create mode 100644 core/lib/hadoop-0.20.0/lib/log4j-1.2.15.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/native/.DS_Store
create mode 100644 core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.a
create mode 100644 core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.la
create mode 100644 core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so
create mode 100644 core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so.1
create mode 100644 core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so.1.0.0
create mode 100644 core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.a
create mode 100644 core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.la
create mode 100644 core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so
create mode 100644 core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so.1
create mode 100644 core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so.1.0.0
create mode 100644 core/lib/hadoop-0.20.0/lib/oro-2.0.8.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/servlet-api-2.5-6.1.14.jar
create mode 100644 core/lib/hadoop-0.20.0/lib/xmlenc-0.52.jar
create mode 100644 core/lib/hadoop-0.20.0/librecordio/librecordio.a
create mode 100644 core/lib/hadoop-0.20.0/webapps/datanode/WEB-INF/web.xml
create mode 100644 core/lib/hadoop-0.20.0/webapps/hdfs/WEB-INF/web.xml
create mode 100644 core/lib/hadoop-0.20.0/webapps/hdfs/index.html
create mode 100644 core/lib/hadoop-0.20.0/webapps/job/WEB-INF/web.xml
create mode 100644 core/lib/hadoop-0.20.0/webapps/job/index.html
create mode 100644 core/lib/hadoop-0.20.0/webapps/static/hadoop-logo.jpg
create mode 100644 core/lib/hadoop-0.20.0/webapps/static/hadoop.css
create mode 100644 core/lib/hadoop-0.20.0/webapps/static/jobconf.xsl
create mode 100644 core/lib/hadoop-0.20.0/webapps/static/jobtracker.js
create mode 100644 core/lib/hadoop-0.20.0/webapps/task/WEB-INF/web.xml
create mode 100644 core/lib/hadoop-0.20.0/webapps/task/index.html
create mode 100644 core/lib/jetty-7.1.6.v20100715/jetty-server-7.1.6.v20100715.jar
create mode 100644 core/lib/jetty-7.1.6.v20100715/servlet-api-2.5.jar
create mode 100644 core/lib/jline.jar
create mode 100644 core/lib/liblzf-3.5/Changes
create mode 100644 core/lib/liblzf-3.5/LICENSE
create mode 100644 core/lib/liblzf-3.5/Makefile
create mode 100644 core/lib/liblzf-3.5/Makefile.in
create mode 100644 core/lib/liblzf-3.5/README
create mode 100644 core/lib/liblzf-3.5/config.h
create mode 100644 core/lib/liblzf-3.5/config.h.in
create mode 100644 core/lib/liblzf-3.5/config.log
create mode 100755 core/lib/liblzf-3.5/config.status
create mode 100755 core/lib/liblzf-3.5/configure
create mode 100644 core/lib/liblzf-3.5/configure.ac
create mode 100644 core/lib/liblzf-3.5/crc32.h
create mode 100644 core/lib/liblzf-3.5/cs/CLZF.cs
create mode 100644 core/lib/liblzf-3.5/cs/README
create mode 100755 core/lib/liblzf-3.5/install-sh
create mode 100644 core/lib/liblzf-3.5/lzf.c
create mode 100644 core/lib/liblzf-3.5/lzf.h
create mode 100644 core/lib/liblzf-3.5/lzfP.h
create mode 100644 core/lib/liblzf-3.5/lzf_c.c
create mode 100644 core/lib/liblzf-3.5/lzf_d.c
create mode 100644 core/lib/mesos.jar
create mode 100644 core/lib/scalacheck_2.8.0-1.7.jar
create mode 100644 core/lib/scalatest-1.2/LICENSE
create mode 100644 core/lib/scalatest-1.2/NOTICE
create mode 100644 core/lib/scalatest-1.2/README.txt
create mode 100644 core/lib/scalatest-1.2/scalatest-1.2.jar
create mode 100644 core/lib/slf4j-1.6.1/slf4j-api-1.6.1.jar
create mode 100644 core/lib/slf4j-1.6.1/slf4j-log4j12-1.6.1.jar
create mode 100644 core/src/main/java/spark/compress/lzf/LZF.java
create mode 100644 core/src/main/java/spark/compress/lzf/LZFInputStream.java
create mode 100644 core/src/main/java/spark/compress/lzf/LZFOutputStream.java
create mode 100644 core/src/main/native/.gitignore
create mode 100644 core/src/main/native/Makefile
create mode 100644 core/src/main/native/spark_compress_lzf_LZF.c
create mode 100644 core/src/main/scala/spark/Accumulators.scala
create mode 100644 core/src/main/scala/spark/BoundedMemoryCache.scala
create mode 100644 core/src/main/scala/spark/Broadcast.scala
create mode 100644 core/src/main/scala/spark/Cache.scala
create mode 100644 core/src/main/scala/spark/ClosureCleaner.scala
create mode 100644 core/src/main/scala/spark/DfsShuffle.scala
create mode 100644 core/src/main/scala/spark/Executor.scala
create mode 100644 core/src/main/scala/spark/HadoopFile.scala
create mode 100644 core/src/main/scala/spark/HttpServer.scala
create mode 100644 core/src/main/scala/spark/Job.scala
create mode 100644 core/src/main/scala/spark/LocalFileShuffle.scala
create mode 100644 core/src/main/scala/spark/LocalScheduler.scala
create mode 100644 core/src/main/scala/spark/Logging.scala
create mode 100644 core/src/main/scala/spark/MesosScheduler.scala
create mode 100644 core/src/main/scala/spark/NumberedSplitRDD.scala
create mode 100644 core/src/main/scala/spark/ParallelArray.scala
create mode 100644 core/src/main/scala/spark/RDD.scala
create mode 100644 core/src/main/scala/spark/Scheduler.scala
create mode 100644 core/src/main/scala/spark/SerializableWritable.scala
create mode 100644 core/src/main/scala/spark/Shuffle.scala
create mode 100644 core/src/main/scala/spark/SimpleJob.scala
create mode 100644 core/src/main/scala/spark/SizeEstimator.scala
create mode 100644 core/src/main/scala/spark/SoftReferenceCache.scala
create mode 100644 core/src/main/scala/spark/SparkContext.scala
create mode 100644 core/src/main/scala/spark/SparkException.scala
create mode 100644 core/src/main/scala/spark/Split.scala
create mode 100644 core/src/main/scala/spark/Task.scala
create mode 100644 core/src/main/scala/spark/TaskResult.scala
create mode 100644 core/src/main/scala/spark/Utils.scala
create mode 100644 core/src/main/scala/spark/WeakReferenceCache.scala
create mode 100644 core/src/main/scala/spark/repl/ExecutorClassLoader.scala
create mode 100644 core/src/main/scala/spark/repl/Main.scala
create mode 100644 core/src/main/scala/spark/repl/SparkCompletion.scala
create mode 100644 core/src/main/scala/spark/repl/SparkCompletionOutput.scala
create mode 100644 core/src/main/scala/spark/repl/SparkInteractiveReader.scala
create mode 100644 core/src/main/scala/spark/repl/SparkInterpreter.scala
create mode 100644 core/src/main/scala/spark/repl/SparkInterpreterLoop.scala
create mode 100644 core/src/main/scala/spark/repl/SparkInterpreterSettings.scala
create mode 100644 core/src/main/scala/spark/repl/SparkJLineReader.scala
create mode 100644 core/src/main/scala/spark/repl/SparkSimpleReader.scala
create mode 100644 core/src/test/scala/spark/ParallelArraySplitSuite.scala
create mode 100644 core/src/test/scala/spark/ShuffleSuite.scala
create mode 100644 core/src/test/scala/spark/repl/ReplSuite.scala
create mode 100644 examples/src/main/scala/BroadcastTest.scala
create mode 100644 examples/src/main/scala/CpuHog.scala
create mode 100644 examples/src/main/scala/HdfsTest.scala
create mode 100644 examples/src/main/scala/LocalALS.scala
create mode 100644 examples/src/main/scala/LocalFileLR.scala
create mode 100644 examples/src/main/scala/LocalLR.scala
create mode 100644 examples/src/main/scala/LocalPi.scala
create mode 100644 examples/src/main/scala/SleepJob.scala
create mode 100644 examples/src/main/scala/SparkALS.scala
create mode 100644 examples/src/main/scala/SparkHdfsLR.scala
create mode 100644 examples/src/main/scala/SparkLR.scala
create mode 100644 examples/src/main/scala/SparkPi.scala
create mode 100644 examples/src/main/scala/Vector.scala
delete mode 100644 lib/apache-log4j-1.2.16/log4j-1.2.16.jar
delete mode 100644 lib/asm-3.2/.DS_Store
delete mode 100644 lib/asm-3.2/lib/all/README.txt
delete mode 100644 lib/asm-3.2/lib/all/asm-all-3.2.jar
delete mode 100644 lib/asm-3.2/lib/all/asm-all-3.2.pom
delete mode 100644 lib/asm-3.2/lib/all/asm-debug-all-3.2.jar
delete mode 100644 lib/asm-3.2/lib/all/asm-debug-all-3.2.pom
delete mode 100644 lib/asm-3.2/lib/asm-3.2.jar
delete mode 100644 lib/asm-3.2/lib/asm-3.2.pom
delete mode 100644 lib/asm-3.2/lib/asm-analysis-3.2.jar
delete mode 100644 lib/asm-3.2/lib/asm-analysis-3.2.pom
delete mode 100644 lib/asm-3.2/lib/asm-commons-3.2.jar
delete mode 100644 lib/asm-3.2/lib/asm-commons-3.2.pom
delete mode 100644 lib/asm-3.2/lib/asm-parent-3.2.pom
delete mode 100644 lib/asm-3.2/lib/asm-tree-3.2.jar
delete mode 100644 lib/asm-3.2/lib/asm-tree-3.2.pom
delete mode 100644 lib/asm-3.2/lib/asm-util-3.2.jar
delete mode 100644 lib/asm-3.2/lib/asm-util-3.2.pom
delete mode 100644 lib/asm-3.2/lib/asm-xml-3.2.jar
delete mode 100644 lib/asm-3.2/lib/asm-xml-3.2.pom
delete mode 100644 lib/colt.jar
delete mode 100644 lib/guava-r07/COPYING
delete mode 100644 lib/guava-r07/README
delete mode 100644 lib/guava-r07/guava-r07.jar
delete mode 100644 lib/hadoop-0.20.0/.DS_Store
delete mode 100644 lib/hadoop-0.20.0/CHANGES.txt
delete mode 100644 lib/hadoop-0.20.0/LICENSE.txt
delete mode 100644 lib/hadoop-0.20.0/NOTICE.txt
delete mode 100644 lib/hadoop-0.20.0/README.txt
delete mode 100755 lib/hadoop-0.20.0/bin/hadoop
delete mode 100755 lib/hadoop-0.20.0/bin/hadoop-config.sh
delete mode 100755 lib/hadoop-0.20.0/bin/hadoop-daemon.sh
delete mode 100755 lib/hadoop-0.20.0/bin/hadoop-daemons.sh
delete mode 100755 lib/hadoop-0.20.0/bin/rcc
delete mode 100755 lib/hadoop-0.20.0/bin/slaves.sh
delete mode 100755 lib/hadoop-0.20.0/bin/start-all.sh
delete mode 100755 lib/hadoop-0.20.0/bin/start-balancer.sh
delete mode 100755 lib/hadoop-0.20.0/bin/start-dfs.sh
delete mode 100755 lib/hadoop-0.20.0/bin/start-mapred.sh
delete mode 100755 lib/hadoop-0.20.0/bin/stop-all.sh
delete mode 100755 lib/hadoop-0.20.0/bin/stop-balancer.sh
delete mode 100755 lib/hadoop-0.20.0/bin/stop-dfs.sh
delete mode 100755 lib/hadoop-0.20.0/bin/stop-mapred.sh
delete mode 100644 lib/hadoop-0.20.0/build.xml
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/Pipes.hh
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/SerialUtils.hh
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/StringUtils.hh
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/TemplateFactory.hh
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-amd64-64/lib/libhadooppipes.a
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-amd64-64/lib/libhadooputils.a
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/Pipes.hh
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/SerialUtils.hh
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/StringUtils.hh
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/TemplateFactory.hh
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhadooppipes.a
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhadooputils.a
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.la
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so.0
delete mode 100644 lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so.0.0.0
delete mode 100644 lib/hadoop-0.20.0/conf/capacity-scheduler.xml
delete mode 100644 lib/hadoop-0.20.0/conf/configuration.xsl
delete mode 100644 lib/hadoop-0.20.0/conf/core-site.xml
delete mode 100644 lib/hadoop-0.20.0/conf/hadoop-env.sh
delete mode 100644 lib/hadoop-0.20.0/conf/hadoop-metrics.properties
delete mode 100644 lib/hadoop-0.20.0/conf/hadoop-policy.xml
delete mode 100644 lib/hadoop-0.20.0/conf/hdfs-site.xml
delete mode 100644 lib/hadoop-0.20.0/conf/log4j.properties
delete mode 100644 lib/hadoop-0.20.0/conf/mapred-site.xml
delete mode 100644 lib/hadoop-0.20.0/conf/masters
delete mode 100644 lib/hadoop-0.20.0/conf/slaves
delete mode 100644 lib/hadoop-0.20.0/conf/ssl-client.xml.example
delete mode 100644 lib/hadoop-0.20.0/conf/ssl-server.xml.example
delete mode 100644 lib/hadoop-0.20.0/contrib/capacity-scheduler/hadoop-0.20.0-capacity-scheduler.jar
delete mode 100644 lib/hadoop-0.20.0/contrib/datajoin/hadoop-0.20.0-datajoin.jar
delete mode 100644 lib/hadoop-0.20.0/contrib/eclipse-plugin/hadoop-0.20.0-eclipse-plugin.jar
delete mode 100644 lib/hadoop-0.20.0/contrib/fairscheduler/hadoop-0.20.0-fairscheduler.jar
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/README
delete mode 100755 lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy
delete mode 100755 lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-config.sh
delete mode 100755 lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-daemon.sh
delete mode 100755 lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-daemons.sh
delete mode 100755 lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-slaves.sh
delete mode 100755 lib/hadoop-0.20.0/contrib/hdfsproxy/bin/start-hdfsproxy.sh
delete mode 100755 lib/hadoop-0.20.0/contrib/hdfsproxy/bin/stop-hdfsproxy.sh
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/build.xml
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/conf/configuration.xsl
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-default.xml
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-env.sh
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-env.sh.template
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-hosts
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/conf/log4j.properties
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/conf/user-certs.xml
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/conf/user-permissions.xml
delete mode 100644 lib/hadoop-0.20.0/contrib/hdfsproxy/hdfsproxy-1.0.jar
delete mode 100644 lib/hadoop-0.20.0/contrib/index/hadoop-0.20.0-index.jar
delete mode 100644 lib/hadoop-0.20.0/contrib/streaming/hadoop-0.20.0-streaming.jar
delete mode 100644 lib/hadoop-0.20.0/contrib/thriftfs/hadoop-0.20.0-thriftfs.jar
delete mode 100755 lib/hadoop-0.20.0/contrib/vaidya/bin/vaidya.sh
delete mode 100644 lib/hadoop-0.20.0/contrib/vaidya/conf/postex_diagnosis_tests.xml
delete mode 100644 lib/hadoop-0.20.0/contrib/vaidya/hadoop-0.20.0-vaidya.jar
delete mode 100644 lib/hadoop-0.20.0/hadoop-0.20.0-ant.jar
delete mode 100644 lib/hadoop-0.20.0/hadoop-0.20.0-core.jar
delete mode 100644 lib/hadoop-0.20.0/hadoop-0.20.0-examples.jar
delete mode 100644 lib/hadoop-0.20.0/hadoop-0.20.0-test.jar
delete mode 100644 lib/hadoop-0.20.0/hadoop-0.20.0-tools.jar
delete mode 100644 lib/hadoop-0.20.0/ivy.xml
delete mode 100644 lib/hadoop-0.20.0/ivy/hadoop-core.pom
delete mode 100644 lib/hadoop-0.20.0/ivy/ivy-2.0.0-rc2.jar
delete mode 100644 lib/hadoop-0.20.0/ivy/ivysettings.xml
delete mode 100644 lib/hadoop-0.20.0/ivy/libraries.properties
delete mode 100644 lib/hadoop-0.20.0/lib/.DS_Store
delete mode 100644 lib/hadoop-0.20.0/lib/commons-cli-2.0-SNAPSHOT.jar
delete mode 100644 lib/hadoop-0.20.0/lib/commons-codec-1.3.jar
delete mode 100644 lib/hadoop-0.20.0/lib/commons-el-1.0.jar
delete mode 100644 lib/hadoop-0.20.0/lib/commons-httpclient-3.0.1.jar
delete mode 100644 lib/hadoop-0.20.0/lib/commons-logging-1.0.4.jar
delete mode 100644 lib/hadoop-0.20.0/lib/commons-logging-api-1.0.4.jar
delete mode 100644 lib/hadoop-0.20.0/lib/commons-net-1.4.1.jar
delete mode 100644 lib/hadoop-0.20.0/lib/core-3.1.1.jar
delete mode 100644 lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.LICENSE.txt
delete mode 100644 lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.jar
delete mode 100644 lib/hadoop-0.20.0/lib/jasper-compiler-5.5.12.jar
delete mode 100644 lib/hadoop-0.20.0/lib/jasper-runtime-5.5.12.jar
delete mode 100644 lib/hadoop-0.20.0/lib/jdiff/hadoop_0.17.0.xml
delete mode 100644 lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.1.xml
delete mode 100644 lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.2.xml
delete mode 100644 lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.3.xml
delete mode 100644 lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.0.xml
delete mode 100644 lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.1.xml
delete mode 100644 lib/hadoop-0.20.0/lib/jdiff/hadoop_0.20.0.xml
delete mode 100644 lib/hadoop-0.20.0/lib/jets3t-0.6.1.jar
delete mode 100644 lib/hadoop-0.20.0/lib/jetty-6.1.14.jar
delete mode 100644 lib/hadoop-0.20.0/lib/jetty-util-6.1.14.jar
delete mode 100644 lib/hadoop-0.20.0/lib/jsp-2.1/jsp-2.1.jar
delete mode 100644 lib/hadoop-0.20.0/lib/jsp-2.1/jsp-api-2.1.jar
delete mode 100644 lib/hadoop-0.20.0/lib/junit-3.8.1.jar
delete mode 100644 lib/hadoop-0.20.0/lib/kfs-0.2.2.jar
delete mode 100644 lib/hadoop-0.20.0/lib/kfs-0.2.LICENSE.txt
delete mode 100644 lib/hadoop-0.20.0/lib/log4j-1.2.15.jar
delete mode 100644 lib/hadoop-0.20.0/lib/native/.DS_Store
delete mode 100644 lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.a
delete mode 100644 lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.la
delete mode 100644 lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so
delete mode 100644 lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so.1
delete mode 100644 lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so.1.0.0
delete mode 100644 lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.a
delete mode 100644 lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.la
delete mode 100644 lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so
delete mode 100644 lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so.1
delete mode 100644 lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so.1.0.0
delete mode 100644 lib/hadoop-0.20.0/lib/oro-2.0.8.jar
delete mode 100644 lib/hadoop-0.20.0/lib/servlet-api-2.5-6.1.14.jar
delete mode 100644 lib/hadoop-0.20.0/lib/xmlenc-0.52.jar
delete mode 100644 lib/hadoop-0.20.0/librecordio/librecordio.a
delete mode 100644 lib/hadoop-0.20.0/webapps/datanode/WEB-INF/web.xml
delete mode 100644 lib/hadoop-0.20.0/webapps/hdfs/WEB-INF/web.xml
delete mode 100644 lib/hadoop-0.20.0/webapps/hdfs/index.html
delete mode 100644 lib/hadoop-0.20.0/webapps/job/WEB-INF/web.xml
delete mode 100644 lib/hadoop-0.20.0/webapps/job/index.html
delete mode 100644 lib/hadoop-0.20.0/webapps/static/hadoop-logo.jpg
delete mode 100644 lib/hadoop-0.20.0/webapps/static/hadoop.css
delete mode 100644 lib/hadoop-0.20.0/webapps/static/jobconf.xsl
delete mode 100644 lib/hadoop-0.20.0/webapps/static/jobtracker.js
delete mode 100644 lib/hadoop-0.20.0/webapps/task/WEB-INF/web.xml
delete mode 100644 lib/hadoop-0.20.0/webapps/task/index.html
delete mode 100644 lib/jetty-7.1.6.v20100715/jetty-server-7.1.6.v20100715.jar
delete mode 100644 lib/jetty-7.1.6.v20100715/servlet-api-2.5.jar
delete mode 100644 lib/jline.jar
delete mode 100644 lib/liblzf-3.5/Changes
delete mode 100644 lib/liblzf-3.5/LICENSE
delete mode 100644 lib/liblzf-3.5/Makefile
delete mode 100644 lib/liblzf-3.5/Makefile.in
delete mode 100644 lib/liblzf-3.5/README
delete mode 100644 lib/liblzf-3.5/config.h
delete mode 100644 lib/liblzf-3.5/config.h.in
delete mode 100644 lib/liblzf-3.5/config.log
delete mode 100755 lib/liblzf-3.5/config.status
delete mode 100755 lib/liblzf-3.5/configure
delete mode 100644 lib/liblzf-3.5/configure.ac
delete mode 100644 lib/liblzf-3.5/crc32.h
delete mode 100644 lib/liblzf-3.5/cs/CLZF.cs
delete mode 100644 lib/liblzf-3.5/cs/README
delete mode 100755 lib/liblzf-3.5/install-sh
delete mode 100644 lib/liblzf-3.5/lzf.c
delete mode 100644 lib/liblzf-3.5/lzf.h
delete mode 100644 lib/liblzf-3.5/lzfP.h
delete mode 100644 lib/liblzf-3.5/lzf_c.c
delete mode 100644 lib/liblzf-3.5/lzf_d.c
delete mode 100644 lib/mesos.jar
delete mode 100644 lib/scalacheck_2.8.0-1.7.jar
delete mode 100644 lib/scalatest-1.2/LICENSE
delete mode 100644 lib/scalatest-1.2/NOTICE
delete mode 100644 lib/scalatest-1.2/README.txt
delete mode 100644 lib/scalatest-1.2/scalatest-1.2.jar
delete mode 100644 lib/slf4j-1.6.1/slf4j-api-1.6.1.jar
delete mode 100644 lib/slf4j-1.6.1/slf4j-log4j12-1.6.1.jar
delete mode 100644 src/examples/BroadcastTest.scala
delete mode 100644 src/examples/CpuHog.scala
delete mode 100644 src/examples/HdfsTest.scala
delete mode 100644 src/examples/LocalALS.scala
delete mode 100644 src/examples/LocalFileLR.scala
delete mode 100644 src/examples/LocalLR.scala
delete mode 100644 src/examples/LocalPi.scala
delete mode 100644 src/examples/SleepJob.scala
delete mode 100644 src/examples/SparkALS.scala
delete mode 100644 src/examples/SparkHdfsLR.scala
delete mode 100644 src/examples/SparkLR.scala
delete mode 100644 src/examples/SparkPi.scala
delete mode 100644 src/examples/Vector.scala
delete mode 100644 src/main/java/spark/compress/lzf/LZF.java
delete mode 100644 src/main/java/spark/compress/lzf/LZFInputStream.java
delete mode 100644 src/main/java/spark/compress/lzf/LZFOutputStream.java
delete mode 100644 src/main/native/.gitignore
delete mode 100644 src/main/native/Makefile
delete mode 100644 src/main/native/spark_compress_lzf_LZF.c
delete mode 100644 src/main/scala/spark/Accumulators.scala
delete mode 100644 src/main/scala/spark/BoundedMemoryCache.scala
delete mode 100644 src/main/scala/spark/Broadcast.scala
delete mode 100644 src/main/scala/spark/Cache.scala
delete mode 100644 src/main/scala/spark/ClosureCleaner.scala
delete mode 100644 src/main/scala/spark/DfsShuffle.scala
delete mode 100644 src/main/scala/spark/Executor.scala
delete mode 100644 src/main/scala/spark/HadoopFile.scala
delete mode 100644 src/main/scala/spark/HttpServer.scala
delete mode 100644 src/main/scala/spark/Job.scala
delete mode 100644 src/main/scala/spark/LocalFileShuffle.scala
delete mode 100644 src/main/scala/spark/LocalScheduler.scala
delete mode 100644 src/main/scala/spark/Logging.scala
delete mode 100644 src/main/scala/spark/MesosScheduler.scala
delete mode 100644 src/main/scala/spark/NumberedSplitRDD.scala
delete mode 100644 src/main/scala/spark/ParallelArray.scala
delete mode 100644 src/main/scala/spark/RDD.scala
delete mode 100644 src/main/scala/spark/Scheduler.scala
delete mode 100644 src/main/scala/spark/SerializableWritable.scala
delete mode 100644 src/main/scala/spark/Shuffle.scala
delete mode 100644 src/main/scala/spark/SimpleJob.scala
delete mode 100644 src/main/scala/spark/SizeEstimator.scala
delete mode 100644 src/main/scala/spark/SoftReferenceCache.scala
delete mode 100644 src/main/scala/spark/SparkContext.scala
delete mode 100644 src/main/scala/spark/SparkException.scala
delete mode 100644 src/main/scala/spark/Split.scala
delete mode 100644 src/main/scala/spark/Task.scala
delete mode 100644 src/main/scala/spark/TaskResult.scala
delete mode 100644 src/main/scala/spark/Utils.scala
delete mode 100644 src/main/scala/spark/WeakReferenceCache.scala
delete mode 100644 src/main/scala/spark/repl/ExecutorClassLoader.scala
delete mode 100644 src/main/scala/spark/repl/Main.scala
delete mode 100644 src/main/scala/spark/repl/SparkCompletion.scala
delete mode 100644 src/main/scala/spark/repl/SparkCompletionOutput.scala
delete mode 100644 src/main/scala/spark/repl/SparkInteractiveReader.scala
delete mode 100644 src/main/scala/spark/repl/SparkInterpreter.scala
delete mode 100644 src/main/scala/spark/repl/SparkInterpreterLoop.scala
delete mode 100644 src/main/scala/spark/repl/SparkInterpreterSettings.scala
delete mode 100644 src/main/scala/spark/repl/SparkJLineReader.scala
delete mode 100644 src/main/scala/spark/repl/SparkSimpleReader.scala
delete mode 100644 src/test/scala/spark/ParallelArraySplitSuite.scala
delete mode 100644 src/test/scala/spark/ShuffleSuite.scala
delete mode 100644 src/test/scala/spark/repl/ReplSuite.scala
diff --git a/core/lib/apache-log4j-1.2.16/log4j-1.2.16.jar b/core/lib/apache-log4j-1.2.16/log4j-1.2.16.jar
new file mode 100644
index 0000000000..3f9d847618
Binary files /dev/null and b/core/lib/apache-log4j-1.2.16/log4j-1.2.16.jar differ
diff --git a/core/lib/asm-3.2/.DS_Store b/core/lib/asm-3.2/.DS_Store
new file mode 100644
index 0000000000..52b0f12a32
Binary files /dev/null and b/core/lib/asm-3.2/.DS_Store differ
diff --git a/core/lib/asm-3.2/lib/all/README.txt b/core/lib/asm-3.2/lib/all/README.txt
new file mode 100644
index 0000000000..d7c96a5edb
--- /dev/null
+++ b/core/lib/asm-3.2/lib/all/README.txt
@@ -0,0 +1,3 @@
+It is highly recommended to use only the necessary ASM jars for your
+application instead of using the asm-all jar, unless you really need
+all ASM packages.
\ No newline at end of file
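To make the README's advice concrete: a basic read-and-rewrite pass needs only the core asm-3.2.jar added below, not asm-all. A minimal sketch (CopyClass is a made-up name; ClassReader and ClassWriter are the ASM 3.2 core API):

    import java.io.IOException;
    import org.objectweb.asm.ClassReader;
    import org.objectweb.asm.ClassWriter;

    // Reads a class from the classpath and writes it back out unchanged.
    // Everything referenced here lives in the core asm-3.2.jar alone.
    public class CopyClass {
        public static byte[] copy(String className) throws IOException {
            ClassReader reader = new ClassReader(className);   // e.g. "java.lang.String"
            ClassWriter writer = new ClassWriter(ClassWriter.COMPUTE_MAXS);
            reader.accept(writer, 0);                          // pipe the class through untouched
            return writer.toByteArray();
        }
    }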
diff --git a/core/lib/asm-3.2/lib/all/asm-all-3.2.jar b/core/lib/asm-3.2/lib/all/asm-all-3.2.jar
new file mode 100644
index 0000000000..d0ad60ed0a
Binary files /dev/null and b/core/lib/asm-3.2/lib/all/asm-all-3.2.jar differ
diff --git a/core/lib/asm-3.2/lib/all/asm-all-3.2.pom b/core/lib/asm-3.2/lib/all/asm-all-3.2.pom
new file mode 100644
index 0000000000..9899a54c3b
--- /dev/null
+++ b/core/lib/asm-3.2/lib/all/asm-all-3.2.pom
@@ -0,0 +1,15 @@
+<project>
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>asm</groupId>
+    <artifactId>asm-parent</artifactId>
+    <version>3.2</version>
+  </parent>
+
+  <name>ASM All</name>
+  <groupId>asm</groupId>
+  <artifactId>asm-all</artifactId>
+  <packaging>jar</packaging>
+
+</project>
diff --git a/core/lib/asm-3.2/lib/all/asm-debug-all-3.2.jar b/core/lib/asm-3.2/lib/all/asm-debug-all-3.2.jar
new file mode 100644
index 0000000000..94b8549142
Binary files /dev/null and b/core/lib/asm-3.2/lib/all/asm-debug-all-3.2.jar differ
diff --git a/core/lib/asm-3.2/lib/all/asm-debug-all-3.2.pom b/core/lib/asm-3.2/lib/all/asm-debug-all-3.2.pom
new file mode 100644
index 0000000000..9899a54c3b
--- /dev/null
+++ b/core/lib/asm-3.2/lib/all/asm-debug-all-3.2.pom
@@ -0,0 +1,15 @@
+<project>
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>asm</groupId>
+    <artifactId>asm-parent</artifactId>
+    <version>3.2</version>
+  </parent>
+
+  <name>ASM All</name>
+  <groupId>asm</groupId>
+  <artifactId>asm-all</artifactId>
+  <packaging>jar</packaging>
+
+</project>
diff --git a/core/lib/asm-3.2/lib/asm-3.2.jar b/core/lib/asm-3.2/lib/asm-3.2.jar
new file mode 100644
index 0000000000..334e7fdc7f
Binary files /dev/null and b/core/lib/asm-3.2/lib/asm-3.2.jar differ
diff --git a/core/lib/asm-3.2/lib/asm-3.2.pom b/core/lib/asm-3.2/lib/asm-3.2.pom
new file mode 100644
index 0000000000..c714db09b2
--- /dev/null
+++ b/core/lib/asm-3.2/lib/asm-3.2.pom
@@ -0,0 +1,14 @@
+<project>
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>asm-parent</artifactId>
+    <groupId>asm</groupId>
+    <version>3.2</version>
+  </parent>
+
+  <name>ASM Core</name>
+  <artifactId>asm</artifactId>
+  <packaging>jar</packaging>
+
+</project>
diff --git a/core/lib/asm-3.2/lib/asm-analysis-3.2.jar b/core/lib/asm-3.2/lib/asm-analysis-3.2.jar
new file mode 100644
index 0000000000..40ee3151cb
Binary files /dev/null and b/core/lib/asm-3.2/lib/asm-analysis-3.2.jar differ
diff --git a/core/lib/asm-3.2/lib/asm-analysis-3.2.pom b/core/lib/asm-3.2/lib/asm-analysis-3.2.pom
new file mode 100644
index 0000000000..b3933387af
--- /dev/null
+++ b/core/lib/asm-3.2/lib/asm-analysis-3.2.pom
@@ -0,0 +1,21 @@
+<project>
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>asm-parent</artifactId>
+    <groupId>asm</groupId>
+    <version>3.2</version>
+  </parent>
+
+  <name>ASM Analysis</name>
+  <artifactId>asm-analysis</artifactId>
+  <packaging>jar</packaging>
+
+  <dependencies>
+    <dependency>
+      <artifactId>asm-tree</artifactId>
+      <groupId>asm</groupId>
+    </dependency>
+  </dependencies>
+
+</project>
diff --git a/core/lib/asm-3.2/lib/asm-commons-3.2.jar b/core/lib/asm-3.2/lib/asm-commons-3.2.jar
new file mode 100644
index 0000000000..8dfed0a9b7
Binary files /dev/null and b/core/lib/asm-3.2/lib/asm-commons-3.2.jar differ
diff --git a/core/lib/asm-3.2/lib/asm-commons-3.2.pom b/core/lib/asm-3.2/lib/asm-commons-3.2.pom
new file mode 100644
index 0000000000..8517715b4a
--- /dev/null
+++ b/core/lib/asm-3.2/lib/asm-commons-3.2.pom
@@ -0,0 +1,21 @@
+<project>
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>asm-parent</artifactId>
+    <groupId>asm</groupId>
+    <version>3.2</version>
+  </parent>
+
+  <name>ASM Commons</name>
+  <artifactId>asm-commons</artifactId>
+  <packaging>jar</packaging>
+
+  <dependencies>
+    <dependency>
+      <artifactId>asm-tree</artifactId>
+      <groupId>asm</groupId>
+    </dependency>
+  </dependencies>
+
+</project>
diff --git a/core/lib/asm-3.2/lib/asm-parent-3.2.pom b/core/lib/asm-3.2/lib/asm-parent-3.2.pom
new file mode 100644
index 0000000000..c220347f6a
--- /dev/null
+++ b/core/lib/asm-3.2/lib/asm-parent-3.2.pom
@@ -0,0 +1,136 @@
+<project>
+  <modelVersion>4.0.0</modelVersion>
+
+  <artifactId>asm-parent</artifactId>
+  <groupId>asm</groupId>
+  <version>3.2</version>
+  <packaging>pom</packaging>
+
+  <name>ASM</name>
+  <description>A very small and fast Java bytecode manipulation framework</description>
+  <url>http://asm.objectweb.org/</url>
+
+  <organization>
+    <name>ObjectWeb</name>
+    <url>http://www.objectweb.org/</url>
+  </organization>
+  <inceptionYear>2000</inceptionYear>
+
+  <licenses>
+    <license>
+      <name>BSD</name>
+      <url>http://asm.objectweb.org/license.html</url>
+    </license>
+  </licenses>
+
+  <developers>
+    <developer>
+      <name>Eric Bruneton</name>
+      <id>ebruneton</id>
+      <email>Eric.Bruneton@rd.francetelecom.com</email>
+      <roles>
+        <role>Creator</role>
+        <role>Java Developer</role>
+      </roles>
+    </developer>
+    <developer>
+      <name>Eugene Kuleshov</name>
+      <id>eu</id>
+      <email>eu@javatx.org</email>
+      <roles>
+        <role>Java Developer</role>
+      </roles>
+    </developer>
+  </developers>
+
+  <scm>
+    <connection>scm:cvs:pserver:anonymous:@cvs.forge.objectweb.org:/cvsroot/asm:asm</connection>
+    <developerConnection>scm:cvs:ext:${maven.username}@cvs.forge.objectweb.org:/cvsroot/asm:asm</developerConnection>
+    <url>http://cvs.forge.objectweb.org/cgi-bin/viewcvs.cgi/asm/asm/</url>
+  </scm>
+
+  <issueManagement>
+    <url>http://forge.objectweb.org/tracker/?group_id=23</url>
+  </issueManagement>
+
+  <dependencyManagement>
+    <dependencies>
+
+      <dependency>
+        <artifactId>asm</artifactId>
+        <groupId>${project.groupId}</groupId>
+        <version>${project.version}</version>
+      </dependency>
+
+      <dependency>
+        <artifactId>asm-tree</artifactId>
+        <groupId>${project.groupId}</groupId>
+        <version>${project.version}</version>
+      </dependency>
+
+      <dependency>
+        <artifactId>asm-analysis</artifactId>
+        <groupId>${project.groupId}</groupId>
+        <version>${project.version}</version>
+      </dependency>
+
+      <dependency>
+        <artifactId>asm-commons</artifactId>
+        <groupId>${project.groupId}</groupId>
+        <version>${project.version}</version>
+      </dependency>
+
+      <dependency>
+        <artifactId>asm-util</artifactId>
+        <groupId>${project.groupId}</groupId>
+        <version>${project.version}</version>
+      </dependency>
+
+      <dependency>
+        <artifactId>asm-xml</artifactId>
+        <groupId>${project.groupId}</groupId>
+        <version>${project.version}</version>
+      </dependency>
+
+    </dependencies>
+  </dependencyManagement>
+
+  <mailingLists>
+    <mailingList>
+      <name>ASM Users List</name>
+      <subscribe>sympa@ow2.org?subject=subscribe%20asm</subscribe>
+      <unsubscribe>sympa@ow2.org?subject=unsubscribe%20asm</unsubscribe>
+      <post>asm@ow2.org</post>
+      <archive>http://www.ow2.org/wws/arc/asm</archive>
+    </mailingList>
+    <mailingList>
+      <name>ASM Team List</name>
+      <subscribe>sympa@ow2.org?subject=subscribe%20asm-team</subscribe>
+      <unsubscribe>sympa@ow2.org?subject=unsubscribe%20asm-team</unsubscribe>
+      <post>asm-team@ow2.org</post>
+      <archive>http://www.ow2.org/wws/arc/asm-team</archive>
+    </mailingList>
+  </mailingLists>
+
+  <distributionManagement>
+    <downloadUrl>http://mojo.codehaus.org/my-project</downloadUrl>
+    <repository>
+      <id>objectweb</id>
+      <uniqueVersion>false</uniqueVersion>
+      <name>ObjectWeb Maven 2.0 Repository</name>
+      <url>dav:https://maven.forge.objectweb.org:8002/maven2/</url>
+      <layout>default</layout>
+    </repository>
+    <snapshotRepository>
+      <id>objectweb.snapshots</id>
+      <uniqueVersion>false</uniqueVersion>
+      <name>ObjectWeb Maven 2.0 Snapshot Repository</name>
+      <url>dav:https://maven.forge.objectweb.org:8002/maven2-snapshot/</url>
+      <layout>default</layout>
+    </snapshotRepository>
+  </distributionManagement>
+
+</project>
diff --git a/core/lib/asm-3.2/lib/asm-tree-3.2.jar b/core/lib/asm-3.2/lib/asm-tree-3.2.jar
new file mode 100644
index 0000000000..b21fb86a92
Binary files /dev/null and b/core/lib/asm-3.2/lib/asm-tree-3.2.jar differ
diff --git a/core/lib/asm-3.2/lib/asm-tree-3.2.pom b/core/lib/asm-3.2/lib/asm-tree-3.2.pom
new file mode 100644
index 0000000000..9f454528f4
--- /dev/null
+++ b/core/lib/asm-3.2/lib/asm-tree-3.2.pom
@@ -0,0 +1,21 @@
+<project>
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>asm-parent</artifactId>
+    <groupId>asm</groupId>
+    <version>3.2</version>
+  </parent>
+
+  <name>ASM Tree</name>
+  <artifactId>asm-tree</artifactId>
+  <packaging>jar</packaging>
+
+  <dependencies>
+    <dependency>
+      <artifactId>asm</artifactId>
+      <groupId>asm</groupId>
+    </dependency>
+  </dependencies>
+
+</project>
diff --git a/core/lib/asm-3.2/lib/asm-util-3.2.jar b/core/lib/asm-3.2/lib/asm-util-3.2.jar
new file mode 100644
index 0000000000..499d229034
Binary files /dev/null and b/core/lib/asm-3.2/lib/asm-util-3.2.jar differ
diff --git a/core/lib/asm-3.2/lib/asm-util-3.2.pom b/core/lib/asm-3.2/lib/asm-util-3.2.pom
new file mode 100644
index 0000000000..e302b0f356
--- /dev/null
+++ b/core/lib/asm-3.2/lib/asm-util-3.2.pom
@@ -0,0 +1,21 @@
+<project>
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>asm-parent</artifactId>
+    <groupId>asm</groupId>
+    <version>3.2</version>
+  </parent>
+
+  <name>ASM Util</name>
+  <artifactId>asm-util</artifactId>
+  <packaging>jar</packaging>
+
+  <dependencies>
+    <dependency>
+      <artifactId>asm-tree</artifactId>
+      <groupId>asm</groupId>
+    </dependency>
+  </dependencies>
+
+</project>
diff --git a/core/lib/asm-3.2/lib/asm-xml-3.2.jar b/core/lib/asm-3.2/lib/asm-xml-3.2.jar
new file mode 100644
index 0000000000..31b31b56fe
Binary files /dev/null and b/core/lib/asm-3.2/lib/asm-xml-3.2.jar differ
diff --git a/core/lib/asm-3.2/lib/asm-xml-3.2.pom b/core/lib/asm-3.2/lib/asm-xml-3.2.pom
new file mode 100644
index 0000000000..0f3de1f2ab
--- /dev/null
+++ b/core/lib/asm-3.2/lib/asm-xml-3.2.pom
@@ -0,0 +1,21 @@
+<project>
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>asm-parent</artifactId>
+    <groupId>asm</groupId>
+    <version>3.2</version>
+  </parent>
+
+  <name>ASM XML</name>
+  <artifactId>asm-xml</artifactId>
+  <packaging>jar</packaging>
+
+  <dependencies>
+    <dependency>
+      <artifactId>asm-util</artifactId>
+      <groupId>asm</groupId>
+    </dependency>
+  </dependencies>
+
+</project>
diff --git a/core/lib/colt.jar b/core/lib/colt.jar
new file mode 100644
index 0000000000..a7192f68b3
Binary files /dev/null and b/core/lib/colt.jar differ
diff --git a/core/lib/guava-r07/COPYING b/core/lib/guava-r07/COPYING
new file mode 100644
index 0000000000..d645695673
--- /dev/null
+++ b/core/lib/guava-r07/COPYING
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/core/lib/guava-r07/README b/core/lib/guava-r07/README
new file mode 100644
index 0000000000..a0e832dd54
--- /dev/null
+++ b/core/lib/guava-r07/README
@@ -0,0 +1,28 @@
+Guava: Google Core Libraries for Java
+
+Requires JDK 5 or higher.
+
+Project page:
+ http://guava-libraries.googlecode.com
+
+Ask "how-to" and "why-didn't-it-work" questions at:
+ http://www.stackoverflow.com/questions/ask
+ (use the "guava" tag so we'll see it)
+
+Ask discussion questions at:
+ http://groups.google.com/group/guava-discuss
+
+Subscribe to project updates in your feed reader:
+ http://code.google.com/feeds/p/guava-libraries/updates/basic
+
+Warnings:
+
+All APIs marked @Beta at the class or method level are subject to
+change. If your code is a library or framework that users outside
+your control will include on their classpath, do not use @Beta
+APIs (at least without repackaging them somehow).
+
+Serialized forms of ALL objects are subject to change. Do not
+persist these and assume they can be read by a future version of
+the library.
+
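To illustrate the README's warning in practice, the sketch below sticks to long-stable collection and string helpers rather than anything marked @Beta (GuavaExample is a made-up name; ImmutableList and Joiner ship in guava-r07, though whether any given method is @Beta should be checked against the r07 javadoc):

    import java.util.List;
    import com.google.common.base.Joiner;
    import com.google.common.collect.ImmutableList;

    public class GuavaExample {
        public static void main(String[] args) {
            // Immutable collections and Joiner predate r07 and are safer to
            // expose from library code than @Beta-annotated APIs.
            List<String> names = ImmutableList.of("alice", "bob", "carol");
            System.out.println(Joiner.on(", ").join(names));   // alice, bob, carol
        }
    }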
diff --git a/core/lib/guava-r07/guava-r07.jar b/core/lib/guava-r07/guava-r07.jar
new file mode 100644
index 0000000000..a6c9ce02df
Binary files /dev/null and b/core/lib/guava-r07/guava-r07.jar differ
diff --git a/core/lib/hadoop-0.20.0/.DS_Store b/core/lib/hadoop-0.20.0/.DS_Store
new file mode 100644
index 0000000000..81f4e05e09
Binary files /dev/null and b/core/lib/hadoop-0.20.0/.DS_Store differ
diff --git a/core/lib/hadoop-0.20.0/CHANGES.txt b/core/lib/hadoop-0.20.0/CHANGES.txt
new file mode 100644
index 0000000000..95c8b5c08b
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/CHANGES.txt
@@ -0,0 +1,8288 @@
+Hadoop Change Log
+
+Release 0.20.0 - 2009-04-15
+
+ INCOMPATIBLE CHANGES
+
+ HADOOP-4210. Fix findbugs warnings for equals implementations of mapred ID
+ classes. Removed public, static ID::read and ID::forName; made ID an
+ abstract class. (Suresh Srinivas via cdouglas)
+
+ HADOOP-4253. Fix various warnings generated by findbugs.
+ Following deprecated methods in RawLocalFileSystem are removed:
+ public String getName()
+ public void lock(Path p, boolean shared)
+ public void release(Path p)
+ (Suresh Srinivas via johan)
+
+ HADOOP-4618. Move http server from FSNamesystem into NameNode.
+ FSNamesystem.getNameNodeInfoPort() is removed.
+ FSNamesystem.getDFSNameNodeMachine() and FSNamesystem.getDFSNameNodePort()
+ replaced by FSNamesystem.getDFSNameNodeAddress().
+ NameNode(bindAddress, conf) is removed.
+ (shv)
+
+ HADOOP-4567. GetFileBlockLocations returns the NetworkTopology
+ information of the machines where the blocks reside. (dhruba)
+
+ HADOOP-4435. The JobTracker WebUI displays the amount of heap memory
+ in use. (dhruba)
+
+ HADOOP-4628. Move Hive into a standalone subproject. (omalley)
+
+ HADOOP-4188. Removes task's dependency on concrete filesystems.
+ (Sharad Agarwal via ddas)
+
+ HADOOP-1650. Upgrade to Jetty 6. (cdouglas)
+
+ HADOOP-3986. Remove static Configuration from JobClient. (Amareshwari
+ Sriramadasu via cdouglas)
+ JobClient::setCommandLineConfig is removed
+ JobClient::getCommandLineConfig is removed
+ JobShell, TestJobShell classes are removed
+
+ HADOOP-4422. S3 file systems should not create bucket.
+ (David Phillips via tomwhite)
+
+ HADOOP-4035. Support memory based scheduling in capacity scheduler.
+ (Vinod Kumar Vavilapalli via yhemanth)
+
+ HADOOP-3497. Fix bug in overly restrictive file globbing with a
+ PathFilter. (tomwhite)
+
+ HADOOP-4445. Replace running task counts with running task
+ percentage in capacity scheduler UI. (Sreekanth Ramakrishnan via
+ yhemanth)
+
+ HADOOP-4631. Splits the configuration into three parts - one for core,
+ one for mapred and the last one for HDFS. (Sharad Agarwal via cdouglas)
+
+ HADOOP-3344. Fix libhdfs build to use autoconf and build the same
+ architecture (32 vs 64 bit) of the JVM running Ant. The libraries for
+ pipes, utils, and libhdfs are now all in c++/<os_osarch_jvmdatamodel>/lib.
+ (Giridharan Kesavan via nigel)
+
+ HADOOP-4874. Remove LZO codec because of licensing issues. (omalley)
+
+ HADOOP-4970. The full path name of a file is preserved inside Trash.
+ (Prasad Chakka via dhruba)
+
+ HADOOP-4103. NameNode keeps a count of missing blocks. It warns on
+ WebUI if there are such blocks. '-report' and '-metaSave' have extra
+ info to track such blocks. (Raghu Angadi)
+
+ HADOOP-4783. Change permissions on history files on the jobtracker
+ to be only group readable instead of world readable.
+ (Amareshwari Sriramadasu via yhemanth)
+
+ HADOOP-5531. Removed Chukwa from Hadoop 0.20.0. (nigel)
+
+ NEW FEATURES
+
+ HADOOP-4575. Add a proxy service for relaying HsftpFileSystem requests.
+ Includes client authentication via user certificates and config-based
+ access control. (Kan Zhang via cdouglas)
+
+ HADOOP-4661. Add DistCh, a new tool for distributed ch{mod,own,grp}.
+ (szetszwo)
+
+ HADOOP-4709. Add several new features and bug fixes to Chukwa.
+ Added Hadoop Infrastructure Care Center (UI for visualizing data collected
+ by Chukwa)
+ Added FileAdaptor for streaming a small file in one chunk
+ Added compression to archive and demux output
+ Added unit tests and validation for agent, collector, and demux map
+ reduce job
+ Added database loader for loading demux output (sequence file) to jdbc
+ connected database
+ Added algorithm to distribute collector load more evenly
+ (Jerome Boulon, Eric Yang, Andy Konwinski, Ariel Rabkin via cdouglas)
+
+ HADOOP-4179. Add Vaidya tool to analyze map/reduce job logs for performance
+ problems. (Suhas Gogate via omalley)
+
+ HADOOP-4029. Add NameNode storage information to the dfshealth page and
+ move DataNode information to a separated page. (Boris Shkolnik via
+ szetszwo)
+
+ HADOOP-4348. Add service-level authorization for Hadoop. (acmurthy)
+
+ HADOOP-4826. Introduce admin command saveNamespace. (shv)
+
+ HADOOP-3063. BloomMapFile - fail-fast version of MapFile for sparsely
+ populated key space. (Andrzej Bialecki via stack)
+
+ HADOOP-1230. Add new map/reduce API and deprecate the old one. Generally,
+ the old code should work without problem. The new api is in
+ org.apache.hadoop.mapreduce and the old classes in org.apache.hadoop.mapred
+ are deprecated. Differences in the new API:
+ 1. All of the methods take Context objects that allow us to add new
+ methods without breaking compatibility.
+ 2. Mapper and Reducer now have a "run" method that is called once and
+ contains the control loop for the task, which lets applications
+ replace it.
+ 3. Mapper and Reducer by default are Identity Mapper and Reducer.
+ 4. The FileOutputFormats use part-r-00000 for the output of reduce 0 and
+ part-m-00000 for the output of map 0.
+ 5. The reduce grouping comparator now uses the raw compare instead of
+ object compare.
+ 6. The number of maps in FileInputFormat is controlled by min and max
+ split size rather than min size and the desired number of maps.
+ (omalley)
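As a hedged illustration of point 1 above (not text from the patch): a minimal mapper against the new org.apache.hadoop.mapreduce API looks roughly like the sketch below. LineLengthMapper is a made-up name, while Mapper, Context, and context.write() are the new-API types the entry describes:

    import java.io.IOException;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    // The Context parameter is what lets new methods be added later without
    // breaking this signature; run() wraps the per-record control loop.
    public class LineLengthMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(value, new IntWritable(value.getLength()));
        }
    }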
+
+ HADOOP-3305. Use Ivy to manage dependencies. (Giridharan Kesavan
+ and Steve Loughran via cutting)
+
+ IMPROVEMENTS
+
+ HADOOP-4565. Added CombineFileInputFormat to use data locality information
+ to create splits. (dhruba via zshao)
+
+ HADOOP-4749. Added a new counter REDUCE_INPUT_BYTES. (Yongqiang He via
+ zshao)
+
+ HADOOP-4234. Fix KFS "glue" layer to allow applications to interface
+ with multiple KFS metaservers. (Sriram Rao via lohit)
+
+ HADOOP-4245. Update to latest version of KFS "glue" library jar.
+ (Sriram Rao via lohit)
+
+ HADOOP-4244. Change test-patch.sh to check Eclipse classpath no matter
+ it is run by Hudson or not. (szetszwo)
+
+ HADOOP-3180. Add name of missing class to WritableName.getClass
+ IOException. (Pete Wyckoff via omalley)
+
+ HADOOP-4178. Make the capacity scheduler's default values configurable.
+ (Sreekanth Ramakrishnan via omalley)
+
+ HADOOP-4262. Generate better error message when client exception has null
+ message. (stevel via omalley)
+
+ HADOOP-4226. Refactor and document LineReader to make it more readily
+ understandable. (Yuri Pradkin via cdouglas)
+
+ HADOOP-4238. When listing jobs, if scheduling information isn't available
+ print NA instead of empty output. (Sreekanth Ramakrishnan via johan)
+
+ HADOOP-4284. Support filters that apply to all requests, or global filters,
+ to HttpServer. (Kan Zhang via cdouglas)
+
+ HADOOP-4276. Improve the hashing functions and deserialization of the
+ mapred ID classes. (omalley)
+
+ HADOOP-4485. Add a compile-native ant task, as a shorthand. (enis)
+
+ HADOOP-4454. Allow # comments in slaves file. (Rama Ramasamy via omalley)
+
+ HADOOP-3461. Remove hdfs.StringBytesWritable. (szetszwo)
+
+ HADOOP-4437. Use Halton sequence instead of java.util.Random in
+ PiEstimator. (szetszwo)
+
+ HADOOP-4572. Change INode and its sub-classes to package private.
+ (szetszwo)
+
+ HADOOP-4187. Does a runtime lookup for JobConf/JobConfigurable, and if
+ found, invokes the appropriate configure method. (Sharad Agarwal via ddas)
+
+ HADOOP-4453. Improve ssl configuration and handling in HsftpFileSystem,
+ particularly when used with DistCp. (Kan Zhang via cdouglas)
+
+ HADOOP-4583. Several code optimizations in HDFS. (Suresh Srinivas via
+ szetszwo)
+
+ HADOOP-3923. Remove org.apache.hadoop.mapred.StatusHttpServer. (szetszwo)
+
+ HADOOP-4622. Explicitly specify interpretor for non-native
+ pipes binaries. (Fredrik Hedberg via johan)
+
+ HADOOP-4505. Add a unit test to test faulty setup task and cleanup
+ task killing the job. (Amareshwari Sriramadasu via johan)
+
+ HADOOP-4608. Don't print a stack trace when the example driver gets an
+ unknown program to run. (Edward Yoon via omalley)
+
+ HADOOP-4645. Package HdfsProxy contrib project without the extra level
+ of directories. (Kan Zhang via omalley)
+
+ HADOOP-4126. Allow access to HDFS web UI on EC2 (tomwhite via omalley)
+
+ HADOOP-4612. Removes RunJar's dependency on JobClient.
+ (Sharad Agarwal via ddas)
+
+ HADOOP-4185. Adds setVerifyChecksum() method to FileSystem.
+ (Sharad Agarwal via ddas)
+
+ HADOOP-4523. Prevent too many tasks scheduled on a node from bringing
+ it down by monitoring for cumulative memory usage across tasks.
+ (Vinod Kumar Vavilapalli via yhemanth)
+
+ HADOOP-4640. Adds an input format that can split lzo compressed
+ text files. (johan)
+
+ HADOOP-4666. Launch reduces only after a few maps have run in the
+ Fair Scheduler. (Matei Zaharia via johan)
+
+ HADOOP-4339. Remove redundant calls from FileSystem/FsShell when
+ generating/processing ContentSummary. (David Phillips via cdouglas)
+
+ HADOOP-2774. Add counters tracking records spilled to disk in MapTask and
+ ReduceTask. (Ravi Gummadi via cdouglas)
+
+ HADOOP-4513. Initialize jobs asynchronously in the capacity scheduler.
+ (Sreekanth Ramakrishnan via yhemanth)
+
+ HADOOP-4649. Improve abstraction for spill indices. (cdouglas)
+
+ HADOOP-3770. Add gridmix2, an iteration on the gridmix benchmark. (Runping
+ Qi via cdouglas)
+
+ HADOOP-4708. Add support for dfsadmin commands in TestCLI. (Boris Shkolnik
+ via cdouglas)
+
+ HADOOP-4758. Add a splitter for metrics contexts to support more than one
+ type of collector. (cdouglas)
+
+ HADOOP-4722. Add tests for dfsadmin quota error messages. (Boris Shkolnik
+ via cdouglas)
+
+ HADOOP-4690. fuse-dfs - create source file/function + utils + config +
+ main source files. (pete wyckoff via mahadev)
+
+ HADOOP-3750. Fix and enforce module dependencies. (Sharad Agarwal via
+ tomwhite)
+
+ HADOOP-4747. Speed up FsShell::ls by removing redundant calls to the
+ filesystem. (David Phillips via cdouglas)
+
+ HADOOP-4305. Improves the blacklisting strategy, whereby, tasktrackers
+ that are blacklisted are not given tasks to run from other jobs, subject
+ to the following conditions (all must be met):
+ 1) The TaskTracker has been blacklisted by at least 4 jobs (configurable)
+ 2) The TaskTracker has been blacklisted 50% more number of times than
+ the average (configurable)
+ 3) The cluster has less than 50% trackers blacklisted
+ Once in 24 hours, a TaskTracker blacklisted for all jobs is given a chance.
+ Restarting the TaskTracker moves it out of the blacklist.
+ (Amareshwari Sriramadasu via ddas)
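Read together, the three conditions amount to a conjunction like the hypothetical helper below (all names invented; the real JobTracker bookkeeping differs in detail):

    public class BlacklistPolicy {
        // Mirrors the three conditions listed in the entry above.
        static boolean blacklistedAcrossJobs(int blacklistedByJobs,
                                             double avgBlacklistCount,
                                             int blacklistedTrackers,
                                             int totalTrackers) {
            boolean enoughJobs = blacklistedByJobs >= 4;                      // condition 1 (configurable)
            boolean aboveAvg   = blacklistedByJobs > 1.5 * avgBlacklistCount; // condition 2: 50% over average
            boolean clusterOk  = blacklistedTrackers < 0.5 * totalTrackers;   // condition 3
            return enoughJobs && aboveAvg && clusterOk;
        }
    }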
+
+ HADOOP-4688. Modify the MiniMRDFSSort unit test to spill multiple times,
+ exercising the map-side merge code. (cdouglas)
+
+ HADOOP-4737. Adds the KILLED notification when jobs get killed.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4728. Add a test exercising different namenode configurations.
+ (Boris Shkolnik via cdouglas)
+
+ HADOOP-4807. Adds JobClient commands to get the active/blacklisted tracker
+ names. Also adds commands to display running/completed task attempt IDs.
+ (ddas)
+
+ HADOOP-4699. Remove checksum validation from map output servlet. (cdouglas)
+
+ HADOOP-4838. Added a registry to automate metrics and mbeans management.
+ (Sanjay Radia via acmurthy)
+
+ HADOOP-3136. Fixed the default scheduler to assign multiple tasks to each
+ tasktracker per heartbeat, when feasible. To ensure locality isn't hurt
+ too badly, the scheduler will not assign more than one off-switch task per
+ heartbeat. The heartbeat interval is also halved since the task-tracker is
+ fixed to no longer send out heartbeats on each task completion. A
+ slow-start for scheduling reduces is introduced to ensure that reduces
+ aren't started till sufficient number of maps are done, else reduces of
+ jobs whose maps aren't scheduled might swamp the cluster.
+ Configuration changes to mapred-default.xml:
+ add mapred.reduce.slowstart.completed.maps
+ (acmurthy)
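The new key is an ordinary job configuration property, so a job could opt in with something like the sketch below (illustrative value and class name; only the property name comes from the entry above):

    import org.apache.hadoop.mapred.JobConf;

    public class SlowstartConfig {
        public static JobConf withSlowstart() {
            JobConf conf = new JobConf();
            // Hold back reduces until half of the maps have completed.
            conf.setFloat("mapred.reduce.slowstart.completed.maps", 0.5f);
            return conf;
        }
    }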
+
+ HADOOP-4545. Add example and test case of secondary sort for the reduce.
+ (omalley)
+
+ HADOOP-4753. Refactor gridmix2 to reduce code duplication. (cdouglas)
+
+ HADOOP-4909. Fix Javadoc and make some of the API more consistent in their
+ use of the JobContext instead of Configuration. (omalley)
+
+ HADOOP-4830. Add end-to-end test cases for testing queue capacities.
+ (Vinod Kumar Vavilapalli via yhemanth)
+
+ HADOOP-4980. Improve code layout of capacity scheduler to make it
+ easier to fix some blocker bugs. (Vivek Ratan via yhemanth)
+
+ HADOOP-4916. Make user/location of Chukwa installation configurable by an
+ external properties file. (Eric Yang via cdouglas)
+
+ HADOOP-4950. Make the CompressorStream, DecompressorStream,
+ BlockCompressorStream, and BlockDecompressorStream public to facilitate
+ non-Hadoop codecs. (omalley)
+
+ HADOOP-4843. Collect job history and configuration in Chukwa. (Eric Yang
+ via cdouglas)
+
+ HADOOP-5030. Build Chukwa RPM to install into configured directory. (Eric
+ Yang via cdouglas)
+
+ HADOOP-4828. Updates documents to do with configuration (HADOOP-4631).
+ (Sharad Agarwal via ddas)
+
+ HADOOP-4939. Adds a test that would inject random failures for tasks in
+ large jobs and would also inject TaskTracker failures. (ddas)
+
+ HADOOP-4920. Stop storing Forrest output in Subversion. (cutting)
+
+ HADOOP-4944. A configuration file can include other configuration
+ files. (Rama Ramasamy via dhruba)
+
+ HADOOP-4804. Provide Forrest documentation for the Fair Scheduler.
+ (Sreekanth Ramakrishnan via yhemanth)
+
+ HADOOP-5248. A testcase that checks for the existence of job directory
+ after the job completes. Fails if it exists. (ddas)
+
+ HADOOP-4664. Introduces multiple job initialization threads, where the
+ number of threads are configurable via mapred.jobinit.threads.
+ (Matei Zaharia and Jothi Padmanabhan via ddas)
+
+ HADOOP-4191. Adds a testcase for JobHistory. (Ravi Gummadi via ddas)
+
+ HADOOP-5466. Change documentation CSS style for headers and code. (Corinne
+ Chandel via szetszwo)
+
+ HADOOP-5275. Add ivy directory and files to built tar.
+ (Giridharan Kesavan via nigel)
+
+ HADOOP-5468. Add sub-menus to forrest documentation and make some minor
+ edits. (Corinne Chandel via szetszwo)
+
+ HADOOP-5437. Fix TestMiniMRDFSSort to properly test jvm-reuse. (omalley)
+
+ HADOOP-5521. Removes dependency of TestJobInProgress on RESTART_COUNT
+ JobHistory tag. (Ravi Gummadi via ddas)
+
+ OPTIMIZATIONS
+
+ HADOOP-3293. Fixes FileInputFormat to provide locations for splits
+ based on the rack/host that has the greatest number of bytes.
+ (Jothi Padmanabhan via ddas)
+
+ HADOOP-4683. Fixes Reduce shuffle scheduler to invoke
+ getMapCompletionEvents in a separate thread. (Jothi Padmanabhan
+ via ddas)
+
+ BUG FIXES
+
+ HADOOP-5379. CBZip2InputStream to throw IOException on data crc error.
+ (Rodrigo Schmidt via zshao)
+
+ HADOOP-5326. Fixes CBZip2OutputStream data corruption problem.
+ (Rodrigo Schmidt via zshao)
+
+ HADOOP-4204. Fix findbugs warnings related to unused variables, naive
+ Number subclass instantiation, Map iteration, and badly scoped inner
+ classes. (Suresh Srinivas via cdouglas)
+
+ HADOOP-4207. Update derby jar file to release 10.4.2 release.
+ (Prasad Chakka via dhruba)
+
+ HADOOP-4325. SocketInputStream.read() should return -1 in case EOF.
+ (Raghu Angadi)
+
+ HADOOP-4408. FsAction functions need not create new objects. (cdouglas)
+
+ HADOOP-4440. TestJobInProgressListener tests for jobs killed in queued
+ state (Amar Kamat via ddas)
+
+ HADOOP-4346. Implement blocking connect so that Hadoop is not affected
+ by selector problem with JDK default implementation. (Raghu Angadi)
+
+ HADOOP-4388. If there are invalid blocks in the transfer list, Datanode
+ should handle them and keep transferring the remaining blocks. (Suresh
+ Srinivas via szetszwo)
+
+ HADOOP-4587. Fix a typo in Mapper javadoc. (Koji Noguchi via szetszwo)
+
+ HADOOP-4530. In fsck, HttpServletResponse sendError fails with
+ IllegalStateException. (hairong)
+
+ HADOOP-4377. Fix a race condition in directory creation in
+ NativeS3FileSystem. (David Phillips via cdouglas)
+
+ HADOOP-4621. Fix javadoc warnings caused by duplicate jars. (Kan Zhang via
+ cdouglas)
+
+ HADOOP-4566. Deploy new hive code to support more types.
+ (Zheng Shao via dhruba)
+
+ HADOOP-4571. Add chukwa conf files to svn:ignore list. (Eric Yang via
+ szetszwo)
+
+ HADOOP-4589. Correct PiEstimator output messages and improve the code
+ readability. (szetszwo)
+
+ HADOOP-4650. Correct a mismatch between the default value of
+ local.cache.size in the config and the source. (Jeff Hammerbacher via
+ cdouglas)
+
+ HADOOP-4606. Fix cygpath error if the log directory does not exist.
+ (szetszwo via omalley)
+
+ HADOOP-4141. Fix bug in ScriptBasedMapping causing potential infinite
+ loop on misconfigured hadoop-site. (Aaron Kimball via tomwhite)
+
+ HADOOP-4691. Correct a link in the javadoc of IndexedSortable. (szetszwo)
+
+ HADOOP-4598. '-setrep' command skips under-replicated blocks. (hairong)
+
+ HADOOP-4429. Set defaults for user, group in UnixUserGroupInformation so
+ login fails more predictably when misconfigured. (Alex Loddengaard via
+ cdouglas)
+
+ HADOOP-4676. Fix broken URL in blacklisted tasktrackers page. (Amareshwari
+ Sriramadasu via cdouglas)
+
+ HADOOP-3422. Ganglia counter metrics are all reported with the metric
+ name "value", so the counter values cannot be seen. (Jason Attributor
+ and Brian Bockelman via stack)
+
+ HADOOP-4704. Fix javadoc typos "the the". (szetszwo)
+
+ HADOOP-4677. Fix semantics of FileSystem::getBlockLocations to return
+ meaningful values. (Hong Tang via cdouglas)
+
+ HADOOP-4669. Use correct operator when evaluating whether access time is
+ enabled (Dhruba Borthakur via cdouglas)
+
+ HADOOP-4732. Pass connection and read timeouts in the correct order when
+ setting up fetch in reduce. (Amareshwari Sriramadasu via cdouglas)
+
+ HADOOP-4558. Fix capacity reclamation in capacity scheduler.
+ (Amar Kamat via yhemanth)
+
+ HADOOP-4770. Fix rungridmix_2 script to work with RunJar. (cdouglas)
+
+ HADOOP-4738. When using git, the saveVersion script will use only the
+ commit hash for the version and not the message, which requires escaping.
+ (cdouglas)
+
+ HADOOP-4576. Show pending job count instead of task count in the UI per
+ queue in capacity scheduler. (Sreekanth Ramakrishnan via yhemanth)
+
+ HADOOP-4623. Maintain running tasks even if speculative execution is off.
+ (Amar Kamat via yhemanth)
+
+ HADOOP-4786. Fix broken compilation error in
+ TestTrackerBlacklistAcrossJobs. (yhemanth)
+
+ HADOOP-4785. Fixes the JobTracker heartbeat to not make two calls to
+ System.currentTimeMillis(). (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4792. Add generated Chukwa configuration files to version control
+ ignore lists. (cdouglas)
+
+ HADOOP-4796. Fix Chukwa test configuration, remove unused components. (Eric
+ Yang via cdouglas)
+
+ HADOOP-4708. Add binaries missed in the initial checkin for Chukwa. (Eric
+ Yang via cdouglas)
+
+ HADOOP-4805. Remove black list collector from Chukwa Agent HTTP Sender.
+ (Eric Yang via cdouglas)
+
+ HADOOP-4837. Move HADOOP_CONF_DIR configuration to chukwa-env.sh (Jerome
+ Boulon via cdouglas)
+
+ HADOOP-4825. Use ps instead of jps for querying process status in Chukwa.
+ (Eric Yang via cdouglas)
+
+ HADOOP-4844. Fixed javadoc for
+ org.apache.hadoop.fs.permission.AccessControlException to document that
+ it's deprecated in favour of
+ org.apache.hadoop.security.AccessControlException. (acmurthy)
+
+ HADOOP-4706. Close the underlying output stream in
+ IFileOutputStream::close. (Jothi Padmanabhan via cdouglas)
+
+ HADOOP-4855. Fixed command-specific help messages for refreshServiceAcl in
+ DFSAdmin and MRAdmin. (acmurthy)
+
+ HADOOP-4820. Remove unused method FSNamesystem::deleteInSafeMode. (Suresh
+ Srinivas via cdouglas)
+
+ HADOOP-4698. Lower io.sort.mb to 10 in the tests and raise the junit memory
+ limit to 512m from 256m. (Nigel Daley via cdouglas)
+
+ HADOOP-4860. Split TestFileTailingAdapters into three separate tests to
+ avoid contention. (Eric Yang via cdouglas)
+
+ HADOOP-3921. Fixed clover (code coverage) target to work with JDK 6.
+ (tomwhite via nigel)
+
+ HADOOP-4845. Modify the reduce input byte counter to record only the
+ compressed size and add a human-readable label. (Yongqiang He via cdouglas)
+
+ HADOOP-4458. Add a test creating symlinks in the working directory.
+ (Amareshwari Sriramadasu via cdouglas)
+
+ HADOOP-4879. Fix org.apache.hadoop.mapred.Counters to correctly define
+ Object.equals rather than depend on contentEquals api. (omalley via
+ acmurthy)
+
+ HADOOP-4791. Fix rpm build process for Chukwa. (Eric Yang via cdouglas)
+
+ HADOOP-4771. Correct initialization of the file count for directories
+ with quotas. (Ruyue Ma via shv)
+
+ HADOOP-4878. Fix eclipse plugin classpath file to point to ivy's resolved
+ lib directory and added the same to test-patch.sh. (Giridharan Kesavan via
+ acmurthy)
+
+ HADOOP-4774. Fix default values of some capacity scheduler configuration
+ items which would otherwise not work on a fresh checkout.
+ (Sreekanth Ramakrishnan via yhemanth)
+
+ HADOOP-4876. Fix capacity scheduler reclamation by updating count of
+ pending tasks correctly. (Sreekanth Ramakrishnan via yhemanth)
+
+ HADOOP-4849. Documentation for Service Level Authorization implemented in
+ HADOOP-4348. (acmurthy)
+
+ HADOOP-4827. Replace Consolidator with Aggregator macros in Chukwa (Eric
+ Yang via cdouglas)
+
+ HADOOP-4894. Correctly parse ps output in Chukwa jettyCollector.sh. (Ari
+ Rabkin via cdouglas)
+
+ HADOOP-4892. Close fds out of Chukwa ExecPlugin. (Ari Rabkin via cdouglas)
+
+ HADOOP-4889. Fix permissions in RPM packaging. (Eric Yang via cdouglas)
+
+ HADOOP-4869. Fixes the TT-JT heartbeat to have an explicit flag for
+ restart apart from the initialContact flag that there was earlier.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4716. Fixes ReduceTask.java to clear out the mapping between
+ hosts and MapOutputLocation upon a JT restart (Amar Kamat via ddas)
+
+ HADOOP-4880. Removes an unnecessary testcase from TestJobTrackerRestart.
+ (Amar Kamat via ddas)
+
+ HADOOP-4924. Fixes a race condition in TaskTracker re-init. (ddas)
+
+ HADOOP-4854. Read reclaim capacity interval from capacity scheduler
+ configuration. (Sreekanth Ramakrishnan via yhemanth)
+
+ HADOOP-4896. HDFS Fsck does not load HDFS configuration. (Raghu Angadi)
+
+ HADOOP-4956. Creates TaskStatus for failed tasks with an empty Counters
+ object instead of null. (ddas)
+
+ HADOOP-4979. Fix capacity scheduler to block cluster for failed high
+ RAM requirements across task types. (Vivek Ratan via yhemanth)
+
+ HADOOP-4949. Fix native compilation. (Chris Douglas via acmurthy)
+
+ HADOOP-4787. Fixes the testcase TestTrackerBlacklistAcrossJobs which was
+ earlier failing randomly. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4914. Add description fields to Chukwa init.d scripts (Eric Yang via
+ cdouglas)
+
+ HADOOP-4884. Make tool tip date format match standard HICC format. (Eric
+ Yang via cdouglas)
+
+ HADOOP-4925. Make Chukwa sender properties configurable. (Ari Rabkin via
+ cdouglas)
+
+ HADOOP-4947. Make Chukwa command parsing more forgiving of whitespace. (Ari
+ Rabkin via cdouglas)
+
+ HADOOP-5026. Make chukwa/bin scripts executable in repository. (Andy
+ Konwinski via cdouglas)
+
+ HADOOP-4977. Fix a deadlock between the reclaimCapacity and assignTasks
+ in capacity scheduler. (Vivek Ratan via yhemanth)
+
+ HADOOP-4988. Fix reclaim capacity to work even when there are queues with
+ no capacity. (Vivek Ratan via yhemanth)
+
+ HADOOP-5065. Remove generic parameters from argument to
+ setIn/OutputFormatClass so that it works with SequenceIn/OutputFormat.
+ (cdouglas via omalley)
+
+ HADOOP-4818. Pass user config to instrumentation API. (Eric Yang via
+ cdouglas)
+
+ HADOOP-4993. Fix Chukwa agent configuration and startup to make it both
+ more modular and testable. (Ari Rabkin via cdouglas)
+
+ HADOOP-5048. Fix capacity scheduler to correctly cleanup jobs that are
+ killed after initialization, but before running.
+ (Sreekanth Ramakrishnan via yhemanth)
+
+ HADOOP-4671. Mark loop control variables shared between threads as
+ volatile. (cdouglas)
+
+ HADOOP-5079. HashFunction inadvertently destroys some randomness
+ (Jonathan Ellis via stack)
+
+ HADOOP-4999. A failure to write to FsEditsLog results in
+ IndexOutOfBounds exception. (Boris Shkolnik via rangadi)
+
+ HADOOP-5139. Catch IllegalArgumentException during metrics registration
+ in RPC. (Hairong Kuang via szetszwo)
+
+ HADOOP-5085. Copying a file to local with Crc throws an exception.
+ (hairong)
+
+ HADOOP-4759. Removes temporary output directory for failed and
+ killed tasks by launching special CLEANUP tasks for the same.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-5211. Fix check for job completion in TestSetupAndCleanupFailure.
+ (enis)
+
+ HADOOP-5254. The Configuration class should be able to work with XML
+ parsers that do not support xmlinclude. (Steve Loughran via dhruba)
+
+ HADOOP-4692. Namenode in infinite loop for replicating/deleting corrupt
+ blocks. (hairong)
+
+ HADOOP-5255. Fix use of Math.abs to avoid overflow. (Jonathan Ellis via
+ cdouglas)
+
+ HADOOP-5269. Fixes a problem to do with tasktracker holding on to
+ FAILED_UNCLEAN or KILLED_UNCLEAN tasks forever. (Amareshwari Sriramadasu
+ via ddas)
+
+ HADOOP-5214. Fixes a ConcurrentModificationException while the Fairshare
+ Scheduler accesses the tasktrackers stored by the JobTracker.
+ (Rahul Kumar Singh via yhemanth)
+
+ HADOOP-5233. Addresses the three issues - Race condition in updating
+ status, NPE in TaskTracker task localization when the conf file is missing
+ (HADOOP-5234) and NPE in handling KillTaskAction of a cleanup task
+ (HADOOP-5235). (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-5247. Introduces a broadcast of KillJobAction to all trackers when
+ a job finishes. This fixes a bunch of problems to do with NPE when a
+ completed job is not in memory and a tasktracker comes to the jobtracker
+ with a status report of a task belonging to that job. (Amar Kamat via ddas)
+
+ HADOOP-5282. Fixed job history logs for task attempts that are
+ failed by the JobTracker, say due to lost task trackers. (Amar
+ Kamat via yhemanth)
+
+ HADOOP-4963. Fixes a logging to do with getting the location of
+ map output file. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-5292. Fix NPE in KFS::getBlockLocations. (Sriram Rao via lohit)
+
+ HADOOP-5241. Fixes a bug in disk-space resource estimation. Makes
+ the estimation formula linear where blowUp =
+ Total-Output/Total-Input. (Sharad Agarwal via ddas)
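To make the formula concrete with invented numbers: if a job's completed tasks have read 10 GB and written 25 GB in total, blowUp = 25/10 = 2.5, so a task over a 400 MB split would be estimated to need about 2.5 * 400 MB = 1 GB of disk for its output.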
+
+ HADOOP-5142. Fix MapWritable#putAll to store key/value classes.
+ (Doğacan Güney via enis)
+
+ HADOOP-4744. Workaround for jetty6 returning -1 when getLocalPort
+ is invoked on the connector. The workaround patch retries a few
+ times before failing. (Jothi Padmanabhan via yhemanth)
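The retry idea reduces to a small polling loop, sketched hypothetically below (PortProbe and waitForPort are invented names; Connector.getLocalPort() is the jetty6 call the entry refers to):

    import java.io.IOException;
    import org.mortbay.jetty.Connector;

    public class PortProbe {
        // Poll getLocalPort() a few times before failing, since jetty6
        // can briefly report -1 for a connector that is still binding.
        static int waitForPort(Connector connector) throws IOException {
            for (int attempt = 0; attempt < 5; attempt++) {
                int port = connector.getLocalPort();
                if (port > 0) {
                    return port;
                }
                try {
                    Thread.sleep(100);   // short pause between attempts
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
            throw new IOException("connector never reported a usable port");
        }
    }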
+
+ HADOOP-5280. Adds a check to prevent a task state transition from
+ FAILED to any of UNASSIGNED, RUNNING, COMMIT_PENDING or
+ SUCCEEDED. (ddas)
+
+ HADOOP-5272. Fixes a problem to do with detecting whether an
+ attempt is the first attempt of a Task. This affects JobTracker
+ restart. (Amar Kamat via ddas)
+
+ HADOOP-5306. Fixes a problem to do with logging/parsing the http port of a
+ lost tracker. Affects JobTracker restart. (Amar Kamat via ddas)
+
+ HADOOP-5111. Fix Job::set* methods to work with generics. (cdouglas)
+
+ HADOOP-5274. Fix gridmix2 dependency on wordcount example. (cdouglas)
+
+ HADOOP-5145. Balancer sometimes runs out of memory after running
+ days or weeks. (hairong)
+
+ HADOOP-5338. Fix jobtracker restart to clear task completion
+ events cached by tasktrackers forcing them to fetch all events
+ afresh, thus avoiding missed task completion events on the
+ tasktrackers. (Amar Kamat via yhemanth)
+
+ HADOOP-4695. Change TestGlobalFilter so that it allows a web page to be
+ filtered more than once for a single access. (Kan Zhang via szetszwo)
+
+ HADOOP-5298. Change TestServletFilter so that it allows a web page to be
+ filtered more than once for a single access. (szetszwo)
+
+ HADOOP-5432. Disable ssl during unit tests in hdfsproxy, as it is unused
+ and causes failures. (cdouglas)
+
+ HADOOP-5416. Correct the shell command "fs -test" forrest doc description.
+ (Ravi Phulari via szetszwo)
+
+ HADOOP-5327. Fixed job tracker to remove files from system directory on
+ ACL check failures and also check ACLs on restart.
+ (Amar Kamat via yhemanth)
+
+ HADOOP-5395. Change the exception message when a job is submitted to an
+ invalid queue. (Rahul Kumar Singh via yhemanth)
+
+ HADOOP-5276. Fixes a problem to do with updating the start time of
+ a task when the tracker that ran the task is lost. (Amar Kamat via
+ ddas)
+
+ HADOOP-5278. Fixes a problem to do with logging the finish time of
+ a task during recovery (after a JobTracker restart). (Amar Kamat
+ via ddas)
+
+ HADOOP-5490. Fixes a synchronization problem in the
+ EagerTaskInitializationListener class. (Jothi Padmanabhan via
+ ddas)
+
+ HADOOP-5493. The shuffle copier threads return the codecs back to
+ the pool when the shuffle completes. (Jothi Padmanabhan via ddas)
+
+ HADOOP-5505. Fix JspHelper initialization in the context of
+ MiniDFSCluster. (Raghu Angadi)
+
+ HADOOP-5414. Fixes IO exception while executing hadoop fs -touchz
+ fileName by making sure that lease renewal thread exits before dfs
+ client exits. (hairong)
+
+ HADOOP-5103. FileInputFormat now reuses the clusterMap network
+ topology object and that brings down the log messages in the
+ JobClient to do with NetworkTopology.add significantly. (Jothi
+ Padmanabhan via ddas)
+
+ HADOOP-5483. Fixes a problem in the Directory Cleanup Thread due to which
+ TestMiniMRWithDFS sometimes used to fail. (ddas)
+
+ HADOOP-5281. Prevent sharing incompatible ZlibCompressor instances between
+ GzipCodec and DefaultCodec. (cdouglas)
+
+ HADOOP-5463. Balancer throws "Not a host:port pair" unless port is
+ specified in fs.default.name. (Stuart White via hairong)
+
+ HADOOP-5514. Fix JobTracker metrics and add metrics for waiting, failed
+ tasks. (cdouglas)
+
+ HADOOP-5516. Fix NullPointerException in TaskMemoryManagerThread
+ that comes when monitored processes disappear when the thread is
+ running. (Vinod Kumar Vavilapalli via yhemanth)
+
+ HADOOP-5382. Support combiners in the new context object API. (omalley)
+
+ HADOOP-5471. Fixes a problem to do with updating the log.index file in the
+ case where a cleanup task is run. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-5534. Fixed a deadlock in Fair scheduler's servlet.
+ (Rahul Kumar Singh via yhemanth)
+
+ HADOOP-5328. Fixes a problem in the renaming of job history files during
+ job recovery. (Amar Kamat via ddas)
+
+ HADOOP-5417. Don't ignore InterruptedExceptions that happen when calling
+ into rpc. (omalley)
+
+ HADOOP-5320. Add a close() in TestMapReduceLocal. (Jothi Padmanabhan
+ via szetszwo)
+
+ HADOOP-5520. Fix a typo in disk quota help message. (Ravi Phulari
+ via szetszwo)
+
+ HADOOP-5519. Remove claims from mapred-default.xml that prime numbers
+ of tasks are helpful. (Owen O'Malley via szetszwo)
+
+ HADOOP-5484. TestRecoveryManager fails with FileAlreadyExistsException.
+ (Amar Kamat via hairong)
+
+ HADOOP-5564. Limit the JVM heap size in the java command for initializing
+ JAVA_PLATFORM. (Suresh Srinivas via szetszwo)
+
+ HADOOP-5565. Add API for failing/finalized jobs to the JT metrics
+ instrumentation. (Jerome Boulon via cdouglas)
+
+ HADOOP-5390. Remove duplicate jars from tarball, src from binary tarball
+ added by hdfsproxy. (Zhiyong Zhang via cdouglas)
+
+ HADOOP-5066. Building binary tarball should not build docs/javadocs, copy
+ src, or run jdiff. (Giridharan Kesavan via cdouglas)
+
+ HADOOP-5459. Fix undetected CRC errors where intermediate output is closed
+ before it has been completely consumed. (cdouglas)
+
+ HADOOP-5571. Remove widening primitive conversion in TupleWritable mask
+ manipulation. (Jingkei Ly via cdouglas)
+
+ HADOOP-5588. Remove an unnecessary call to listStatus(..) in
+ FileSystem.globStatusInternal(..). (Hairong Kuang via szetszwo)
+
+ HADOOP-5473. Solves a race condition in killing a task - the state is KILLED
+ if there is a user request pending to kill the task and the TT reported
+ the state as SUCCESS. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-5576. Fix LocalRunner to work with the new context object API in
+ mapreduce. (Tom White via omalley)
+
+ HADOOP-4374. Installs a shutdown hook in the Task JVM so that log.index is
+ updated before the JVM exits. Also makes the update to log.index atomic.
+ (Ravi Gummadi via ddas)
+
+ HADOOP-5577. Add a verbose flag to mapreduce.Job.waitForCompletion to get
+ the running job's information printed to the user's stdout as it runs.
+ (omalley)
+
+ HADOOP-5607. Fix NPE in TestCapacityScheduler. (cdouglas)
+
+ HADOOP-5605. All the replicas incorrectly got marked as corrupt. (hairong)
+
+ HADOOP-5337. JobTracker, upon restart, now waits for the TaskTrackers to
+ join back before scheduling new tasks. This fixes race conditions associated
+ with greedy scheduling as was the case earlier. (Amar Kamat via ddas)
+
+ HADOOP-5227. Fix distcp so -update and -delete can be meaningfully
+ combined. (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-5305. Increase number of files and print debug messages in
+ TestCopyFiles. (szetszwo)
+
+ HADOOP-5548. Add synchronization for JobTracker methods in RecoveryManager.
+ (Amareshwari Sriramadasu via sharad)
+
+ HADOOP-3810. NameNode seems unstable on a cluster with little space left.
+ (hairong)
+
+ HADOOP-5068. Fix NPE in TestCapacityScheduler. (Vinod Kumar Vavilapalli
+ via szetszwo)
+
+ HADOOP-5585. Clear FileSystem statistics between tasks when jvm-reuse
+ is enabled. (omalley)
+
+ HADOOP-5394. JobTracker might schedule 2 attempts of the same task
+ with the same attempt id across restarts. (Amar Kamat via sharad)
+
+ HADOOP-5645. After HADOOP-4920 we need a place to check in
+ releasenotes.html. (nigel)
+
+Release 0.19.2 - Unreleased
+
+ BUG FIXES
+
+ HADOOP-5154. Fixes a deadlock in the fairshare scheduler.
+ (Matei Zaharia via yhemanth)
+
+ HADOOP-5146. Fixes a race condition that causes LocalDirAllocator to miss
+ files. (Devaraj Das via yhemanth)
+
+ HADOOP-4638. Fixes job recovery to not crash the job tracker for problems
+ with a single job file. (Amar Kamat via yhemanth)
+
+ HADOOP-5384. Fix a problem that DataNodeCluster creates blocks with
+ generationStamp == 1. (szetszwo)
+
+ HADOOP-5376. Fixes the code handling lost tasktrackers to set the task state
+ to KILLED_UNCLEAN only for the relevant type of tasks.
+ (Amareshwari Sriramadasu via yhemanth)
+
+ HADOOP-5285. Fixes three issues: (1) obtainTaskCleanupTask checks whether
+ the job is inited before trying to lock the JobInProgress; (2) moves the
+ CleanupQueue class outside the TaskTracker and makes it a generic class,
+ also used by the JobTracker for deleting paths on the job's output fs;
+ (3) moves the references to completedJobStore outside the block where the
+ JobTracker is locked. (ddas)
+
+ HADOOP-5392. Fixes a problem to do with JT crashing during recovery when
+ the job files are garbled. (Amar Kamat via ddas)
+
+ HADOOP-5332. Appending to files is not allowed (by default) unless
+ dfs.support.append is set to true. (dhruba)
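+
+    For illustration only, a sketch of enabling the flag in client code
+    before appending (the path is a placeholder):
+
+      Configuration conf = new Configuration();
+      conf.setBoolean("dfs.support.append", true); // off by default
+      FileSystem fs = FileSystem.get(conf);
+      FSDataOutputStream out = fs.append(new Path("/logs/app.log"));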
+
+ HADOOP-5333. libhdfs supports appending to files. (dhruba)
+
+ HADOOP-3998. Fix dfsclient exception when JVM is shutdown. (dhruba)
+
+ HADOOP-5440. Fixes a problem to do with removing a taskId from the list
+ of taskIds that the TaskTracker's TaskMemoryManager manages.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-5446. Restore TaskTracker metrics. (cdouglas)
+
+ HADOOP-5449. Fixes the history cleaner thread.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-5479. NameNode should not send empty block replication request to
+ DataNode. (hairong)
+
+ HADOOP-5259. Job with output hdfs:/user//outputpath (no
+ authority) fails with Wrong FS. (Doug Cutting via hairong)
+
+ HADOOP-5522. Documents the setup/cleanup tasks in the mapred tutorial.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-5549. ReplicationMonitor should schedule both replication and
+ deletion work in one iteration. (hairong)
+
+ HADOOP-5554. DataNodeCluster and CreateEditsLog should create blocks with
+ the same generation stamp value. (hairong via szetszwo)
+
+ HADOOP-5231. Clones the TaskStatus before passing it to the JobInProgress.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4719. Fix documentation of 'ls' format for FsShell. (Ravi Phulari
+ via cdouglas)
+
+ HADOOP-5374. Fixes an NPE problem in the getTasksToSave method.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4780. Cache the size of directories in DistributedCache, avoiding
+ long delays in recalculating it. (He Yongqiang via cdouglas)
+
+ HADOOP-5551. Prevent directory destruction on file create.
+ (Brian Bockelman via shv)
+
+Release 0.19.1 - 2009-02-23
+
+ IMPROVEMENTS
+
+ HADOOP-4739. Fix spelling and grammar, improve phrasing of some sections in
+ mapred tutorial. (Vivek Ratan via cdouglas)
+
+ HADOOP-3894. DFSClient logging improvements. (Steve Loughran via shv)
+
+ HADOOP-5126. Remove empty file BlocksWithLocations.java (shv)
+
+ HADOOP-5127. Remove public methods in FSDirectory. (Jakob Homan via shv)
+
+ BUG FIXES
+
+ HADOOP-4697. Fix getBlockLocations in KosmosFileSystem to handle multiple
+ blocks correctly. (Sriram Rao via cdouglas)
+
+ HADOOP-4420. Add null checks for job, caused by invalid job IDs.
+ (Aaron Kimball via tomwhite)
+
+ HADOOP-4632. Fix TestJobHistoryVersion to use test.build.dir instead of the
+ current working directory for scratch space. (Amar Kamat via cdouglas)
+
+ HADOOP-4508. Fix FSDataOutputStream.getPos() for append. (dhruba via
+ szetszwo)
+
+ HADOOP-4727. Fix a group checking bug in fill_stat_structure(...) in
+ fuse-dfs. (Brian Bockelman via szetszwo)
+
+ HADOOP-4836. Correct typos in mapred related documentation. (Jordà Polo
+ via szetszwo)
+
+ HADOOP-4821. Usage description in the Quotas guide documentation is
+ incorrect. (Boris Shkolnik via hairong)
+
+ HADOOP-4847. Moves the loading of OutputCommitter to the Task.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4966. Marks completed setup tasks for removal.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4982. TestFsck should run in Eclipse. (shv)
+
+ HADOOP-5008. TestReplication#testPendingReplicationRetry leaves an opened
+ fd unclosed. (hairong)
+
+ HADOOP-4906. Fix TaskTracker OOM by keeping a shallow copy of JobConf in
+ TaskTracker.TaskInProgress. (Sharad Agarwal via acmurthy)
+
+ HADOOP-4918. Fix bzip2 compression to work with Sequence Files.
+ (Zheng Shao via dhruba).
+
+ HADOOP-4965. TestFileAppend3 should close FileSystem. (shv)
+
+ HADOOP-4967. Fixes a race condition in the JvmManager to do with killing
+ tasks. (ddas)
+
+ HADOOP-5009. DataNode#shutdown sometimes leaves data block scanner
+ verification log unclosed. (hairong)
+
+ HADOOP-5086. Use the appropriate FileSystem for trash URIs. (cdouglas)
+
+ HADOOP-4955. Make DBOutputFormat use column names from setOutput().
+ (Kevin Peterson via enis)
+
+ HADOOP-4862. Minor: HADOOP-3678 did not remove all the cases of
+ spurious IOExceptions logged by DataNode. (Raghu Angadi)
+
+ HADOOP-5034. NameNode should send both replication and deletion requests
+ to DataNode in one reply to a heartbeat. (hairong)
+
+ HADOOP-5156. TestHeartbeatHandling uses MiniDFSCluster.getNamesystem()
+ which does not exist in branch 0.19 and 0.20. (hairong)
+
+ HADOOP-5161. Accepted sockets do not get placed in
+ DataXceiverServer#childSockets. (hairong)
+
+ HADOOP-5193. Correct calculation of edits modification time. (shv)
+
+ HADOOP-4494. Allow libhdfs to append to files.
+ (Pete Wyckoff via dhruba)
+
+ HADOOP-5166. Fix JobTracker restart to work when ACLs are configured
+ for the JobTracker. (Amar Kamat via yhemanth).
+
+ HADOOP-5067. Fixes TaskInProgress.java to keep track of count of failed and
+ killed tasks correctly. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4760. HDFS streams should not throw exceptions when closed twice.
+ (enis)
+
+Release 0.19.0 - 2008-11-18
+
+ INCOMPATIBLE CHANGES
+
+ HADOOP-3595. Remove deprecated methods for mapred.combine.once
+ functionality, which were necessary to provide backwards-compatible
+ combiner semantics for 0.18. (cdouglas via omalley)
+
+ HADOOP-3667. Remove the following deprecated methods from JobConf
+ (typical replacements are sketched after the list):
+ addInputPath(Path)
+ getInputPaths()
+ getMapOutputCompressionType()
+ getOutputPath()
+ getSystemDir()
+ setInputPath(Path)
+ setMapOutputCompressionType(CompressionType style)
+ setOutputPath(Path)
+ (Amareshwari Sriramadasu via omalley)
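+
+    For illustration only, the usual replacements via the static helpers
+    on FileInputFormat/FileOutputFormat (paths are placeholders):
+
+      JobConf conf = new JobConf();
+      FileInputFormat.addInputPath(conf, new Path("/data/in"));
+      FileOutputFormat.setOutputPath(conf, new Path("/data/out"));
+      Path[] inputs = FileInputFormat.getInputPaths(conf);
+      Path output = FileOutputFormat.getOutputPath(conf);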
+
+ HADOOP-3652. Remove deprecated class OutputFormatBase.
+ (Amareshwari Sriramadasu via cdouglas)
+
+ HADOOP-2885. Break the hadoop.dfs package into separate packages under
+ hadoop.hdfs that reflect whether they are client, server, protocol,
+ etc. DistributedFileSystem and DFSClient have moved and are now
+ considered package private. (Sanjay Radia via omalley)
+
+ HADOOP-2325. Require Java 6. (cutting)
+
+ HADOOP-372. Add support for multiple input paths with a different
+ InputFormat and Mapper for each path. (Chris Smith via tomwhite)
+
+ HADOOP-1700. Support appending to file in HDFS. (dhruba)
+
+ HADOOP-3792. Make FsShell -test consistent with unix semantics, returning
+ zero for true and non-zero for false. (Ben Slusky via cdouglas)
+
+ HADOOP-3664. Remove the deprecated method InputFormat.validateInput,
+ which is no longer needed. (tomwhite via omalley)
+
+ HADOOP-3549. Give more meaningful errno's in libhdfs. In particular,
+ EACCES is returned for permission problems. (Ben Slusky via omalley)
+
+ HADOOP-4036. ResourceStatus was added to TaskTrackerStatus by HADOOP-3759,
+ so increment the InterTrackerProtocol version. (Hemanth Yamijala via
+ omalley)
+
+ HADOOP-3150. Moves task promotion to tasks. Defines a new interface for
+ committing output files. Moves job setup to jobclient, and moves jobcleanup
+ to a separate task. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3446. Keep map outputs in memory during the reduce. Remove
+ fs.inmemory.size.mb and replace with properties defining in memory map
+ output retention during the shuffle and reduce relative to maximum heap
+ usage. (cdouglas)
+
+ HADOOP-3245. Adds the feature for supporting JobTracker restart. Running
+ jobs can be recovered from the history file. The history file format has
+ been modified to support recovery. The task attempt ID now has the
+ JobTracker start time to distinguish attempts of the same TIP across
+ restarts. (Amar Ramesh Kamat via ddas)
+
+ HADOOP-4007. Remove DFSFileInfo - FileStatus is sufficient.
+ (Sanjay Radia via hairong)
+
+ HADOOP-3722. Fixed Hadoop Streaming and Hadoop Pipes to use the Tool
+ interface and GenericOptionsParser. (Enis Soztutar via acmurthy)
+
+ HADOOP-2816. Cluster summary at name node web reports the space
+ utilization as:
+ Configured Capacity: capacity of all the data directories - Reserved space
+ Present Capacity: Space available for dfs, i.e. remaining + used space
+ DFS Used%: DFS used space/Present Capacity
+ (Suresh Srinivas via hairong)
+
+ HADOOP-3938. Disk space quotas for HDFS. This is similar to namespace
+ quotas in 0.18. (rangadi)
+
+ HADOOP-4293. Make Configuration Writable and remove unreleased
+ WritableJobConf. Configuration.write is renamed to writeXml. (omalley)
+
+ HADOOP-4281. Change dfsadmin to report available disk space in a format
+ consistent with the web interface as defined in HADOOP-2816. (Suresh
+ Srinivas via cdouglas)
+
+ HADOOP-4430. Further change the cluster summary at name node web that was
+ changed in HADOOP-2816:
+ Non DFS Used - the disk space taken by non-DFS files out of the
+ Configured Capacity
+ DFS Used % - DFS Used as a % of Configured Capacity
+ DFS Remaining % - remaining % of Configured Capacity available for DFS use
+ DFS command line report reflects the same change. Config parameter
+ dfs.datanode.du.pct is no longer used and is removed from the
+ hadoop-default.xml. (Suresh Srinivas via hairong)
+
+ HADOOP-4116. Balancer should provide better resource management. (hairong)
+
+ HADOOP-4599. BlocksMap and BlockInfo made package private. (shv)
+
+ NEW FEATURES
+
+ HADOOP-3341. Allow streaming jobs to specify the field separator for map
+ and reduce input and output (a sketch follows this list). The new
+ configuration values are:
+ stream.map.input.field.separator
+ stream.map.output.field.separator
+ stream.reduce.input.field.separator
+ stream.reduce.output.field.separator
+ All of them default to "\t". (Zheng Shao via omalley)
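+
+    For illustration only, a sketch of setting two of these from a
+    driver; on the streaming command line they would normally be passed
+    as -jobconf key=value options:
+
+      JobConf conf = new JobConf();
+      conf.set("stream.map.output.field.separator", "|");
+      conf.set("stream.reduce.input.field.separator", "|");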
+
+ HADOOP-3479. Defines the configuration file for the resource manager in
+ Hadoop. You can configure various parameters related to scheduling, such
+ as queues and queue properties here. The properties for a queue follow a
+ naming convention, such as hadoop.rm.queue.queue-name.property-name.
+ (Hemanth Yamijala via ddas)
+
+ HADOOP-3149. Adds a way in which map/reduce tasks can create multiple
+ outputs. (Alejandro Abdelnur via ddas)
+
+ HADOOP-3714. Add a new contrib, bash-tab-completion, which enables
+ bash tab completion for the bin/hadoop script. See the README file
+ in the contrib directory for the installation. (Chris Smith via enis)
+
+ HADOOP-3730. Adds a new JobConf constructor that disables loading
+ default configurations. (Alejandro Abdelnur via ddas)
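+
+    For illustration only, a minimal sketch:
+
+      // true (the default) loads hadoop-default.xml and hadoop-site.xml;
+      // false starts from an empty configuration.
+      JobConf conf = new JobConf(false);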
+
+ HADOOP-3772. Add a new Hadoop Instrumentation api for the JobTracker and
+ the TaskTracker, refactor Hadoop Metrics as an implementation of the api.
+ (Ari Rabkin via acmurthy)
+
+ HADOOP-2302. Provides a comparator for numerical sorting of key fields.
+ (ddas)
+
+ HADOOP-153. Provides a way to skip bad records. (Sharad Agarwal via ddas)
+
+ HADOOP-657. Free disk space should be modelled and used by the scheduler
+ to make scheduling decisions. (Ari Rabkin via omalley)
+
+ HADOOP-3719. Initial checkin of Chukwa, which is a data collection and
+ analysis framework. (Jerome Boulon, Andy Konwinski, Ari Rabkin,
+ and Eric Yang)
+
+ HADOOP-3873. Add -filelimit and -sizelimit options to distcp to cap the
+ number of files/bytes copied in a particular run to support incremental
+ updates and mirroring. (TszWo (Nicholas), SZE via cdouglas)
+
+ HADOOP-3585. FailMon package for hardware failure monitoring and
+ analysis of anomalies. (Ioannis Koltsidas via dhruba)
+
+ HADOOP-1480. Add counters to the C++ Pipes API. (acmurthy via omalley)
+
+ HADOOP-3854. Add support for pluggable servlet filters in the HttpServers.
+ (Tsz Wo (Nicholas) Sze via omalley)
+
+ HADOOP-3759. Provides ability to run memory intensive jobs without
+ affecting other running tasks on the nodes. (Hemanth Yamijala via ddas)
+
+ HADOOP-3746. Add a fair share scheduler. (Matei Zaharia via omalley)
+
+ HADOOP-3754. Add a thrift interface to access HDFS. (dhruba via omalley)
+
+ HADOOP-3828. Provides a way to write skipped records to DFS.
+ (Sharad Agarwal via ddas)
+
+ HADOOP-3948. Separate name-node edits and fsimage directories.
+ (Lohit Vijayarenu via shv)
+
+ HADOOP-3939. Add an option to DistCp to delete files at the destination
+ not present at the source. (Tsz Wo (Nicholas) Sze via cdouglas)
+
+ HADOOP-3601. Add a new contrib module for Hive, which is a SQL-like
+ query processing tool that uses map/reduce. (Ashish Thusoo via omalley)
+
+ HADOOP-3866. Added sort and multi-job updates in the JobTracker web ui.
+ (Craig Weisenfluh via omalley)
+
+ HADOOP-3698. Add access control to control who is allowed to submit or
+ modify jobs in the JobTracker. (Hemanth Yamijala via omalley)
+
+ HADOOP-1869. Support access times for HDFS files. (dhruba)
+
+ HADOOP-3941. Extend FileSystem API to return file-checksums.
+ (szetszwo)
+
+ HADOOP-3581. Prevents memory intensive user tasks from taking down
+ nodes. (Vinod K V via ddas)
+
+ HADOOP-3970. Provides a way to recover counters written to JobHistory.
+ (Amar Kamat via ddas)
+
+ HADOOP-3702. Adds ChainMapper and ChainReducer classes that allow composing
+ chains of Maps and Reduces in a single Map/Reduce job, something like
+ MAP+ / REDUCE MAP*. (Alejandro Abdelnur via ddas)
+
+ HADOOP-3445. Add capacity scheduler that provides guaranteed capacities to
+ queues as a percentage of the cluster. (Vivek Ratan via omalley)
+
+ HADOOP-3992. Add a synthetic load generation facility to the test
+ directory. (hairong via szetszwo)
+
+ HADOOP-3981. Implement a distributed file checksum algorithm in HDFS
+ and change DistCp to use file checksum for comparing src and dst files.
+ (szetszwo)
+
+ HADOOP-3829. Narrow down skipped records based on a user-acceptable value.
+ (Sharad Agarwal via ddas)
+
+ HADOOP-3930. Add common interfaces for the pluggable schedulers and the
+ cli & gui clients. (Sreekanth Ramakrishnan via omalley)
+
+ HADOOP-4176. Implement getFileChecksum(Path) in HftpFileSystem. (szetszwo)
+
+ HADOOP-249. Reuse JVMs across Map-Reduce Tasks (see the sketch below
+ this entry). Configuration changes to hadoop-default.xml:
+ add mapred.job.reuse.jvm.num.tasks
+ (Devaraj Das via acmurthy)
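+
+    For illustration only, a sketch of a job opting into unbounded reuse
+    (1, the default, disables reuse; -1 means no limit; the setter name
+    assumed here wraps the property above):
+
+      JobConf conf = new JobConf();
+      conf.setNumTasksToExecutePerJvm(-1); // mapred.job.reuse.jvm.num.tasks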
+
+ HADOOP-4070. Provide a mechanism in Hive for registering UDFs from the
+ query language. (tomwhite)
+
+ HADOOP-2536. Implement a JDBC based database input and output formats to
+ allow Map-Reduce applications to work with databases. (Fredrik Hedberg and
+ Enis Soztutar via acmurthy)
+
+ HADOOP-3019. A new library to support total order partitions.
+ (cdouglas via omalley)
+
+ HADOOP-3924. Added a 'KILLED' job status. (Subramaniam Krishnan via
+ acmurthy)
+
+ IMPROVEMENTS
+
+ HADOOP-4205. hive: metastore and ql to use the refactored SerDe library.
+ (zshao)
+
+ HADOOP-4106. libhdfs: add time, permission and user attribute support
+ (part 2). (Pete Wyckoff through zshao)
+
+ HADOOP-4104. libhdfs: add time, permission and user attribute support.
+ (Pete Wyckoff through zshao)
+
+ HADOOP-3908. libhdfs: better error message if libhdfs.so doesn't exist.
+ (Pete Wyckoff through zshao)
+
+ HADOOP-3732. Delay initialization of datanode block verification till
+ the verification thread is started. (rangadi)
+
+ HADOOP-1627. Various small improvements to 'dfsadmin -report' output.
+ (rangadi)
+
+ HADOOP-3577. Tools to inject blocks into name node and simulated
+ data nodes for testing. (Sanjay Radia via hairong)
+
+ HADOOP-2664. Add a lzop compatible codec, so that files compressed by lzop
+ may be processed by map/reduce. (cdouglas via omalley)
+
+ HADOOP-3655. Add additional ant properties to control junit. (Steve
+ Loughran via omalley)
+
+ HADOOP-3543. Update the copyright year to 2008. (cdouglas via omalley)
+
+ HADOOP-3587. Add a unit test for the contrib/data_join framework.
+ (cdouglas)
+
+ HADOOP-3402. Add terasort example program. (omalley)
+
+ HADOOP-3660. Add replication factor for injecting blocks in simulated
+ datanodes. (Sanjay Radia via cdouglas)
+
+ HADOOP-3684. Add a cloning function to the contrib/data_join framework
+ permitting users to define a more efficient method for cloning values from
+ the reduce than serialization/deserialization. (Runping Qi via cdouglas)
+
+ HADOOP-3478. Improves the handling of map output fetching. Now the
+ randomization is by the hosts (and not the map outputs themselves).
+ (Jothi Padmanabhan via ddas)
+
+ HADOOP-3617. Removed redundant checks of accounting space in MapTask and
+ makes the spill thread persistent so as to avoid creating a new one for
+ each spill. (Chris Douglas via acmurthy)
+
+ HADOOP-3412. Factor the scheduler out of the JobTracker and make
+ it pluggable. (Tom White and Brice Arnould via omalley)
+
+ HADOOP-3756. Minor. Remove unused dfs.client.buffer.dir from
+ hadoop-default.xml. (rangadi)
+
+ HADOOP-3747. Adds counter support for MultipleOutputs.
+ (Alejandro Abdelnur via ddas)
+
+ HADOOP-3169. LeaseChecker daemon should not be started in DFSClient
+ constructor. (TszWo (Nicholas), SZE via hairong)
+
+ HADOOP-3824. Move base functionality of StatusHttpServer to a core
+ package. (TszWo (Nicholas), SZE via cdouglas)
+
+ HADOOP-3646. Add a bzip2 compatible codec, so bzip compressed data
+ may be processed by map/reduce. (Abdul Qadeer via cdouglas)
+
+ HADOOP-3861. MapFile.Reader and Writer should implement Closeable.
+ (tomwhite via omalley)
+
+ HADOOP-3791. Introduce generics into ReflectionUtils. (Chris Smith via
+ cdouglas)
+
+ HADOOP-3694. Improve unit test performance by changing
+ MiniDFSCluster to listen only on 127.0.0.1. (cutting)
+
+ HADOOP-3620. Namenode should synchronously resolve a datanode's network
+ location when the datanode registers. (hairong)
+
+ HADOOP-3860. NNThroughputBenchmark is extended with rename and delete
+ benchmarks. (shv)
+
+ HADOOP-3892. Include unix group name in JobConf. (Matei Zaharia via johan)
+
+ HADOOP-3875. Change the time period between heartbeats to be relative to
+ the end of the heartbeat rpc, rather than the start. This causes better
+ behavior if the JobTracker is overloaded. (acmurthy via omalley)
+
+ HADOOP-3853. Move multiple input format (HADOOP-372) extension to
+ library package. (tomwhite via johan)
+
+ HADOOP-9. Use roulette scheduling for temporary space when the size
+ is not known. (Ari Rabkin via omalley)
+
+ HADOOP-3202. Use recursive delete rather than FileUtil.fullyDelete.
+ (Amareshwari Sriramadasu via omalley)
+
+ HADOOP-3368. Remove common-logging.properties from conf. (Steve Loughran
+ via omalley)
+
+ HADOOP-3851. Fix spelling mistake in FSNamesystemMetrics. (Steve Loughran
+ via omalley)
+
+ HADOOP-3780. Remove asynchronous resolution of network topology in the
+ JobTracker. (Amar Kamat via omalley)
+
+ HADOOP-3852. Add ShellCommandExecutor.toString method to make nicer
+ error messages. (Steve Loughran via omalley)
+
+ HADOOP-3844. Include message of local exception in RPC client failures.
+ (Steve Loughran via omalley)
+
+ HADOOP-3935. Split out inner classes from DataNode.java. (johan)
+
+ HADOOP-3905. Create generic interfaces for edit log streams. (shv)
+
+ HADOOP-3062. Add metrics to DataNode and TaskTracker to record network
+ traffic for HDFS reads/writes and MR shuffling. (cdouglas)
+
+ HADOOP-3742. Remove HDFS from public javadoc and add javadoc-dev for
+ generating javadoc for developers. (Sanjay Radia via omalley)
+
+ HADOOP-3944. Improve documentation for public TupleWritable class in
+ join package. (Chris Douglas via enis)
+
+ HADOOP-2330. Preallocate HDFS transaction log to improve performance.
+ (dhruba and hairong)
+
+ HADOOP-3965. Convert DataBlockScanner into a package private class. (shv)
+
+ HADOOP-3488. Prevent hadoop-daemon from rsync'ing log files (Stefan
+ Groschupf and Craig Macdonald via omalley)
+
+ HADOOP-3342. Change the kill task actions to require http post instead of
+ get to prevent accidental crawls from triggering it. (enis via omalley)
+
+ HADOOP-3937. Limit the job name in the job history filename to 50
+ characters. (Matei Zaharia via omalley)
+
+ HADOOP-3943. Remove unnecessary synchronization in
+ NetworkTopology.pseudoSortByDistance. (hairong via omalley)
+
+ HADOOP-3498. File globbing alternation should be able to span path
+ components. (tomwhite)
+
+ HADOOP-3361. Implement renames for NativeS3FileSystem.
+ (Albert Chern via tomwhite)
+
+ HADOOP-3605. Make EC2 scripts show an error message if AWS_ACCOUNT_ID is
+ unset. (Al Hoang via tomwhite)
+
+ HADOOP-4147. Remove unused class JobWithTaskContext from class
+ JobInProgress. (Amareshwari Sriramadasu via johan)
+
+ HADOOP-4151. Add a byte-comparable interface that both Text and
+ BytesWritable implement. (cdouglas via omalley)
+
+ HADOOP-4174. Move fs image/edit log methods from ClientProtocol to
+ NamenodeProtocol. (shv via szetszwo)
+
+ HADOOP-4181. Include a .gitignore and saveVersion.sh change to support
+ developing under git. (omalley)
+
+ HADOOP-4186. Factor LineReader out of LineRecordReader. (tomwhite via
+ omalley)
+
+ HADOOP-4184. Break the module dependencies between core, hdfs, and
+ mapred. (tomwhite via omalley)
+
+ HADOOP-4075. test-patch.sh now spits out ant commands that it runs.
+ (Ramya R via nigel)
+
+ HADOOP-4117. Improve configurability of Hadoop EC2 instances.
+ (tomwhite)
+
+ HADOOP-2411. Add support for larger CPU EC2 instance types.
+ (Chris K Wensel via tomwhite)
+
+ HADOOP-4083. Changed the configuration attribute queue.name to
+ mapred.job.queue.name. (Hemanth Yamijala via acmurthy)
+
+ HADOOP-4194. Added the JobConf and JobID to job-related methods in
+ JobTrackerInstrumentation for better metrics. (Mac Yang via acmurthy)
+
+ HADOOP-3975. Change test-patch script to report the working dir
+ modifications that prevent the suite from being run. (Ramya R via cdouglas)
+
+ HADOOP-4124. Added a command-line switch to allow users to set job
+ priorities, also allow it to be manipulated via the web-ui. (Hemanth
+ Yamijala via acmurthy)
+
+ HADOOP-2165. Augmented JobHistory to include the URIs to the tasks'
+ userlogs. (Vinod Kumar Vavilapalli via acmurthy)
+
+ HADOOP-4062. Remove the synchronization on the output stream when a
+ connection is closed and also remove an undesirable exception when
+ a client is stopped while there is no pending RPC request. (hairong)
+
+ HADOOP-4227. Remove the deprecated class org.apache.hadoop.fs.ShellCommand.
+ (szetszwo)
+
+ HADOOP-4006. Clean up FSConstants and move some of the constants to
+ better places. (Sanjay Radia via rangadi)
+
+ HADOOP-4279. Trace the seeds of random sequences in append unit tests to
+ make intermittent failures reproducible. (szetszwo via cdouglas)
+
+ HADOOP-4209. Remove the change to the format of task attempt id by
+ incrementing the task attempt numbers by 1000 when the job restarts.
+ (Amar Kamat via omalley)
+
+ HADOOP-4301. Adds forrest doc for the skip bad records feature.
+ (Sharad Agarwal via ddas)
+
+ HADOOP-4354. Separate TestDatanodeDeath.testDatanodeDeath() into 4 tests.
+ (szetszwo)
+
+ HADOOP-3790. Add more unit tests for testing HDFS file append. (szetszwo)
+
+ HADOOP-4321. Include documentation for the capacity scheduler. (Hemanth
+ Yamijala via omalley)
+
+ HADOOP-4424. Change menu layout for Hadoop documentation (Boris Shkolnik
+ via cdouglas).
+
+ HADOOP-4438. Update forrest documentation to include missing FsShell
+ commands. (Suresh Srinivas via cdouglas)
+
+ HADOOP-4105. Add forrest documentation for libhdfs.
+ (Pete Wyckoff via cutting)
+
+ HADOOP-4510. Make getTaskOutputPath public. (Chris Wensel via omalley)
+
+ OPTIMIZATIONS
+
+ HADOOP-3556. Removed lock contention in MD5Hash by replacing the
+ singleton MessageDigester with an instance per thread using
+ ThreadLocal. (Iván de Prado via omalley)
+
+ HADOOP-3328. When client is writing data to DFS, only the last
+ datanode in the pipeline needs to verify the checksum. Saves around
+ 30% CPU on intermediate datanodes. (rangadi)
+
+ HADOOP-3863. Use a thread-local string encoder rather than a static one
+ that is protected by a lock. (acmurthy via omalley)
+
+ HADOOP-3864. Prevent the JobTracker from locking up when a job is being
+ initialized. (acmurthy via omalley)
+
+ HADOOP-3816. Faster directory listing in KFS. (Sriram Rao via omalley)
+
+ HADOOP-2130. Pipes submit job should have both blocking and non-blocking
+ versions. (acmurthy via omalley)
+
+ HADOOP-3769. Make the SampleMapper and SampleReducer from
+ GenericMRLoadGenerator public, so they can be used in other contexts.
+ (Lingyun Yang via omalley)
+
+ HADOOP-3514. Inline the CRCs in intermediate files as opposed to reading
+ them from a different .crc file. (Jothi Padmanabhan via ddas)
+
+ HADOOP-3638. Caches the iFile index files in memory to reduce seeks.
+ (Jothi Padmanabhan via ddas)
+
+ HADOOP-4225. FSEditLog.logOpenFile() should persist accessTime
+ rather than modificationTime. (shv)
+
+ HADOOP-4380. Made several new classes (Child, JVMId,
+ JobTrackerInstrumentation, QueueManager, ResourceEstimator,
+ TaskTrackerInstrumentation, and TaskTrackerMetricsInst) in
+ org.apache.hadoop.mapred package private instead of public. (omalley)
+
+ BUG FIXES
+
+ HADOOP-3563. Refactor the distributed upgrade code so that it is
+ easier to identify datanode and namenode related code. (dhruba)
+
+ HADOOP-3640. Fix the read method in the NativeS3InputStream. (tomwhite via
+ omalley)
+
+ HADOOP-3711. Fixes the Streaming input parsing to properly find the
+ separator. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3725. Prevent TestMiniMRMapDebugScript from swallowing exceptions.
+ (Steve Loughran via cdouglas)
+
+ HADOOP-3726. Throw exceptions from TestCLI setup and teardown instead of
+ swallowing them. (Steve Loughran via cdouglas)
+
+ HADOOP-3721. Refactor CompositeRecordReader and related mapred.join classes
+ to make them clearer. (cdouglas)
+
+ HADOOP-3720. Re-read the config file when dfsadmin -refreshNodes is invoked
+ so dfs.hosts and dfs.hosts.exclude are observed. (lohit vijayarenu via
+ cdouglas)
+
+ HADOOP-3485. Allow writing to files over fuse.
+ (Pete Wyckoff via dhruba)
+
+ HADOOP-3723. The flags to the libhdfs.create call can be treated as
+ a bitmask. (Pete Wyckoff via dhruba)
+
+ HADOOP-3643. Filter out completed tasks when asking for running tasks in
+ the JobTracker web/ui. (Amar Kamat via omalley)
+
+ HADOOP-3777. Ensure that Lzo compressors/decompressors correctly handle the
+ case where native libraries aren't available. (Chris Douglas via acmurthy)
+
+ HADOOP-3728. Fix SleepJob so that it doesn't depend on temporary files,
+ this ensures we can now run more than one instance of SleepJob
+ simultaneously. (Chris Douglas via acmurthy)
+
+ HADOOP-3795. Fix saving image files on Namenode with different checkpoint
+ stamps. (Lohit Vijayarenu via mahadev)
+
+ HADOOP-3624. Improve CreateEditsLog to create a tree directory structure.
+ (Lohit Vijayarenu via mahadev)
+
+ HADOOP-3778. DFSInputStream.seek() did not retry in case of some errors.
+ (LN via rangadi)
+
+ HADOOP-3661. The handling of moving files deleted through fuse-dfs to
+ Trash is made similar to the behaviour of the dfs shell.
+ (Pete Wyckoff via dhruba)
+
+ HADOOP-3819. Unset LANG and LC_CTYPE in saveVersion.sh to make it
+ compatible with non-English locales. (Rong-En Fan via cdouglas)
+
+ HADOOP-3848. Cache calls to getSystemDir in the TaskTracker instead of
+ calling it for each task start. (acmurthy via omalley)
+
+ HADOOP-3131. Fix reduce progress reporting for compressed intermediate
+ data. (Matei Zaharia via acmurthy)
+
+ HADOOP-3796. fuse-dfs configuration is implemented as file system
+ mount options. (Pete Wyckoff via dhruba)
+
+ HADOOP-3836. Fix TestMultipleOutputs to correctly clean up. (Alejandro
+ Abdelnur via acmurthy)
+
+ HADOOP-3805. Improve fuse-dfs write performance.
+ (Pete Wyckoff via zshao)
+
+ HADOOP-3846. Fix unit test CreateEditsLog to generate paths correctly.
+ (Lohit Vijayarenu via cdouglas)
+
+ HADOOP-3904. Fix unit tests using the old dfs package name.
+ (TszWo (Nicholas), SZE via johan)
+
+ HADOOP-3319. Fix some HOD error messages to go to stderr instead of
+ stdout. (Vinod Kumar Vavilapalli via omalley)
+
+ HADOOP-3907. Move INodeDirectoryWithQuota to its own .java file.
+ (Tsz Wo (Nicholas), SZE via hairong)
+
+ HADOOP-3919. Fix attribute name in hadoop-default for
+ mapred.jobtracker.instrumentation. (Ari Rabkin via omalley)
+
+ HADOOP-3903. Change the package name for the servlets to be hdfs instead of
+ dfs. (Tsz Wo (Nicholas) Sze via omalley)
+
+ HADOOP-3773. Change Pipes to set the default map output key and value
+ types correctly. (Koji Noguchi via omalley)
+
+ HADOOP-3952. Fix compilation error in TestDataJoin referencing dfs package.
+ (omalley)
+
+ HADOOP-3951. Fix package name for FSNamesystem logs and modify other
+ hard-coded Logs to use the class name. (cdouglas)
+
+ HADOOP-3889. Improve error reporting from HftpFileSystem, handling in
+ DistCp. (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-3946. Fix TestMapRed after hadoop-3664. (tomwhite via omalley)
+
+ HADOOP-3949. Remove duplicate jars from Chukwa. (Jerome Boulon via omalley)
+
+ HADOOP-3933. DataNode sometimes sends up to io.bytes.per.checksum bytes
+ more than required to client. (Ning Li via rangadi)
+
+ HADOOP-3962. Shell command "fs -count" should support paths with different
+ file systems. (Tsz Wo (Nicholas), SZE via mahadev)
+
+ HADOOP-3957. Fix javac warnings in DistCp and TestCopyFiles. (Tsz Wo
+ (Nicholas), SZE via cdouglas)
+
+ HADOOP-3958. Fix TestMapRed to check the success of test-job. (omalley via
+ acmurthy)
+
+ HADOOP-3985. Fix TestHDFSServerPorts to use random ports. (Hairong Kuang
+ via omalley)
+
+ HADOOP-3964. Fix javadoc warnings introduced by FailMon. (dhruba)
+
+ HADOOP-3785. Fix FileSystem cache to be case-insensitive for scheme and
+ authority. (Bill de hOra via cdouglas)
+
+ HADOOP-3506. Fix a rare NPE caused by error handling in S3. (Tom White via
+ cdouglas)
+
+ HADOOP-3705. Fix mapred.join parser to accept InputFormats named with
+ underscore and static, inner classes. (cdouglas)
+
+ HADOOP-4023. Fix javadoc warnings introduced when the HDFS javadoc was
+ made private. (omalley)
+
+ HADOOP-4030. Remove lzop from the default list of codecs. (Arun Murthy via
+ cdouglas)
+
+ HADOOP-3961. Fix task disk space requirement estimates for virtual
+ input jobs. Delays limiting task placement until after 10% of the maps
+ have finished. (Ari Rabkin via omalley)
+
+ HADOOP-2168. Fix problem with C++ record reader's progress not being
+ reported to framework. (acmurthy via omalley)
+
+ HADOOP-3966. Copy findbugs generated output files to PATCH_DIR while
+ running test-patch. (Ramya R via lohit)
+
+ HADOOP-4037. Fix the eclipse plugin for versions of kfs and log4j. (nigel
+ via omalley)
+
+ HADOOP-3950. Cause the Mini MR cluster to wait for task trackers to
+ register before continuing. (enis via omalley)
+
+ HADOOP-3910. Remove unused ClusterTestDFSNamespaceLogging and
+ ClusterTestDFS. (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-3954. Disable record skipping by default. (Sharad Agarwal via
+ cdouglas)
+
+ HADOOP-4050. Fix TestFairScheduler to use absolute paths for the work
+ directory. (Matei Zaharia via omalley)
+
+ HADOOP-4069. Keep temporary test files from TestKosmosFileSystem under
+ test.build.data instead of /tmp. (lohit via omalley)
+
+ HADOOP-4078. Create test files for TestKosmosFileSystem in separate
+ directory under test.build.data. (lohit)
+
+ HADOOP-3968. Fix getFileBlockLocations calls to use FileStatus instead
+ of Path reflecting the new API. (Pete Wyckoff via lohit)
+
+ HADOOP-3963. libhdfs does not exit on its own, instead it returns error
+ to the caller and behaves as a true library. (Pete Wyckoff via dhruba)
+
+ HADOOP-4100. Removes the cleanupTask scheduling from the Scheduler
+ implementations and moves it to the JobTracker.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4097. Make hive work well with speculative execution turned on.
+ (Joydeep Sen Sarma via dhruba)
+
+ HADOOP-4113. Changes to libhdfs to not exit on its own, rather return
+ an error code to the caller. (Pete Wyckoff via dhruba)
+
+ HADOOP-4054. Remove duplicate lease removal during edit log loading.
+ (hairong)
+
+ HADOOP-4071. FSNameSystem.isReplicationInProgress should add an
+ underReplicated block to the neededReplication queue using method
+ "add" not "update". (hairong)
+
+ HADOOP-4154. Fix type warnings in WritableUtils. (szetszwo via omalley)
+
+ HADOOP-4133. Log files generated by Hive should reside in the
+ build directory. (Prasad Chakka via dhruba)
+
+ HADOOP-4094. Hive now has hive-default.xml and hive-site.xml similar
+ to core hadoop. (Prasad Chakka via dhruba)
+
+ HADOOP-4112. Handles cleanupTask in JobHistory.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3831. Very slow reading clients sometimes failed while reading.
+ (rangadi)
+
+ HADOOP-4155. Use JobTracker's start time while initializing JobHistory's
+ JobTracker Unique String. (lohit)
+
+ HADOOP-4099. Fix null pointer when using HFTP from an 0.18 server.
+ (dhruba via omalley)
+
+ HADOOP-3570. Includes user specified libjar files in the client side
+ classpath. (Sharad Agarwal via ddas)
+
+ HADOOP-4129. Changed memory limits of TaskTracker and Tasks to be in
+ kilobytes rather than bytes. (Vinod Kumar Vavilapalli via acmurthy)
+
+ HADOOP-4139. Optimize Hive multi group-by.
+ (Namit Jain via dhruba)
+
+ HADOOP-3911. Add a check to fsck options to make sure -files is not
+ the first option, to resolve conflicts with GenericOptionsParser.
+ (lohit)
+
+ HADOOP-3623. Refactor LeaseManager. (szetszwo)
+
+ HADOOP-4125. Handles Reduce cleanup tip on the web ui.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4087. Hive Metastore API for php and python clients.
+ (Prasad Chakka via dhruba)
+
+ HADOOP-4197. Update DATA_TRANSFER_VERSION for HADOOP-3981. (szetszwo)
+
+ HADOOP-4138. Refactor the Hive SerDe library to better structure
+ the interfaces to the serializer and de-serializer.
+ (Zheng Shao via dhruba)
+
+ HADOOP-4195. Close compressor before returning to codec pool.
+ (acmurthy via omalley)
+
+ HADOOP-2403. Escapes some special characters before logging to
+ history files. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4200. Fix a bug in the test-patch.sh script.
+ (Ramya R via nigel)
+
+ HADOOP-4084. Add explain plan capabilities to Hive Query Language.
+ (Ashish Thusoo via dhruba)
+
+ HADOOP-4121. Preserve cause for exception if the initialization of
+ HistoryViewer for JobHistory fails. (Amareshwari Sri Ramadasu via
+ acmurthy)
+
+ HADOOP-4213. Fixes NPE in TestLimitTasksPerJobTaskScheduler.
+ (Sreekanth Ramakrishnan via ddas)
+
+ HADOOP-4077. Setting access and modification time for a file
+ requires write permissions on the file. (dhruba)
+
+ HADOOP-3592. Fix a couple of possible file leaks in FileUtil.
+ (Bill de hOra via rangadi)
+
+ HADOOP-4120. Hive interactive shell records the time taken by a
+ query. (Raghotham Murthy via dhruba)
+
+ HADOOP-4090. The hive scripts pick up hadoop from HADOOP_HOME
+ and then the path. (Raghotham Murthy via dhruba)
+
+ HADOOP-4242. Remove extra ";" in FSDirectory that blocks compilation
+ in some IDE's. (szetszwo via omalley)
+
+ HADOOP-4249. Fix eclipse path to include the hsqldb.jar. (szetszwo via
+ omalley)
+
+ HADOOP-4247. Move InputSampler into org.apache.hadoop.mapred.lib, so that
+ examples.jar doesn't depend on tools.jar. (omalley)
+
+ HADOOP-4269. Fix the deprecation of LineReader by extending the new class
+ into the old name and deprecating it. Also update the tests to test the
+ new class. (cdouglas via omalley)
+
+ HADOOP-4280. Fix conversions between seconds in C and milliseconds in
+ Java for access times for files. (Pete Wyckoff via rangadi)
+
+ HADOOP-4254. -setSpaceQuota command does not convert "TB" extension to
+ terabytes properly. Implementation now uses StringUtils for parsing this.
+ (Raghu Angadi)
+
+ HADOOP-4259. Findbugs should run over tools.jar also. (cdouglas via
+ omalley)
+
+ HADOOP-4275. Move public method isJobValidName from JobID to a private
+ method in JobTracker. (omalley)
+
+ HADOOP-4173. Fix failures in TestProcfsBasedProcessTree and
+ TestTaskTrackerMemoryManager tests. ProcfsBasedProcessTree and
+ memory management in TaskTracker are disabled on Windows.
+ (Vinod K V via rangadi)
+
+ HADOOP-4189. Fixes the history blocksize & intertracker protocol version
+ issues introduced as part of HADOOP-3245. (Amar Kamat via ddas)
+
+ HADOOP-4190. Fixes the backward compatibility issue with Job History
+ introduced by HADOOP-3245 and HADOOP-2403. (Amar Kamat via ddas)
+
+ HADOOP-4237. Fixes the TestStreamingBadRecords.testNarrowDown testcase.
+ (Sharad Agarwal via ddas)
+
+ HADOOP-4274. Capacity scheduler accidentally modifies the underlying
+ data structures when browsing the job lists. (Hemanth Yamijala via omalley)
+
+ HADOOP-4309. Fix eclipse-plugin compilation. (cdouglas)
+
+ HADOOP-4232. Fix race condition in JVM reuse when multiple slots become
+ free. (ddas via acmurthy)
+
+ HADOOP-4302. Fix a race condition in TestReduceFetch that can yield false
+ negatives. (cdouglas)
+
+ HADOOP-3942. Update distcp documentation to include features introduced in
+ HADOOP-3873, HADOOP-3939. (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-4319. fuse-dfs dfs_read function returns as many bytes as it is
+ told to read unless end-of-file is reached. (Pete Wyckoff via dhruba)
+
+ HADOOP-4246. Ensure we have the correct lower bound on the number of
+ retries for fetching map-outputs; also fixed the case where the reducer
+ was automatically killed when too many unique map-outputs could not be
+ fetched for small jobs. (Amareshwari Sri Ramadasu via acmurthy)
+
+ HADOOP-4163. Report FSErrors from map output fetch threads instead of
+ merely logging them. (Sharad Agarwal via cdouglas)
+
+ HADOOP-4261. Adds a setup task for jobs. This is required so that we
+ don't setup jobs that haven't been inited yet (since init could lead
+ to job failure). Only after the init has successfully happened do we
+ launch the setupJob task. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4256. Removes Completed and Failed Job tables from
+ jobqueue_details.jsp. (Sreekanth Ramakrishnan via ddas)
+
+ HADOOP-4267. Occasional exceptions during shutting down HSQLDB are logged
+ but not rethrown. (enis)
+
+ HADOOP-4018. The number of tasks for a single job cannot exceed a
+ pre-configured maximum value. (dhruba)
+
+ HADOOP-4288. Fixes an NPE problem in CapacityScheduler.
+ (Amar Kamat via ddas)
+
+ HADOOP-4014. Create hard links with 'fsutil hardlink' on Windows. (shv)
+
+ HADOOP-4393. Merged org.apache.hadoop.fs.permission.AccessControlException
+ and org.apache.hadoop.security.AccessControlIOException into a single
+ class hadoop.security.AccessControlException. (omalley via acmurthy)
+
+ HADOOP-4287. Fixes an issue to do with maintaining counts of running/pending
+ maps/reduces. (Sreekanth Ramakrishnan via ddas)
+
+ HADOOP-4361. Makes sure that jobs killed from command line are killed
+ fast (i.e., there is a slot to run the cleanup task soon).
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4400. Add "hdfs://" to fs.default.name on quickstart.html.
+ (Jeff Hammerbacher via omalley)
+
+ HADOOP-4378. Fix TestJobQueueInformation to use SleepJob rather than
+ WordCount via TestMiniMRWithDFS. (Sreekanth Ramakrishnan via acmurthy)
+
+ HADOOP-4376. Fix formatting in hadoop-default.xml for
+ hadoop.http.filter.initializers. (Enis Soztutar via acmurthy)
+
+ HADOOP-4410. Adds an extra arg to the API FileUtil.makeShellPath to
+ determine whether to canonicalize file paths or not.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4236. Ensure un-initialized jobs are killed correctly on
+ user-demand. (Sharad Agarwal via acmurthy)
+
+ HADOOP-4373. Fix calculation of Guaranteed Capacity for the
+ capacity-scheduler. (Hemanth Yamijala via acmurthy)
+
+ HADOOP-4053. Schedulers must be notified when jobs complete. (Amar Kamat via omalley)
+
+ HADOOP-4335. Fix FsShell -ls for filesystems without owners/groups. (David
+ Phillips via cdouglas)
+
+ HADOOP-4426. TestCapacityScheduler broke due to the two commits HADOOP-4053
+ and HADOOP-4373. This patch fixes that. (Hemanth Yamijala via ddas)
+
+ HADOOP-4418. Updates documentation in forrest for Mapred, streaming and pipes.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3155. Ensure that there is only one thread fetching
+ TaskCompletionEvents on TaskTracker re-init. (Dhruba Borthakur via
+ acmurthy)
+
+ HADOOP-4425. Fix EditLogInputStream to overload the bulk read method.
+ (cdouglas)
+
+ HADOOP-4427. Adds the new queue/job commands to the manual.
+ (Sreekanth Ramakrishnan via ddas)
+
+ HADOOP-4278. Increase debug logging for unit test TestDatanodeDeath.
+ Fix the case when primary is dead. (dhruba via szetszwo)
+
+ HADOOP-4423. Keep block length when the block recovery is triggered by
+ append. (szetszwo)
+
+ HADOOP-4449. Fix dfsadmin usage. (Raghu Angadi via cdouglas)
+
+ HADOOP-4455. Added TestSerDe so that unit tests can run successfully.
+ (Ashish Thusoo via dhruba)
+
+ HADOOP-4457. Fixes an input split logging problem introduced by
+ HADOOP-3245. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-4464. Separate out TestFileCreationClient from TestFileCreation.
+ (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-4404. saveFSImage() removes files from a storage directory that do
+ not correspond to its type. (shv)
+
+ HADOOP-4149. Fix handling of updates to the job priority, by changing the
+ list of jobs to be keyed by the priority, submit time, and job tracker id.
+ (Amar Kamat via omalley)
+
+ HADOOP-4296. Fix job client failures by not retiring a job as soon as it
+ is finished. (dhruba)
+
+ HADOOP-4439. Remove configuration variables that aren't usable yet, in
+ particular mapred.tasktracker.tasks.maxmemory and mapred.task.max.memory.
+ (Hemanth Yamijala via omalley)
+
+ HADOOP-4230. Fix for serde2 interface, limit operator, select * operator,
+ UDF trim functions and sampling. (Ashish Thusoo via dhruba)
+
+ HADOOP-4358. No need to truncate access time in INode. Also fixes NPE
+ in CreateEditsLog. (Raghu Angadi)
+
+ HADOOP-4387. TestHDFSFileSystemContract fails on Windows nightly builds.
+ (Raghu Angadi)
+
+ HADOOP-4466. Ensure that SequenceFileOutputFormat isn't tied to Writables
+ and can be used with other Serialization frameworks. (Chris Wensel via
+ acmurthy)
+
+ HADOOP-4525. Fix ipc.server.ipcnodelay originally missed in HADOOP-2232.
+ (cdouglas via Clint Morgan)
+
+ HADOOP-4498. Ensure that JobHistory correctly escapes the job name so that
+ regex patterns work. (Chris Wensel via acmurthy)
+
+ HADOOP-4446. Modify guaranteed capacity labels in capacity scheduler's UI
+ to reflect the information being displayed. (Sreekanth Ramakrishnan via
+ yhemanth)
+
+ HADOOP-4282. Some user facing URLs are not filtered by user filters.
+ (szetszwo)
+
+ HADOOP-4595. Fixes two race conditions - one to do with updating free slot count,
+ and another to do with starting the MapEventsFetcher thread. (ddas)
+
+ HADOOP-4552. Fix a deadlock in RPC server. (Raghu Angadi)
+
+ HADOOP-4471. Sort running jobs by priority in the capacity scheduler.
+ (Amar Kamat via yhemanth)
+
+ HADOOP-4500. Fix MultiFileSplit to get the FileSystem from the relevant
+ path rather than the JobClient. (Joydeep Sen Sarma via cdouglas)
+
+Release 0.18.4 - Unreleased
+
+ BUG FIXES
+
+ HADOOP-5114. Remove timeout for accept() in DataNode. This makes accept()
+ fail in JDK on Windows and causes many tests to fail. (Raghu Angadi)
+
+ HADOOP-5192. Block receiver should not remove a block that's created or
+ being written by other threads. (hairong)
+
+ HADOOP-5134. FSNamesystem#commitBlockSynchronization adds under-construction
+ block locations to blocksMap. (Dhruba Borthakur via hairong)
+
+ HADOOP-5412. Simulated DataNode should not write to a block that's being
+ written by another thread. (hairong)
+
+ HADOOP-5465. Fix the problem of blocks remaining under-replicated by
+ providing synchronized modification to the counter xmitsInProgress in
+ DataNode. (hairong)
+
+ HADOOP-5557. Fixes some minor problems in TestOverReplicatedBlocks.
+ (szetszwo)
+
+Release 0.18.3 - 2009-01-27
+
+ IMPROVEMENTS
+
+ HADOOP-4150. Include librecordio in hadoop releases. (Giridharan Kesavan
+ via acmurthy)
+
+ HADOOP-4668. Improve documentation for setCombinerClass to clarify the
+ restrictions on combiners. (omalley)
+
+ BUG FIXES
+
+ HADOOP-4499. DFSClient should invoke checksumOk only once. (Raghu Angadi)
+
+ HADOOP-4597. Calculate mis-replicated blocks when safe-mode is turned
+ off manually. (shv)
+
+ HADOOP-3121. lsr should keep listing the remaining items rather than
+ terminate if there is an IOException. (szetszwo)
+
+ HADOOP-4610. Always calculate mis-replicated blocks when safe-mode is
+ turned off. (shv)
+
+ HADOOP-3883. Limit namenode to assign at most one generation stamp for
+ a particular block within a short period. (szetszwo)
+
+ HADOOP-4556. Block went missing. (hairong)
+
+ HADOOP-4643. NameNode should exclude excessive replicas when counting
+ live replicas for a block. (hairong)
+
+ HADOOP-4703. Should not wait for proxy forever in lease recovering.
+ (szetszwo)
+
+ HADOOP-4647. NamenodeFsck should close the DFSClient it has created.
+ (szetszwo)
+
+ HADOOP-4616. Fuse-dfs can handle bad values from FileSystem.read call.
+ (Pete Wyckoff via dhruba)
+
+ HADOOP-4061. Throttle Datanode decommission monitoring in Namenode.
+ (szetszwo)
+
+ HADOOP-4659. Root cause of connection failure is being lost to code that
+ uses it for delaying startup. (Steve Loughran and Hairong via hairong)
+
+ HADOOP-4614. Lazily open segments when merging map spills to avoid using
+ too many file descriptors. (Yuri Pradkin via cdouglas)
+
+ HADOOP-4257. The DFS client should pick only one datanode as the candidate
+ to initiate lease recovery. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-4713. Fix librecordio to handle records larger than 64k. (Christian
+ Kunz via cdouglas)
+
+ HADOOP-4635. Fix a memory leak in fuse dfs. (pete wyckoff via mahadev)
+
+ HADOOP-4714. Report status between merges and make the number of records
+ between progress reports configurable. (Jothi Padmanabhan via cdouglas)
+
+ HADOOP-4726. Fix documentation typos "the the". (Edward J. Yoon via
+ szetszwo)
+
+ HADOOP-4679. Datanode prints tons of log messages: waiting for threadgroup
+ to exit, active threads is XX. (hairong)
+
+ HADOOP-4746. Job output directory should be normalized. (hairong)
+
+ HADOOP-4717. Removal of default port# in NameNode.getUri() causes a
+ map/reduce job to fail to promote temporary output. (hairong)
+
+ HADOOP-4778. Check for zero size block meta file when updating a block.
+ (szetszwo)
+
+ HADOOP-4742. Replica gets deleted by mistake. (Wang Xu via hairong)
+
+ HADOOP-4702. Failed block replication leaves an incomplete block in
+ receiver's tmp data directory. (hairong)
+
+ HADOOP-4613. Fix block browsing on Web UI. (Johan Oskarsson via shv)
+
+ HADOOP-4806. HDFS rename should not use src path as a regular expression.
+ (szetszwo)
+
+ HADOOP-4795. Prevent lease monitor getting into an infinite loop when
+ leases and the namespace tree do not match. (szetszwo)
+
+ HADOOP-4620. Fixes Streaming to properly handle the cases of map/reduce
+ with empty input/output. (Ravi Gummadi via ddas)
+
+ HADOOP-4857. Fixes TestUlimit to have exactly 1 map in the jobs spawned.
+ (Ravi Gummadi via ddas)
+
+ HADOOP-4810. Data lost at cluster startup time. (hairong)
+
+ HADOOP-4797. Improve how RPC server reads and writes large buffers. Avoids
+ soft-leak of direct buffers and excess copies in NIO layer. (Raghu Angadi)
+
+ HADOOP-4840. TestNodeCount sometimes fails with NullPointerException.
+ (hairong)
+
+ HADOOP-4904. Fix deadlock while leaving safe mode. (shv)
+
+ HADOOP-1980. 'dfsadmin -safemode enter' should prevent the namenode from
+ leaving safemode automatically. (shv)
+
+ HADOOP-4951. Lease monitor should acquire the LeaseManager lock but not the
+ Monitor lock. (szetszwo)
+
+ HADOOP-4935. processMisReplicatedBlocks() should not clear
+ excessReplicateMap. (shv)
+
+ HADOOP-4961. Fix ConcurrentModificationException in lease recovery
+ of empty files. (shv)
+
+ HADOOP-4971. A long (unexpected) delay at datanodes could cause many
+ datanodes to send block reports at the same time. (Raghu Angadi)
+
+ HADOOP-4910. NameNode should exclude replicas when choosing excessive
+ replicas to delete to avoid data loss. (hairong)
+
+ HADOOP-4983. Fixes a problem in updating Counters in the status reporting.
+ (Amareshwari Sriramadasu via ddas)
+
+Release 0.18.2 - 2008-11-03
+
+ BUG FIXES
+
+ HADOOP-3614. Fix a bug that Datanode may use an old GenerationStamp to get
+ meta file. (szetszwo)
+
+ HADOOP-4314. Simulated datanodes should not include blocks that are still
+ being written in their block report. (Raghu Angadi)
+
+ HADOOP-4228. dfs datanode metrics, bytes_read and bytes_written, overflow
+ due to incorrect type used. (hairong)
+
+ HADOOP-4395. The FSEditLog loading is incorrect for the case OP_SET_OWNER.
+ (szetszwo)
+
+ HADOOP-4351. FSNamesystem.getBlockLocationsInternal throws
+ ArrayIndexOutOfBoundsException. (hairong)
+
+ HADOOP-4403. Make TestLeaseRecovery and TestFileCreation more robust.
+ (szetszwo)
+
+ HADOOP-4292. Do not support append() for LocalFileSystem. (hairong)
+
+ HADOOP-4399. Make fuse-dfs multi-thread access safe.
+ (Pete Wyckoff via dhruba)
+
+ HADOOP-4369. Use setMetric(...) instead of incrMetric(...) for metrics
+ averages. (Brian Bockelman via szetszwo)
+
+ HADOOP-4469. Rename and add the ant task jar file to the tar file. (nigel)
+
+ HADOOP-3914. DFSClient sends Checksum Ok only once for a block.
+ (Christian Kunz via hairong)
+
+ HADOOP-4467. SerializationFactory now uses the current context ClassLoader
+ allowing for user supplied Serialization instances. (Chris Wensel via
+ acmurthy)
+
+ HADOOP-4517. Release FSDataset lock before joining ongoing create threads.
+ (szetszwo)
+
+ HADOOP-4526. fsck failing with NullPointerException. (hairong)
+
+ HADOOP-4483. Honor the max parameter in DatanodeDescriptor.getBlockArray(..)
+ (Ahad Rana and Hairong Kuang via szetszwo)
+
+ HADOOP-4340. Correctly set the exit code from JobShell.main so that the
+ 'hadoop jar' command returns the right code to the user. (acmurthy)
+
+ NEW FEATURES
+
+ HADOOP-2421. Add jdiff output to documentation, listing all API
+ changes from the prior release. (cutting)
+
+Release 0.18.1 - 2008-09-17
+
+ IMPROVEMENTS
+
+ HADOOP-3934. Upgrade log4j to 1.2.15. (omalley)
+
+ BUG FIXES
+
+ HADOOP-3995. In case of quota failure on HDFS, rename does not restore
+ source filename. (rangadi)
+
+ HADOOP-3821. Prevent SequenceFile and IFile from duplicating codecs in
+ CodecPool when closed more than once. (Arun Murthy via cdouglas)
+
+ HADOOP-4040. Remove the hard-coded default of the IPC idle connection timeout
+ from the TaskTracker, which was causing HDFS client connections to not be
+ collected. (ddas via omalley)
+
+ HADOOP-4046. Made WritableComparable's constructor protected instead of
+ private to re-enable class derivation. (cdouglas via omalley)
+
+ HADOOP-3940. Fix in-memory merge condition to wait when there are no map
+ outputs or when the final map outputs are being fetched without contention.
+ (cdouglas)
+
+Release 0.18.0 - 2008-08-19
+
+ INCOMPATIBLE CHANGES
+
+ HADOOP-2703. The default options to fsck skips checking files
+ that are being written to. The output of fsck is incompatible
+ with previous release. (lohit vijayarenu via dhruba)
+
+ HADOOP-2865. FsShell.ls() printout format changed to print file names
+ in the end of the line. (Edward J. Yoon via shv)
+
+ HADOOP-3283. The Datanode has a RPC server. It currently supports
+ two RPCs: the first RPC retrieves the metadata about a block and the
+ second RPC sets the generation stamp of an existing block.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2797. Code related to upgrading to 0.14 (Block CRCs) is
+ removed. As a result, upgrade to 0.18 or later from 0.13 or earlier
+ is not supported. If upgrading from 0.13 or earlier is required,
+ please upgrade to an intermediate version (0.14-0.17) and then
+ to this version. (rangadi)
+
+ HADOOP-544. This issue introduces new classes JobID, TaskID and
+ TaskAttemptID, which should be used instead of their string counterparts.
+ Functions in JobClient, TaskReport, RunningJob, jobcontrol.Job and
+ TaskCompletionEvent that use string arguments are deprecated in favor
+ of the corresponding ones that use ID objects. Applications can use
+ xxxID.toString() and xxxID.forName() methods to convert/restore objects
+ to/from strings. (Enis Soztutar via ddas)
+
+ HADOOP-2188. RPC client sends a ping rather than throwing timeouts.
+ RPC server does not throw away old RPCs. If clients and the server are on
+ different versions, they are not able to function well. In addition,
+ the property ipc.client.timeout is removed from the default Hadoop
+ configuration. It also removes metrics RpcOpsDiscardedOPsNum. (hairong)
+
+ HADOOP-2181. This issue adds logging for input splits in the JobTracker log
+ and jobHistory log. Also adds web UI for viewing input splits in job UI
+ and history UI. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3226. Run combiners multiple times over map outputs as they
+ are merged in both the map and the reduce tasks. (cdouglas via omalley)
+
+ HADOOP-3329. DatanodeDescriptor objects should not be stored in the
+ fsimage. (dhruba)
+
+ HADOOP-2656. The Block object has a generation stamp inside it.
+ Existing blocks get a generation stamp of 0. This is needed to support
+ appends. (dhruba)
+
+ HADOOP-3390. Removed deprecated ClientProtocol.abandonFileInProgress().
+ (Tsz Wo (Nicholas), SZE via rangadi)
+
+ HADOOP-3405. Made some map/reduce internal classes non-public:
+ MapTaskStatus, ReduceTaskStatus, JobSubmissionProtocol,
+ CompletedJobStatusStore. (enis via omalley)
+
+ HADOOP-3265. Removed deprecated API getFileCacheHints().
+ (Lohit Vijayarenu via rangadi)
+
+ HADOOP-3310. The namenode instructs the primary datanode to do lease
+ recovery. The block gets a new generation stamp.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2909. Improve IPC idle connection management. Property
+ ipc.client.maxidletime is removed from the default configuration,
+ instead it is defined as twice the ipc.client.connection.maxidletime.
+ A connection with outstanding requests won't be treated as idle.
+ (hairong)
+
+ HADOOP-3459. Change in the output format of dfs -ls to more closely match
+ /bin/ls. New format is: perm repl owner group size date name
+ (Mukund Madhugiri via omalley)
+
+ HADOOP-3113. An fsync invoked on an HDFS file now really
+ persists data! The datanode moves blocks in the tmp directory to
+ the real block directory on a datanode-restart. (dhruba)
+
+ HADOOP-3452. Change fsck to return non-zero status for a corrupt
+ FileSystem. (lohit vijayarenu via cdouglas)
+
+ HADOOP-3193. Include the address of the client that found the corrupted
+ block in the log. Also include a CorruptedBlocks metric to track the size
+ of the corrupted block map. (cdouglas)
+
+ HADOOP-3512. Separate out the tools into a tools jar. (omalley)
+
+ HADOOP-3598. Ensure that temporary task-output directories are not created
+ if they are not necessary e.g. for Maps with no side-effect files.
+ (acmurthy)
+
+ HADOOP-3665. Modify WritableComparator so that it only creates instances
+ of the keytype if the type does not define a WritableComparator. Calling
+ the superclass compare will throw a NullPointerException. Also define
+ a RawComparator for NullWritable and permit it to be written as a key
+ to SequenceFiles. (cdouglas)
+
+ HADOOP-3673. Avoid deadlock caused by DataNode RPC recoverBlock().
+ (Tsz Wo (Nicholas), SZE via rangadi)
+
+ NEW FEATURES
+
+ HADOOP-3074. Provides a UrlStreamHandler for DFS and other FS,
+ relying on FileSystem. (taton)
+
+ HADOOP-2585. Name-node imports namespace data from a recent checkpoint
+ accessible via a NFS mount. (shv)
+
+ HADOOP-3061. Writable types for doubles and bytes. (Andrzej
+ Bialecki via omalley)
+
+ HADOOP-2857. Allow libhdfs to set jvm options. (Craig Macdonald
+ via omalley)
+
+ HADOOP-3317. Add default port for HDFS namenode. The port in
+ "hdfs:" URIs now defaults to 8020, so that one may simply use URIs
+ of the form "hdfs://example.com/dir/file". (cutting)
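+
+ A minimal sketch of the effect (hypothetical host; assumes a
+ Configuration named conf and java.net.URI):
+
+   // the port defaults to 8020 when omitted from the URI
+   FileSystem fs = FileSystem.get(
+       URI.create("hdfs://example.com/dir/file"), conf);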
+
+ HADOOP-2019. Adds support for .tar, .tgz and .tar.gz files in
+ DistributedCache. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3058. Add FSNamesystem status metrics.
+ (Lohit Vijayarenu via rangadi)
+
+ HADOOP-1915. Allow users to specify counters via strings instead
+ of enumerations. (tomwhite via omalley)
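+
+ A hedged sketch of the string-based form, from inside a map or reduce
+ method (the group and counter names are made up):
+
+   reporter.incrCounter("MyApp", "RECORDS_SEEN", 1);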
+
+ HADOOP-2065. Delay invalidating corrupt replicas of a block until the
+ block is removed from the under-replicated state. If all replicas are
+ found to be corrupt, retain all copies and mark the block as corrupt.
+ (Lohit Vijayarenu via rangadi)
+
+ HADOOP-3221. Adds org.apache.hadoop.mapred.lib.NLineInputFormat, which
+ splits files into splits of N lines each. N can be specified by the
+ configuration property "mapred.line.input.format.linespermap", which
+ defaults to 1. (Amareshwari Sriramadasu via ddas)
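+
+ A minimal sketch of wiring this up, assuming a JobConf named conf:
+
+   conf.setInputFormat(org.apache.hadoop.mapred.lib.NLineInputFormat.class);
+   conf.setInt("mapred.line.input.format.linespermap", 100);  // N = 100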
+
+ HADOOP-3336. Direct a subset of annotated FSNamesystem calls for audit
+ logging. (cdouglas)
+
+ HADOOP-3400. A new API FileSystem.deleteOnExit() that facilitates
+ handling of temporary files in HDFS. (dhruba)
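+
+ A minimal sketch, assuming a Configuration named conf (the path is
+ hypothetical):
+
+   FileSystem fs = FileSystem.get(conf);
+   Path tmp = new Path("/tmp/scratch");
+   fs.deleteOnExit(tmp);  // removed automatically when the client exits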
+
+ HADOOP-4. Add fuse-dfs to contrib, permitting one to mount an
+ HDFS filesystem on systems that support FUSE, e.g., Linux.
+ (Pete Wyckoff via cutting)
+
+ HADOOP-3246. Add FTPFileSystem. (Ankur Goel via cutting)
+
+ HADOOP-3250. Extend FileSystem API to allow appending to files.
+ (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-3177. Implement Syncable interface for FileSystem.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-1328. Implement user counters in streaming. (tomwhite via
+ omalley)
+
+ HADOOP-3187. Quotas for namespace management. (Hairong Kuang via ddas)
+
+ HADOOP-3307. Support for Archives in Hadoop. (Mahadev Konar via ddas)
+
+ HADOOP-3460. Add SequenceFileAsBinaryOutputFormat to permit direct
+ writes of serialized data. (Koji Noguchi via cdouglas)
+
+ HADOOP-3230. Add ability to get counter values from command
+ line. (tomwhite via omalley)
+
+ HADOOP-930. Add support for native S3 files. (tomwhite via cutting)
+
+ HADOOP-3502. Quota API needs documentation in Forrest. (hairong)
+
+ HADOOP-3413. Allow SequenceFile.Reader to use serialization
+ framework. (tomwhite via omalley)
+
+ HADOOP-3541. Import of the namespace from a checkpoint documented
+ in hadoop user guide. (shv)
+
+ IMPROVEMENTS
+
+ HADOOP-3677. Simplify generation stamp upgrade by making it a
+ local upgrade on datanodes. Deleted the distributed upgrade.
+ (rangadi)
+
+ HADOOP-2928. Remove deprecated FileSystem.getContentLength().
+ (Lohit Vijayarenu via rangadi)
+
+ HADOOP-3130. Make the connect timeout smaller for getFile.
+ (Amar Ramesh Kamat via ddas)
+
+ HADOOP-3160. Remove deprecated exists() from ClientProtocol and
+ FSNamesystem. (Lohit Vijayarenu via rangadi)
+
+ HADOOP-2910. Throttle IPC Clients during bursts of requests or
+ server slowdown. Clients retry connection for up to 15 minutes
+ when socket connection times out. (hairong)
+
+ HADOOP-3295. Allow TextOutputFormat to use configurable separators.
+ (Zheng Shao via cdouglas)
+
+ HADOOP-3308. Improve QuickSort by excluding values equal to the pivot from
+ the partition. (cdouglas)
+
+ HADOOP-2461. Trim property names in configuration.
+ (Tsz Wo (Nicholas), SZE via shv)
+
+ HADOOP-2799. Deprecate o.a.h.io.Closeable in favor of java.io.Closeable.
+ (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-3345. Enhance the hudson-test-patch target to cleanup messages,
+ fix minor defects, and add eclipse plugin and python unit tests. (nigel)
+
+ HADOOP-3144. Improve robustness of LineRecordReader by defining a maximum
+ line length (mapred.linerecordreader.maxlength), thereby avoiding reading
+ too far into the following split. (Zheng Shao via cdouglas)
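+
+ For example, assuming a JobConf named conf (the limit shown is arbitrary):
+
+   // skip records whose line length exceeds 1 MB
+   conf.setInt("mapred.linerecordreader.maxlength", 1024 * 1024);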
+
+ HADOOP-3334. Move lease handling from FSNamesystem into a separate class.
+ (Tsz Wo (Nicholas), SZE via rangadi)
+
+ HADOOP-3332. Reduces the amount of logging in Reducer's shuffle phase.
+ (Devaraj Das)
+
+ HADOOP-3355. Enhances Configuration class to accept hex numbers for getInt
+ and getLong. (Amareshwari Sriramadasu via ddas)
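+
+ A small sketch (the key name is made up):
+
+   conf.set("my.bit.mask", "0xffff");
+   int mask = conf.getInt("my.bit.mask", 0);  // now parses to 65535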
+
+ HADOOP-3350. Add an argument to distcp to permit the user to limit the
+ number of maps. (cdouglas)
+
+ HADOOP-3013. Add corrupt block reporting to fsck.
+ (lohit vijayarenu via cdouglas)
+
+ HADOOP-3377. Remove TaskRunner::replaceAll and replace with equivalent
+ String::replace. (Brice Arnould via cdouglas)
+
+ HADOOP-3398. Minor improvement to a utility function that participates
+ in backoff calculation. (cdouglas)
+
+ HADOOP-3381. Clear references when directories are deleted so that the
+ effect of memory leaks is not multiplied. (rangadi)
+
+ HADOOP-2867. Adds the task's CWD to its LD_LIBRARY_PATH.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3232. The DU class runs the 'du' command in a separate thread so
+ that it does not block the user; the DataNode otherwise misses heartbeats
+ on large nodes. (Johan Oskarsson via rangadi)
+
+ HADOOP-3035. During block transfers between datanodes, the receiving
+ datanode can now report corrupt replicas received from the source node to
+ the namenode. (Lohit Vijayarenu via rangadi)
+
+ HADOOP-3434. Retain the cause of the bind failure in Server::bind.
+ (Steve Loughran via cdouglas)
+
+ HADOOP-3429. Increases the size of the buffers used for the communication
+ for Streaming jobs. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3486. Change default for initial block report to 0 seconds
+ and document it. (Sanjay Radia via omalley)
+
+ HADOOP-3448. Improve the text in the assertion making sure the
+ layout versions are consistent in the data node. (Steve Loughran
+ via omalley)
+
+ HADOOP-2095. Improve the Map-Reduce shuffle/merge by cutting down
+ buffer-copies; changed the intermediate sort/merge to use the new IFile
+ format rather than SequenceFiles, and compression of map-outputs is now
+ implemented by compressing the entire file rather than using SequenceFile
+ compression. Shuffle has also been changed to use a simple byte-buffer
+ manager rather than the InMemoryFileSystem.
+ Configuration changes to hadoop-default.xml:
+ deprecated mapred.map.output.compression.type
+ (acmurthy)
+
+ HADOOP-236. JobTracker now refuses connection from a task tracker with a
+ different version number. (Sharad Agarwal via ddas)
+
+ HADOOP-3427. Improves the shuffle scheduler. It now waits for notifications
+ from shuffle threads when it has scheduled enough, before scheduling more.
+ (ddas)
+
+ HADOOP-2393. Moves the handling of dir deletions in the tasktracker to
+ a separate thread. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3501. Deprecate InMemoryFileSystem. (cutting via omalley)
+
+ HADOOP-3366. Stall the shuffle while in-memory merge is in progress.
+ (acmurthy)
+
+ HADOOP-2916. Refactor src structure, but leave package structure alone.
+ (Raghu Angadi via mukund)
+
+ HADOOP-3492. Add forrest documentation for user archives.
+ (Mahadev Konar via hairong)
+
+ HADOOP-3467. Improve documentation for FileSystem::deleteOnExit.
+ (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-3379. Documents stream.non.zero.exit.status.is.failure for Streaming.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3096. Improves documentation about the Task Execution Environment in
+ the Map-Reduce tutorial. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-2984. Add forrest documentation for DistCp. (cdouglas)
+
+ HADOOP-3406. Add forrest documentation for Profiling.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-2762. Add forrest documentation for controls of memory limits on
+ hadoop daemons and Map-Reduce tasks. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3535. Fix documentation and name of IOUtils.close to
+ reflect that it should only be used in cleanup contexts. (omalley)
+
+ HADOOP-3593. Updates the mapred tutorial. (ddas)
+
+ HADOOP-3547. Documents the way in which native libraries can be distributed
+ via the DistributedCache. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3606. Updates the Streaming doc. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3532. Add jdiff reports to the build scripts. (omalley)
+
+ HADOOP-3100. Develop tests to test the DFS command line interface. (mukund)
+
+ HADOOP-3688. Fix up HDFS docs. (Robert Chansler via hairong)
+
+ OPTIMIZATIONS
+
+ HADOOP-3274. The default constructor of BytesWritable creates an empty
+ byte array. (Tsz Wo (Nicholas), SZE via shv)
+
+ HADOOP-3272. Remove redundant copy of Block object in BlocksMap.
+ (Lohit Vijayarenu via shv)
+
+ HADOOP-3164. Reduce DataNode CPU usage by using FileChannel.transferTo().
+ On Linux, the DataNode takes 5 times less CPU while serving data. Results may
+ vary on other platforms. (rangadi)
+
+ HADOOP-3248. Optimization of saveFSImage. (Dhruba via shv)
+
+ HADOOP-3297. Fetch more task completion events from the job
+ tracker and task tracker. (ddas via omalley)
+
+ HADOOP-3364. Faster image and log edits loading. (shv)
+
+ HADOOP-3369. Fast block processing during name-node startup. (shv)
+
+ HADOOP-1702. Reduce buffer copies when data is written to DFS.
+ DataNodes take 30% less CPU while writing data. (rangadi)
+
+ HADOOP-3095. Speed up split generation in FileInputFormat,
+ especially for non-HDFS file systems. Deprecates
+ InputFormat.validateInput. (tomwhite via omalley)
+
+ HADOOP-3552. Add forrest documentation for Hadoop commands.
+ (Sharad Agarwal via cdouglas)
+
+ BUG FIXES
+
+ HADOOP-2905. 'fsck -move' triggers NPE in NameNode.
+ (Lohit Vijayarenu via rangadi)
+
+ Increment ClientProtocol.versionID missed by HADOOP-2585. (shv)
+
+ HADOOP-3254. Restructure internal namenode methods that process
+ heartbeats to use well-defined BlockCommand object(s) instead of
+ using the base java Object. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-3176. Change lease record when an open-for-write file
+ gets renamed. (dhruba)
+
+ HADOOP-3269. Fix a case when namenode fails to restart
+ while processing a lease record. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-3282. Port issues in TestCheckpoint resolved. (shv)
+
+ HADOOP-3268. Fix a file:// URL issue in TestUrlStreamHandler under
+ Windows. (taton)
+
+ HADOOP-3127. Deleting files in trash should really remove them.
+ (Brice Arnould via omalley)
+
+ HADOOP-3300. Fix locking of explicit locks in NetworkTopology.
+ (tomwhite via omalley)
+
+ HADOOP-3270. Constant DatanodeCommands are stored in static final
+ immutable variables for better code clarity.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2793. Fix broken links for worst performing shuffle tasks in
+ the job history page. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3313. Avoid unnecessary calls to System.currentTimeMillis
+ in RPC::Invoker. (cdouglas)
+
+ HADOOP-3318. Recognize "Darwin" as an alias for "Mac OS X" to
+ support Soylatte. (Sam Pullara via omalley)
+
+ HADOOP-3301. Fix misleading error message when S3 URI hostname
+ contains an underscore. (tomwhite via omalley)
+
+ HADOOP-3338. Fix Eclipse plugin to compile after HADOOP-544 was
+ committed. Updated all references to use the new JobID representation.
+ (taton via nigel)
+
+ HADOOP-3337. Loading FSEditLog was broken by HADOOP-3283 since it
+ changed Writable serialization of DatanodeInfo. This patch handles it.
+ (Tsz Wo (Nicholas), SZE via rangadi)
+
+ HADOOP-3101. Prevent JobClient from throwing an exception when printing
+ usage. (Edward J. Yoon via cdouglas)
+
+ HADOOP-3119. Update javadoc for Text::getBytes to better describe its
+ behavior. (Tim Nelson via cdouglas)
+
+ HADOOP-2294. Fix documentation in libhdfs to refer to the correct free
+ function. (Craig Macdonald via cdouglas)
+
+ HADOOP-3335. Prevent the libhdfs build from deleting the wrong
+ files on make clean. (cutting via omalley)
+
+ HADOOP-2930. Make {start,stop}-balancer.sh work even if hadoop-daemon.sh
+ is not in the PATH. (Spiros Papadimitriou via hairong)
+
+ HADOOP-3085. Catch Exception in metrics util classes to ensure that
+ misconfigured metrics don't prevent others from updating. (cdouglas)
+
+ HADOOP-3299. CompositeInputFormat should configure the sub-input
+ formats. (cdouglas via omalley)
+
+ HADOOP-3309. Lower io.sort.mb and fs.inmemory.size.mb for MiniMRDFSSort
+ unit test so it passes on Windows. (lohit vijayarenu via cdouglas)
+
+ HADOOP-3348. TestUrlStreamHandler should set URLStreamFactory after
+ DataNodes are initialized. (Lohit Vijayarenu via rangadi)
+
+ HADOOP-3371. Ignore InstanceAlreadyExistsException from
+ MBeanUtil::registerMBean. (lohit vijayarenu via cdouglas)
+
+ HADOOP-3349. A file rename was incorrectly changing the name inside a
+ lease record. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-3365. Removes an unnecessary copy of the key from SegmentDescriptor
+ to MergeQueue. (Devaraj Das)
+
+ HADOOP-3388. Fix for TestDatanodeBlockScanner to handle blocks with
+ generation stamps in them. (dhruba)
+
+ HADOOP-3203. Fixes TaskTracker::localizeJob to pass correct file sizes
+ for the jarfile and the jobfile. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3391. Fix a findbugs warning introduced by HADOOP-3248. (rangadi)
+
+ HADOOP-3393. Fix datanode shutdown to call DataBlockScanner::shutdown and
+ close its log, even if the scanner thread is not running. (lohit vijayarenu
+ via cdouglas)
+
+ HADOOP-3399. A debug message was logged at info level. (rangadi)
+
+ HADOOP-3396. TestDatanodeBlockScanner occasionally fails.
+ (Lohit Vijayarenu via rangadi)
+
+ HADOOP-3339. Some failures on the 3rd datanode in the DFS write pipeline
+ were not detected properly, which could lead to hard failure of the
+ client's write operation. (rangadi)
+
+ HADOOP-3409. Namenode should save the root inode into fsimage. (hairong)
+
+ HADOOP-3296. Fix task cache to work for more than two levels in the cache
+ hierarchy. This also adds a new counter to track cache hits at levels
+ greater than two. (Amar Kamat via cdouglas)
+
+ HADOOP-3375. Lease paths were sometimes not removed from
+ LeaseManager.sortedLeasesByPath. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-3424. Values returned by getPartition should be checked to
+ make sure they are in the range 0 to #reduces - 1. (cdouglas via
+ omalley)
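+
+ An illustrative partitioner that stays in the valid range (a sketch of
+ the contract, assuming Text keys and values, not the patch itself):
+
+   public class ModPartitioner implements Partitioner<Text, Text> {
+     public void configure(JobConf job) {}
+     public int getPartition(Text key, Text value, int numPartitions) {
+       // mask the sign bit so the result always falls in [0, numPartitions)
+       return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
+     }
+   }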
+
+ HADOOP-3408. Change FSNamesystem to send its metrics as integers to
+ accommodate collectors that don't support long values. (lohit vijayarenu
+ via cdouglas)
+
+ HADOOP-3403. Fixes a problem in the JobTracker to do with handling of lost
+ tasktrackers. (Arun Murthy via ddas)
+
+ HADOOP-1318. Completed maps are not failed if the number of reducers is
+ zero. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3351. Fixes the history viewer tool to not do huge StringBuffer
+ allocations. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3419. Fixes TestFsck to wait for updates to happen before
+ checking results to make the test more reliable. (Lohit Vijaya
+ Renu via omalley)
+
+ HADOOP-3259. Makes failure to read system properties due to a
+ security manager non-fatal. (Edward Yoon via omalley)
+
+ HADOOP-3451. Update libhdfs to use FileSystem::getFileBlockLocations
+ instead of removed getFileCacheHints. (lohit vijayarenu via cdouglas)
+
+ HADOOP-3401. Update FileBench to set the new
+ "mapred.work.output.dir" property to work post-3041. (cdouglas via omalley)
+
+ HADOOP-2669. DFSClient locks pendingCreates appropriately. (dhruba)
+
+ HADOOP-3410. Fix KFS implementation to return correct file
+ modification time. (Sriram Rao via cutting)
+
+ HADOOP-3340. Fix DFS metrics for BlocksReplicated, HeartbeatsNum, and
+ BlockReportsAverageTime. (lohit vijayarenu via cdouglas)
+
+ HADOOP-3435. Remove the assumption in the scripts that bash is at
+ /bin/bash and fix the test patch to require bash instead of sh.
+ (Brice Arnould via omalley)
+
+ HADOOP-3471. Fix spurious errors from TestIndexedSort and add additional
+ logging to let failures be reproducible. (cdouglas)
+
+ HADOOP-3443. Avoid copying map output across partitions when renaming a
+ single spill. (omalley via cdouglas)
+
+ HADOOP-3454. Fix Text::find to search only valid byte ranges. (Chad Whipkey
+ via cdouglas)
+
+ HADOOP-3417. Removes the static configuration variable,
+ commandLineConfig from JobClient. Moves the cli parsing from
+ JobShell to GenericOptionsParser. Thus removes the class
+ org.apache.hadoop.mapred.JobShell. (Amareshwari Sriramadasu via
+ ddas)
+
+ HADOOP-2132. Only RUNNING/PREP jobs can be killed. (Jothi Padmanabhan
+ via ddas)
+
+ HADOOP-3476. Code cleanup in fuse-dfs.
+ (Peter Wyckoff via dhruba)
+
+ HADOOP-2427. Ensure that the cwd of completed tasks is cleaned-up
+ correctly on task-completion. (Amareshwari Sri Ramadasu via acmurthy)
+
+ HADOOP-2565. Remove DFSPath cache of FileStatus.
+ (Tsz Wo (Nicholas), SZE via hairong)
+
+ HADOOP-3326. Cleanup the local-fs and in-memory merge in the ReduceTask by
+ spawning only one thread each for the on-disk and in-memory merge.
+ (Sharad Agarwal via acmurthy)
+
+ HADOOP-3493. Fix TestStreamingFailure to use FileUtil.fullyDelete to
+ ensure correct cleanup. (Lohit Vijayarenu via acmurthy)
+
+ HADOOP-3455. Fix NPE in ipc.Client in case of connection failure and
+ improve its synchronization. (hairong)
+
+ HADOOP-3240. Fix a testcase to not create files in the current directory.
+ Instead the file is created in the test directory. (Mahadev Konar via ddas)
+
+ HADOOP-3496. Fix failure in TestHarFileSystem.testArchives due to change
+ in HADOOP-3095. (tomwhite)
+
+ HADOOP-3135. Get the system directory from the JobTracker instead of from
+ the conf. (Subramaniam Krishnan via ddas)
+
+ HADOOP-3503. Fix a race condition when client and namenode start
+ simultaneous recovery of the same block. (dhruba & Tsz Wo
+ (Nicholas), SZE)
+
+ HADOOP-3440. Fixes DistributedCache to not create symlinks for paths which
+ don't have fragments even when createSymLink is true.
+ (Abhijit Bagri via ddas)
+
+ HADOOP-3463. Hadoop-daemons script should cd to $HADOOP_HOME. (omalley)
+
+ HADOOP-3489. Fix NPE in SafeModeMonitor. (Lohit Vijayarenu via shv)
+
+ HADOOP-3509. Fix NPE in FSNamesystem.close. (Tsz Wo (Nicholas), SZE via
+ shv)
+
+ HADOOP-3491. Name-node shutdown causes InterruptedException in
+ ResolutionMonitor. (Lohit Vijayarenu via shv)
+
+ HADOOP-3511. Fixes namenode image to not set the root's quota to an
+ invalid value when the quota was not saved in the image. (hairong)
+
+ HADOOP-3516. Ensure the JobClient in HadoopArchives is initialized
+ with a configuration. (Subramaniam Krishnan via omalley)
+
+ HADOOP-3513. Improve NNThroughputBenchmark log messages. (shv)
+
+ HADOOP-3519. Fix NPE in DFS FileSystem rename. (hairong via tomwhite)
+
+ HADOOP-3528. The FilesCreated and files_deleted metrics
+ do not match. (Lohit via Mahadev)
+
+ HADOOP-3418. When a directory is deleted, any leases that point to files
+ in the subdirectory are removed. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-3542. Disables the creation of the _logs directory for the archives
+ directory. (Mahadev Konar via ddas)
+
+ HADOOP-3544. Fixes a documentation issue for hadoop archives.
+ (Mahadev Konar via ddas)
+
+ HADOOP-3517. Fixes a problem in the reducer due to which the last InMemory
+ merge may be missed. (Arun Murthy via ddas)
+
+ HADOOP-3548. Fixes build.xml to copy all *.jar files to the dist.
+ (Owen O'Malley via ddas)
+
+ HADOOP-3363. Fix unformatted storage detection in FSImage. (shv)
+
+ HADOOP-3560. Fixes a problem to do with split creation in archives.
+ (Mahadev Konar via ddas)
+
+ HADOOP-3545. Fixes an overflow problem in archives.
+ (Mahadev Konar via ddas)
+
+ HADOOP-3561. Prevent the trash from deleting its parent directories.
+ (cdouglas)
+
+ HADOOP-3575. Fix the clover ant target after package refactoring.
+ (Nigel Daley via cdouglas)
+
+ HADOOP-3539. Fix the tool path in the bin/hadoop script under
+ cygwin. (Tsz Wo (Nicholas), Sze via omalley)
+
+ HADOOP-3520. TestDFSUpgradeFromImage triggers a race condition in the
+ Upgrade Manager. Fixed. (dhruba)
+
+ HADOOP-3586. Provide deprecated, backwards-compatible semantics for the
+ combiner to be run once and only once on each record. (cdouglas)
+
+ HADOOP-3533. Add deprecated methods to provide API compatibility
+ between 0.18 and 0.17. Remove the deprecated methods in trunk. (omalley)
+
+ HADOOP-3580. Fixes a problem to do with specifying a har as an input to
+ a job. (Mahadev Konar via ddas)
+
+ HADOOP-3333. Don't assign a task to a tasktracker that it failed to
+ execute earlier (used to happen in the case of lost tasktrackers where
+ the tasktracker would reinitialize and bind to a different port).
+ (Jothi Padmanabhan and Arun Murthy via ddas)
+
+ HADOOP-3534. Log IOExceptions that happen in closing the name
+ system when the NameNode shuts down. (Tsz Wo (Nicholas) Sze via omalley)
+
+ HADOOP-3546. TaskTracker re-initialization gets stuck in cleaning up.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3576. Fix NullPointerException when renaming a directory
+ to its subdirectory. (Tsz Wo (Nicholas), SZE via hairong)
+
+ HADOOP-3320. Fix NullPointerException in NetworkTopology.getDistance().
+ (hairong)
+
+ HADOOP-3569. KFS input stream read() now correctly reads 1 byte
+ instead of 4. (Sriram Rao via omalley)
+
+ HADOOP-3599. Fix JobConf::setCombineOnceOnly to modify the instance rather
+ than a parameter. (Owen O'Malley via cdouglas)
+
+ HADOOP-3590. Null pointer exception in JobTracker when the task tracker is
+ not yet resolved. (Amar Ramesh Kamat via ddas)
+
+ HADOOP-3603. Fix MapOutputCollector to spill when io.sort.spill.percent is
+ 1.0 and to detect spills when emitted records write no data. (cdouglas)
+
+ HADOOP-3615. Set DatanodeProtocol.versionID to the correct value.
+ (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-3559. Fix the libhdfs test script and config to work with the
+ current semantics. (lohit vijayarenu via cdouglas)
+
+ HADOOP-3480. Need to update Eclipse template to reflect current trunk.
+ (Brice Arnould via tomwhite)
+
+ HADOOP-3588. Fixed usability issues with archives. (mahadev)
+
+ HADOOP-3635. Uncaught exception in DataBlockScanner.
+ (Tsz Wo (Nicholas), SZE via hairong)
+
+ HADOOP-3639. Exception when closing DFSClient while multiple files are
+ open. (Benjamin Gufler via hairong)
+
+ HADOOP-3572. SetQuotas usage interface has some minor bugs. (hairong)
+
+ HADOOP-3649. Fix bug in removing blocks from the corrupted block map.
+ (Lohit Vijayarenu via shv)
+
+ HADOOP-3604. Work around a JVM synchronization problem observed while
+ retrieving the address of direct buffers from compression code by obtaining
+ a lock during this call. (Arun C Murthy via cdouglas)
+
+ HADOOP-3683. Fix dfs metrics to count file listings rather than files
+ listed. (lohit vijayarenu via cdouglas)
+
+ HADOOP-3597. Fix SortValidator to use filesystems other than the default as
+ input. Validation job still runs on default fs.
+ (Jothi Padmanabhan via cdouglas)
+
+ HADOOP-3693. Fix archives, distcp and native library documentation to
+ conform to style guidelines. (Amareshwari Sriramadasu via cdouglas)
+
+ HADOOP-3653. Fix test-patch target to properly account for Eclipse
+ classpath jars. (Brice Arnould via nigel)
+
+ HADOOP-3692. Fix documentation for Cluster setup and Quick start guides.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3691. Fix streaming and tutorial docs. (Jothi Padmanabhan via ddas)
+
+ HADOOP-3630. Fix NullPointerException in CompositeRecordReader from empty
+ sources. (cdouglas)
+
+ HADOOP-3706. Fix a ClassLoader issue in the mapred.join Parser that
+ prevents it from loading user-specified InputFormats.
+ (Jingkei Ly via cdouglas)
+
+ HADOOP-3718. Fix KFSOutputStream::write(int) to output a byte instead of
+ an int, per the OutputStream contract. (Sriram Rao via cdouglas)
+
+ HADOOP-3647. Add debug logs to help track down a very occasional,
+ hard-to-reproduce bug in shuffle/merge on the reducer. (acmurthy)
+
+ HADOOP-3716. Prevent listStatus in KosmosFileSystem from returning
+ null for valid, empty directories. (Sriram Rao via cdouglas)
+
+ HADOOP-3752. Fix audit logging to record rename events. (cdouglas)
+
+ HADOOP-3737. Fix CompressedWritable to call Deflater::end to release
+ compressor memory. (Grant Glouser via cdouglas)
+
+ HADOOP-3670. Fixes JobTracker to clear out split bytes when no longer
+ required. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3755. Update gridmix to work with HOD 0.4. (Runping Qi via cdouglas)
+
+ HADOOP-3743. Fix -libjars, -files, -archives options to work even if
+ user code does not implement Tool. (Amareshwari Sriramadasu via mahadev)
+
+ HADOOP-3774. Fix typos in shell output. (Tsz Wo (Nicholas), SZE via
+ cdouglas)
+
+ HADOOP-3762. Fixed FileSystem cache to work with the default port. (cutting
+ via omalley)
+
+ HADOOP-3798. Fix tests compilation. (Mukund Madhugiri via omalley)
+
+ HADOOP-3794. Return modification time instead of zero for KosmosFileSystem.
+ (Sriram Rao via cdouglas)
+
+ HADOOP-3806. Remove debug statement to stdout from QuickSort. (cdouglas)
+
+ HADOOP-3776. Fix NPE at NameNode when datanode reports a block after it is
+ deleted at NameNode. (rangadi)
+
+ HADOOP-3537. Disallow adding a datanode to a network topology when its
+ network location is not resolved. (hairong)
+
+ HADOOP-3571. Fix bug in block removal used in lease recovery. (shv)
+
+ HADOOP-3645. MetricsTimeVaryingRate returns wrong value for
+ metric_avg_time. (Lohit Vijayarenu via hairong)
+
+ HADOOP-3521. Restored the cast to float, removed by HADOOP-544, when
+ sending Counters' values to Hadoop metrics. (acmurthy)
+
+ HADOOP-3820. Fixes two problems in the gridmix-env - a syntax error, and a
+ wrong definition of USE_REAL_DATASET by default. (Arun Murthy via ddas)
+
+ HADOOP-3724. Fixes two problems related to storing and recovering leases
+ in the fsimage. (dhruba)
+
+ HADOOP-3827. Fixed compression of empty map-outputs. (acmurthy)
+
+ HADOOP-3865. Remove reference to FSNamesystem from metrics preventing
+ garbage collection. (Lohit Vijayarenu via cdouglas)
+
+ HADOOP-3884. Fix so that Eclipse plugin builds against recent
+ Eclipse releases. (cutting)
+
+ HADOOP-3837. Streaming jobs report progress status. (dhruba)
+
+ HADOOP-3897. Fix an NPE in the secondary namenode. (Lohit Vijayarenu via
+ cdouglas)
+
+ HADOOP-3901. Fix bin/hadoop to correctly set classpath under cygwin.
+ (Tsz Wo (Nicholas) Sze via omalley)
+
+ HADOOP-3947. Fix a problem in tasktracker reinitialization.
+ (Amareshwari Sriramadasu via ddas)
+
+Release 0.17.3 - Unreleased
+
+ IMPROVEMENTS
+
+ HADOOP-4164. Chinese translation of the documentation. (Xuebing Yan via
+ omalley)
+
+ BUG FIXES
+
+ HADOOP-4277. Checksum verification was mistakenly disabled for
+ LocalFileSystem. (Raghu Angadi)
+
+ HADOOP-4271. Checksum input stream can sometimes return invalid
+ data to the user. (Ning Li via rangadi)
+
+ HADOOP-4318. DistCp should use absolute paths for cleanup. (szetszwo)
+
+ HADOOP-4326. ChecksumFileSystem does not override create(...) correctly.
+ (szetszwo)
+
+Release 0.17.2 - 2008-08-11
+
+ BUG FIXES
+
+ HADOOP-3678. Avoid spurious exceptions logged at DataNode when clients
+ read from DFS. (rangadi)
+
+ HADOOP-3707. NameNode keeps a count of number of blocks scheduled
+ to be written to a datanode and uses it to avoid allocating more
+ blocks than a datanode can hold. (rangadi)
+
+ HADOOP-3760. Fix a bug with HDFS file close() mistakenly introduced
+ by HADOOP-3681. (Lohit Vijayarenu via rangadi)
+
+ HADOOP-3681. DFSClient can get into an infinite loop while closing
+ a file if there are some errors. (Lohit Vijayarenu via rangadi)
+
+ HADOOP-3002. Hold off block removal while in safe mode. (shv)
+
+ HADOOP-3685. Unbalanced replication target. (hairong)
+
+ HADOOP-3758. Shutdown datanode on version mismatch instead of retrying
+ continuously, preventing excessive logging at the namenode.
+ (lohit vijayarenu via cdouglas)
+
+ HADOOP-3633. Correct exception handling in DataXceiveServer, and throttle
+ the number of xceiver threads in a data-node. (shv)
+
+ HADOOP-3370. Ensure that the TaskTracker.runningJobs data-structure is
+ correctly cleaned-up on task completion. (Zheng Shao via acmurthy)
+
+ HADOOP-3813. Fix task-output clean-up on HDFS to use the recursive
+ FileSystem.delete rather than the FileUtil.fullyDelete. (Amareshwari
+ Sri Ramadasu via acmurthy)
+
+ HADOOP-3859. Allow the maximum number of xceivers in the data node to
+ be configurable. (Johan Oskarsson via omalley)
+
+ HADOOP-3931. Fix a corner case in the map-side sort that causes some values
+ to be counted as too large, causing premature spills to disk. Some values
+ will also bypass the combiner incorrectly. (cdouglas via omalley)
+
+Release 0.17.1 - 2008-06-23
+
+ INCOMPATIBLE CHANGES
+
+ HADOOP-3565. Fix the Java serialization, which is not enabled by
+ default, to clear the state of the serializer between objects.
+ (tomwhite via omalley)
+
+ IMPROVEMENTS
+
+ HADOOP-3522. Improve documentation on reduce pointing out that
+ input keys and values will be reused. (omalley)
+
+ HADOOP-3487. Balancer uses thread pools for managing its threads and
+ therefore provides better resource management. (hairong)
+
+ BUG FIXES
+
+ HADOOP-2159. Namenode stuck in safemode. The counter blockSafe should
+ not be decremented for invalid blocks. (hairong)
+
+ HADOOP-3472. MapFile.Reader getClosest() function returns incorrect results
+ when before is true. (Todd Lipcon via Stack)
+
+ HADOOP-3442. Limit recursion depth on the stack for QuickSort to prevent
+ StackOverflowErrors. To avoid O(n*n) cases, when partitioning depth exceeds
+ a multiple of log(n), change to HeapSort. (cdouglas)
+
+ HADOOP-3477. Fix build to not package contrib/*/bin twice in
+ distributions. (Adam Heath via cutting)
+
+ HADOOP-3475. Fix MapTask to correctly size the accounting allocation of
+ io.sort.mb. (cdouglas)
+
+ HADOOP-3550. Fix the serialization data structures in MapTask where the
+ value lengths are incorrectly calculated. (cdouglas)
+
+ HADOOP-3526. Fix contrib/data_join framework by cloning values retained
+ in the reduce. (Spyros Blanas via cdouglas)
+
+ HADOOP-1979. Speed up fsck by adding a buffered stream. (Lohit
+ Vijaya Renu via omalley)
+
+Release 0.17.0 - 2008-05-18
+
+ INCOMPATIBLE CHANGES
+
+ HADOOP-2786. Move HBase out of Hadoop core.
+
+ HADOOP-2345. New HDFS transactions to support appending
+ to files. Disk layout version changed from -11 to -12. (dhruba)
+
+ HADOOP-2192. Error messages from "dfs mv" command improved.
+ (Mahadev Konar via dhruba)
+
+ HADOOP-1902. "dfs du" command without any arguments operates on the
+ current working directory. (Mahadev Konar via dhruba)
+
+ HADOOP-2873. Fixed bad disk format introduced by HADOOP-2345.
+ Disk layout version changed from -12 to -13. See changelist 630992
+ (dhruba)
+
+ HADOOP-1985. This addresses rack-awareness for Map tasks and for
+ HDFS in a uniform way. (ddas)
+
+ HADOOP-1986. Add support for a general serialization mechanism for
+ Map Reduce. (tomwhite)
+
+ HADOOP-771. FileSystem.delete() takes an explicit parameter that
+ specifies whether a recursive delete is intended.
+ (Mahadev Konar via dhruba)
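+
+ A minimal sketch of the new signature, assuming a FileSystem named fs
+ and hypothetical paths:
+
+   fs.delete(new Path("/tmp/dir"), true);    // recursive delete
+   fs.delete(new Path("/tmp/file"), false);  // fails on non-empty directories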
+
+ HADOOP-2470. Remove getContentLength(String), open(String, long, long)
+ and isDir(String) from ClientProtocol. ClientProtocol version changed
+ from 26 to 27. (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-2822. Remove deprecated code for classes InputFormatBase and
+ PhasedFileSystem. (Amareshwari Sriramadasu via enis)
+
+ HADOOP-2116. Changes the layout of the task execution directory.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-2828. The following deprecated methods in Configuration.java
+ have been removed:
+ getObject(String name)
+ setObject(String name, Object value)
+ get(String name, Object defaultValue)
+ set(String name, Object value)
+ Iterator entries()
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-2824. Removes one deprecated constructor from MiniMRCluster.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-2823. Removes deprecated methods getColumn(), getLine() from
+ org.apache.hadoop.record.compiler.generated.SimpleCharStream.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3060. Removes one unused constructor argument from MiniMRCluster.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-2854. Remove deprecated o.a.h.ipc.Server::getUserInfo().
+ (lohit vijayarenu via cdouglas)
+
+ HADOOP-2563. Remove deprecated FileSystem::listPaths.
+ (lohit vijayarenu via cdouglas)
+
+ HADOOP-2818. Remove deprecated methods in Counters.
+ (Amareshwari Sriramadasu via tomwhite)
+
+ HADOOP-2831. Remove deprecated o.a.h.dfs.INode::getAbsoluteName().
+ (lohit vijayarenu via cdouglas)
+
+ HADOOP-2839. Remove deprecated FileSystem::globPaths.
+ (lohit vijayarenu via cdouglas)
+
+ HADOOP-2634. Deprecate ClientProtocol::exists.
+ (lohit vijayarenu via cdouglas)
+
+ HADOOP-2410. Make EC2 cluster nodes more independent of each other.
+ Multiple concurrent EC2 clusters are now supported, and nodes may be
+ added to a cluster on the fly with new nodes starting in the same EC2
+ availability zone as the cluster. Ganglia monitoring and large
+ instance sizes have also been added. (Chris K Wensel via tomwhite)
+
+ HADOOP-2826. Deprecated FileSplit.getFile(), LineRecordReader.readLine().
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3239. getFileInfo() returns null for non-existing files instead
+ of throwing FileNotFoundException. (Lohit Vijayarenu via shv)
+
+ HADOOP-3266. Removed HOD changes from CHANGES.txt, as they are now inside
+ src/contrib/hod (Hemanth Yamijala via ddas)
+
+ HADOOP-3280. Separate the configuration of the virtual memory size
+ (mapred.child.ulimit) from the jvm heap size, so that 64 bit
+ streaming applications are supported even when running with 32 bit
+ jvms. (acmurthy via omalley)
+
+ NEW FEATURES
+
+ HADOOP-1398. Add HBase in-memory block cache. (tomwhite)
+
+ HADOOP-2178. Job History on DFS. (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2063. A new parameter to dfs -get command to fetch a file
+ even if it is corrupted. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2219. A new command "dfs -count" that counts the number of
+ files and directories. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2906. Add an OutputFormat capable of using keys, values, and
+ config params to map records to different output files.
+ (Runping Qi via cdouglas)
+
+ HADOOP-2346. Utilities to support timeout while writing to sockets.
+ DFSClient and DataNode sockets have 10min write timeout. (rangadi)
+
+ HADOOP-2951. Add a contrib module that provides a utility to
+ build or update Lucene indexes using Map/Reduce. (Ning Li via cutting)
+
+ HADOOP-1622. Allow multiple jar files for map reduce.
+ (Mahadev Konar via dhruba)
+
+ HADOOP-2055. Allows users to set PathFilter on the FileInputFormat.
+ (Alejandro Abdelnur via ddas)
+
+ HADOOP-2551. More environment variables like HADOOP_NAMENODE_OPTS
+ for better control of HADOOP_OPTS for each component. (rangadi)
+
+ HADOOP-3001. Add job counters that measure the number of bytes
+ read and written to HDFS, S3, KFS, and local file systems. (omalley)
+
+ HADOOP-3048. A new interface and a default implementation to convert
+ and restore serializations of objects to/from strings. (enis)
+
+ IMPROVEMENTS
+
+ HADOOP-2655. Copy on write for data and metadata files in the
+ presence of snapshots. Needed for supporting appends to HDFS
+ files. (dhruba)
+
+ HADOOP-1967. When a Path specifies the same scheme as the default
+ FileSystem but no authority, the default FileSystem's authority is
+ used. Also add warnings for old-format FileSystem names, accessor
+ methods for fs.default.name, and check for null authority in HDFS.
+ (cutting)
+
+ HADOOP-2895. Let the profiling string be configurable.
+ (Martin Traverso via cdouglas)
+
+ HADOOP-910. Enables Reduces to do merges for the on-disk map output files
+ in parallel with their copying. (Amar Kamat via ddas)
+
+ HADOOP-730. Use rename rather than copy for local renames. (cdouglas)
+
+ HADOOP-2810. Updated the Hadoop Core logo. (nigel)
+
+ HADOOP-2057. Streaming should optionally treat a non-zero exit status
+ of a child process as a failed task. (Rick Cox via tomwhite)
+
+ HADOOP-2765. Enables specifying ulimits for streaming/pipes tasks. (ddas)
+
+ HADOOP-2888. Make gridmix scripts more readily configurable and amenable
+ to automated execution. (Mukund Madhugiri via cdouglas)
+
+ HADOOP-2908. A document that describes the DFS Shell command.
+ (Mahadev Konar via dhruba)
+
+ HADOOP-2981. Update README.txt to reflect the upcoming use of
+ cryptography. (omalley)
+
+ HADOOP-2804. Add support to publish CHANGES.txt as HTML when running
+ the Ant 'docs' target. (nigel)
+
+ HADOOP-2559. Change DFS block placement to allocate the first replica
+ locally, the second off-rack, and the third intra-rack from the
+ second. (lohit vijayarenu via cdouglas)
+
+ HADOOP-2939. Make the automated patch testing process an executable
+ Ant target, test-patch. (nigel)
+
+ HADOOP-2239. Add HsftpFileSystem to permit transferring files over ssl.
+ (cdouglas)
+
+ HADOOP-2886. Track individual RPC metrics.
+ (girish vaitheeswaran via dhruba)
+
+ HADOOP-2373. Improvement in safe-mode reporting. (shv)
+
+ HADOOP-3091. Modify FsShell command -put to accept multiple sources.
+ (Lohit Vijaya Renu via cdouglas)
+
+ HADOOP-3092. Show counter values from job -status command.
+ (Tom White via ddas)
+
+ HADOOP-1228. Ant task to generate Eclipse project files. (tomwhite)
+
+ HADOOP-3093. Adds Configuration.getStrings(name, default-value) and
+ the corresponding setStrings. (Amareshwari Sriramadasu via ddas)
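+
+ A short sketch (the key name and values are made up):
+
+   conf.setStrings("my.hosts", "a.example.com", "b.example.com");
+   String[] hosts = conf.getStrings("my.hosts", "localhost");  // with default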
+
+ HADOOP-3106. Adds documentation in forrest for debugging.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3099. Add an option to distcp to preserve user, group, and
+ permission information. (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-2841. Unwrap AccessControlException and FileNotFoundException
+ from RemoteException for DFSClient. (shv)
+
+ HADOOP-3152. Make the index interval configurable when using
+ MapFileOutputFormat for map-reduce jobs. (Rong-En Fan via cutting)
+
+ HADOOP-3143. Decrease number of slaves from 4 to 3 in TestMiniMRDFSSort,
+ as Hudson generates false negatives under the current load.
+ (Nigel Daley via cdouglas)
+
+ HADOOP-3174. Illustrative example for MultipleFileInputFormat. (Enis
+ Soztutar via acmurthy)
+
+ HADOOP-2993. Clarify the usage of JAVA_HOME in the Quick Start guide.
+ (acmurthy via nigel)
+
+ HADOOP-3124. Make DataNode socket write timeout configurable. (rangadi)
+
+ OPTIMIZATIONS
+
+ HADOOP-2790. Fixed inefficient method hasSpeculativeTask by removing
+ repetitive calls to get the current time and by checking late whether
+ speculation is wanted at all. (omalley)
+
+ HADOOP-2758. Reduce buffer copies in DataNode when data is read from
+ HDFS, without negatively affecting read throughput. (rangadi)
+
+ HADOOP-2399. Input key and value to the combiner and reducer are reused.
+ (Owen O'Malley via ddas)
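+
+ The consequence for user code, as a sketch (assuming a reduce() whose
+ values parameter is an Iterator<Text>): copy anything you retain, since
+ the framework hands back one reused object:
+
+   List<Text> kept = new ArrayList<Text>();
+   while (values.hasNext()) {
+     kept.add(new Text(values.next()));  // clone; don't store the reused object
+   }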
+
+ HADOOP-2423. Code optimization in FSNamesystem.mkdirs.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2606. ReplicationMonitor selects data-nodes to replicate directly
+ from needed replication blocks instead of looking up the blocks for
+ each live data-node. (shv)
+
+ HADOOP-2148. Eliminate redundant data-node blockMap lookups. (shv)
+
+ HADOOP-2027. Return the number of bytes in each block in a file
+ via a single rpc to the namenode to speed up job planning.
+ (Lohit Vijaya Renu via omalley)
+
+ HADOOP-2902. Replace uses of "fs.default.name" with calls to the
+ accessor methods added in HADOOP-1967. (cutting)
+
+ HADOOP-2119. Optimize scheduling of jobs with large numbers of
+ tasks by replacing static arrays with lists of runnable tasks.
+ (Amar Kamat via omalley)
+
+ HADOOP-2919. Reduce the number of memory copies done during the
+ map output sorting. Also adds two config variables:
+ io.sort.spill.percent - the percentage of io.sort.mb that should
+ cause a spill (default 80%)
+ io.sort.record.percent - the percent of io.sort.mb that should
+ hold key/value indexes (default 5%)
+ (cdouglas via omalley)
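+
+ For example, the defaults written out explicitly (assuming a JobConf
+ named conf):
+
+   conf.set("io.sort.spill.percent", "0.80");   // spill at 80% of io.sort.mb
+   conf.set("io.sort.record.percent", "0.05");  // 5% for key/value indexes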
+
+ HADOOP-3140. Don't add a task to the commit queue if the task hasn't
+ generated any output. (Amar Kamat via ddas)
+
+ HADOOP-3168. Reduce the amount of logging in streaming to an
+ exponentially increasing number of records (up to 10,000
+ records/log). (Zheng Shao via omalley)
+
+ BUG FIXES
+
+ HADOOP-2195. '-mkdir' behaviour is now closer to Linux shell in case of
+ errors. (Mahadev Konar via rangadi)
+
+ HADOOP-2190. Bring the behaviour of '-ls' and '-du' closer to Linux shell
+ commands in case of errors. (Mahadev Konar via rangadi)
+
+ HADOOP-2193. 'fs -rm' and 'fs -rmr' show error message when the target
+ file does not exist. (Mahadev Konar via rangadi)
+
+ HADOOP-2738. Text is not subclassable because set(Text) and compareTo(Object)
+ access the other instance's private members directly. (jimk)
+
+ HADOOP-2779. Remove the references to HBase in the build.xml. (omalley)
+
+ HADOOP-2194. dfs cat on a non-existent file throws FileNotFoundException.
+ (Mahadev Konar via dhruba)
+
+ HADOOP-2767. Fix for NetworkTopology erroneously skipping the last leaf
+ node on a rack. (Hairong Kuang and Mark Butler via dhruba)
+
+ HADOOP-1593. FsShell works with paths in non-default FileSystem.
+ (Mahadev Konar via dhruba)
+
+ HADOOP-2191. du and dus command on non-existent directory gives
+ appropriate error message. (Mahadev Konar via dhruba)
+
+ HADOOP-2832. Remove tabs from code of DFSClient for better
+ indentation. (dhruba)
+
+ HADOOP-2844. distcp closes file handles for sequence files.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2727. Fix links in the Web UI of the hadoop daemons and some docs.
+ (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2871. Fixes a problem to do with file: URI in the JobHistory init.
+ (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2800. Deprecate SetFile.Writer constructor not the whole class.
+ (Johan Oskarsson via tomwhite)
+
+ HADOOP-2891. DFSClient.close() closes all open files. (dhruba)
+
+ HADOOP-2845. Fix dfsadmin disk utilization report on Solaris.
+ (Martin Traverso via tomwhite)
+
+ HADOOP-2912. MiniDFSCluster restart should wait for namenode to exit
+ safemode. This was causing TestFsck to fail. (Mahadev Konar via dhruba)
+
+ HADOOP-2820. The following classes in streaming are removed :
+ StreamLineRecordReader StreamOutputFormat StreamSequenceRecordReader.
+ (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2819. The following methods in JobConf are removed:
+ getInputKeyClass() setInputKeyClass getInputValueClass()
+ setInputValueClass(Class theClass) setSpeculativeExecution
+ getSpeculativeExecution() (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2817. Removes deprecated mapred.tasktracker.tasks.maximum and
+ ClusterStatus.getMaxTasks(). (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2821. Removes deprecated ShellUtil and ToolBase classes from
+ the util package. (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2934. The namenode was encountering an NPE while loading
+ leases from the fsimage. Fixed. (dhruba)
+
+ HADOOP-2938. Some fs commands did not glob paths.
+ (Tsz Wo (Nicholas), SZE via rangadi)
+
+ HADOOP-2943. Compression of intermediate map output causes failures
+ in the merge. (cdouglas)
+
+ HADOOP-2870. DataNode and NameNode close all connections while
+ shutting down. (Hairong Kuang via dhruba)
+
+ HADOOP-2973. Fix TestLocalDFS for Windows platform.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2971. Call select multiple times if it returns early in
+ SocketIOWithTimeout. (rangadi)
+
+ HADOOP-2955. Fix TestCrcCorruption test failures caused by HADOOP-2758.
+ (rangadi)
+
+ HADOOP-2657. A flush call on the DFSOutputStream flushes the last
+ partial CRC chunk too. (dhruba)
+
+ HADOOP-2974. IPC unit tests used "0.0.0.0" to connect to server, which
+ is not always supported. (rangadi)
+
+ HADOOP-2996. Fixes uses of StringBuffer in StreamUtils class.
+ (Dave Brosius via ddas)
+
+ HADOOP-2995. Fixes StreamBaseRecordReader's getProgress to return a
+ floating point number. (Dave Brosius via ddas)
+
+ HADOOP-2972. Fix for an NPE in FSDataset.invalidate.
+ (Mahadev Konar via dhruba)
+
+ HADOOP-2994. Code cleanup for DFSClient: remove redundant
+ conversions from string to string. (Dave Brosius via dhruba)
+
+ HADOOP-3009. TestFileCreation sometimes fails because restarting
+ minidfscluster sometimes creates datanodes with ports that are
+ different from their original instance. (dhruba)
+
+ HADOOP-2992. Distributed Upgrade framework works correctly with
+ more than one upgrade object. (Konstantin Shvachko via dhruba)
+
+ HADOOP-2679. Fix a typo in libhdfs. (Jason via dhruba)
+
+ HADOOP-2976. When a lease expires, the Namenode ensures that
+ blocks of the file are adequately replicated. (dhruba)
+
+ HADOOP-2901. Fixes the creation of info servers in the JobClient
+ and JobTracker. Removes the creation from JobClient and removes
+ additional info server from the JobTracker. Also adds the command
+ line utility to view the history files (HADOOP-2896), and fixes
+ bugs in JSPs to do with analysis - HADOOP-2742, HADOOP-2792.
+ (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2890. If different datanodes report the same block but
+ with different sizes to the namenode, the namenode picks the
+ replica(s) with the largest size as the only valid replica(s). (dhruba)
+
+ HADOOP-2825. Deprecated MapOutputLocation.getFile() is removed.
+ (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2806. Fixes a streaming document.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3008. SocketIOWithTimeout throws InterruptedIOException if the
+ thread is interrupted while it is waiting. (rangadi)
+
+ HADOOP-3006. Fix wrong packet size reported by DataNode when a block
+ is being replicated. (rangadi)
+
+ HADOOP-3029. Datanode prints log message "firstbadlink" only if
+ it detects a bad connection to another datanode in the pipeline. (dhruba)
+
+ HADOOP-3030. Release reserved space for file in InMemoryFileSystem if
+ checksum reservation fails. (Devaraj Das via cdouglas)
+
+ HADOOP-3036. Fix findbugs warnings in UpgradeUtilities. (Konstantin
+ Shvachko via cdouglas)
+
+ HADOOP-3025. ChecksumFileSystem supports the delete method with
+ the recursive flag. (Mahadev Konar via dhruba)
+
+ HADOOP-3012. dfs -mv file to user home directory throws exception if
+ the user home directory does not exist. (Mahadev Konar via dhruba)
+
+ HADOOP-3066. Should not require superuser privilege to query if hdfs is in
+ safe mode. (jimk)
+
+ HADOOP-3040. If the input line starts with the separator char, the key
+ is set as empty. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3080. Removes flush calls from JobHistory.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3086. Adds the testcase missed during commit of hadoop-3040.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3046. Fix the raw comparators for Text and BytesWritables
+ to use the provided length rather than recompute it. (omalley)
+
+ HADOOP-3094. Fix BytesWritable.toString to avoid extending the sign bit.
+ (Owen O'Malley via cdouglas)
+
+ HADOOP-3067. DFSInputStream's position read does not close the sockets.
+ (rangadi)
+
+ HADOOP-3073. close() on SocketInputStream or SocketOutputStream should
+ close the underlying channel. (rangadi)
+
+ HADOOP-3087. Fixes a problem to do with refreshing of loadHistory.jsp.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3065. Better logging message if the rack location of a datanode
+ cannot be determined. (Devaraj Das via dhruba)
+
+ HADOOP-3064. Commas in a file path should not be treated as delimiters.
+ (Hairong Kuang via shv)
+
+ HADOOP-2997. Adds test for non-writable serializer. Also fixes a problem
+ introduced by HADOOP-2399. (Tom White via ddas)
+
+ HADOOP-3114. Fix TestDFSShell on Windows. (Lohit Vijaya Renu via cdouglas)
+
+ HADOOP-3118. Fix Namenode NPE while loading fsimage after a cluster
+ upgrade from older disk format. (dhruba)
+
+ HADOOP-3161. Fix FileUtil.HardLink.getLinkCount on Mac OS. (nigel
+ via omalley)
+
+ HADOOP-2927. Fix TestDU to accurately calculate the expected file size.
+ (shv via nigel)
+
+ HADOOP-3123. Fix the native library build scripts to work on Solaris.
+ (tomwhite via omalley)
+
+ HADOOP-3089. Streaming should accept stderr from task before
+ first key arrives. (Rick Cox via tomwhite)
+
+ HADOOP-3146. The DFSOutputStream.flush method is renamed to
+ DFSOutputStream.fsync. (dhruba)
+
+ HADOOP-3165. -put/-copyFromLocal did not treat input file "-" as stdin.
+ (Lohit Vijayarenu via rangadi)
+
+ HADOOP-3041. Deprecate JobConf.setOutputPath and JobConf.getOutputPath.
+ Deprecate OutputFormatBase. Add FileOutputFormat. Existing output formats
+ extending OutputFormatBase, now extend FileOutputFormat. Add the following
+ APIs in FileOutputFormat: setOutputPath, getOutputPath, getWorkOutputPath.
+ (Amareshwari Sriramadasu via nigel)
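+
+ A minimal sketch of the replacement calls, assuming a JobConf named conf
+ and a hypothetical output path:
+
+   FileOutputFormat.setOutputPath(conf, new Path("/out"));
+   Path out = FileOutputFormat.getOutputPath(conf);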
+
+ HADOOP-3083. The fsimage does not store leases. This would have to be
+ reworked in the next release to support appends. (dhruba)
+
+ HADOOP-3166. Fix an ArrayIndexOutOfBoundsException in the spill thread
+ and make exception handling more promiscuous to catch this condition.
+ (cdouglas)
+
+ HADOOP-3050. DataNode sends one and only one block report after
+ it registers with the namenode. (Hairong Kuang)
+
+ HADOOP-3044. NNBench sets the right configuration for the mapper.
+ (Hairong Kuang)
+
+ HADOOP-3178. Fix GridMix scripts for small and medium jobs
+ to handle input paths differently. (Mukund Madhugiri via nigel)
+
+ HADOOP-1911. Fix an infinite loop in DFSClient when all replicas of a
+ block are bad. (cdouglas)
+
+ HADOOP-3157. Fix path handling in DistributedCache and TestMiniMRLocalFS.
+ (Doug Cutting via rangadi)
+
+ HADOOP-3018. Fix the eclipse plug-in contrib with respect to removed
+ deprecated methods. (taton)
+
+ HADOOP-3183. Fix TestJobShell to use 'ls' instead of java.io.File::exists
+ since cygwin symlinks are unsupported.
+ (Mahadev Konar via cdouglas)
+
+ HADOOP-3175. Fix FsShell.CommandFormat to handle "-" in arguments.
+ (Edward J. Yoon via rangadi)
+
+ HADOOP-3220. Safemode message corrected. (shv)
+
+ HADOOP-3208. Fix WritableDeserializer to set the Configuration on
+ deserialized Writables. (Enis Soztutar via cdouglas)
+
+ HADOOP-3224. 'dfs -du /dir' does not return correct size.
+ (Lohit Vijayarenu via rangadi)
+
+ HADOOP-3223. Fix typo in help message for -chmod. (rangadi)
+
+ HADOOP-1373. checkPath() should ignore case when it compares authority.
+ (Edward J. Yoon via rangadi)
+
+ HADOOP-3204. Fixes a problem to do with ReduceTask's LocalFSMerger not
+ catching Throwable. (Amar Ramesh Kamat via ddas)
+
+ HADOOP-3229. Report progress when collecting records from the mapper and
+ the combiner. (Doug Cutting via cdouglas)
+
+ HADOOP-3225. Unwrapping methods of RemoteException should initialize the
+ detailedMessage field. (Mahadev Konar, shv, cdouglas)
+
+ HADOOP-3247. Fix gridmix scripts to use the correct globbing syntax and
+ change maxentToSameCluster to run the correct number of jobs.
+ (Runping Qi via cdouglas)
+
+ HADOOP-3242. Fix the RecordReader of SequenceFileAsBinaryInputFormat to
+ correctly read from the start of the split and not the beginning of the
+ file. (cdouglas via acmurthy)
+
+ HADOOP-3256. Encodes the job name used in the filename for history files.
+ (Arun Murthy via ddas)
+
+ HADOOP-3162. Ensure that comma-separated input paths are treated correctly
+ as multiple input paths. (Amareshwari Sri Ramadasu via acmurthy)
+
+ HADOOP-3263. Ensure that the job-history log file always follows the
+ pattern of hostname_timestamp_jobid_username_jobname even if username
+ and/or jobname are not specified. This helps to avoid wrong assumptions
+ made about the job-history log filename in jobhistory.jsp. (acmurthy)
+
+ HADOOP-3251. Fixes getFilesystemName in JobTracker and LocalJobRunner to
+ use FileSystem.getUri instead of FileSystem.getName. (Arun Murthy via ddas)
+
+ HADOOP-3237. Fixes TestDFSShell.testErrOutPut on Windows platform.
+ (Mahadev Konar via ddas)
+
+ HADOOP-3279. TaskTracker checks for SUCCEEDED task status in addition to
+ COMMIT_PENDING status when it fails maps due to lost map.
+ (Devaraj Das)
+
+ HADOOP-3286. Prevent collisions in gridmix output dirs by increasing the
+ granularity of the timestamp. (Runping Qi via cdouglas)
+
+ HADOOP-3285. Fix input split locality when the splits align to
+ fs blocks. (omalley)
+
+ HADOOP-3372. Fix heap management in streaming tests. (Arun Murthy via
+ cdouglas)
+
+ HADOOP-3031. Fix javac warnings in test classes. (cdouglas)
+
+ HADOOP-3382. Fix memory leak when files are not cleanly closed. (rangadi)
+
+ HADOOP-3322. Fix to push MetricsRecord for rpc metrics. (Eric Yang via
+ mukund)
+
+Release 0.16.4 - 2008-05-05
+
+ BUG FIXES
+
+ HADOOP-3138. DFS mkdirs() should not throw an exception if the directory
+ already exists. (rangadi via mukund)
+
+ HADOOP-3294. Fix distcp to check the destination length and retry the copy
+ if it doesn't match the src length. (Tsz Wo (Nicholas), SZE via mukund)
+
+ HADOOP-3186. Fix incorrect permission checking for mv and renameTo
+ in HDFS. (Tsz Wo (Nicholas), SZE via mukund)
+
+Release 0.16.3 - 2008-04-16
+
+ BUG FIXES
+
+ HADOOP-3010. Fix ConcurrentModificationException in ipc.Server.Responder.
+ (rangadi)
+
+ HADOOP-3154. Catch all Throwables from the SpillThread in MapTask, rather
+ than IOExceptions only. (ddas via cdouglas)
+
+ HADOOP-3159. Avoid file system cache being overwritten whenever
+ configuration is modified. (Tsz Wo (Nicholas), SZE via hairong)
+
+ HADOOP-3139. Remove the consistency check for the FileSystem cache in
+ closeAll() that causes spurious warnings and a deadlock.
+ (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-3195. Fix TestFileSystem to be deterministic.
+ (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-3069. Primary name-node should not truncate image when transferring
+ it from the secondary. (shv)
+
+ HADOOP-3182. Change permissions of the job-submission directory to 777
+ from 733 to ensure sharing of HOD clusters works correctly. (Tsz Wo
+ (Nicholas), Sze and Amareshwari Sri Ramadasu via acmurthy)
+
+Release 0.16.2 - 2008-04-02
+
+ BUG FIXES
+
+ HADOOP-3011. Prohibit distcp from overwriting directories on the
+ destination filesystem with files. (cdouglas)
+
+ HADOOP-3033. The BlockReceiver thread in the datanode writes data to
+ the block file, changes file position (if needed) and flushes all by
+ itself. The PacketResponder thread does not flush block file. (dhruba)
+
+ HADOOP-2978. Fixes the JobHistory log format for counters.
+ (Runping Qi via ddas)
+
+ HADOOP-2985. Fixes LocalJobRunner to tolerate null job output path.
+ Also makes the _temporary a constant in MRConstants.java.
+ (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3003. FileSystem cache key is updated after a
+ FileSystem object is created. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-3042. Updates the Javadoc in JobConf.getOutputPath to reflect
+ the actual temporary path. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3007. Tolerate mirror failures while DataNode is replicating
+ blocks as it used to before. (rangadi)
+
+ HADOOP-2944. Fixes a "Run on Hadoop" wizard NPE when creating a
+ Location from the wizard. (taton)
+
+ HADOOP-3049. Fixes a problem in MultiThreadedMapRunner to do with
+ catching RuntimeExceptions. (Alejandro Abdelnur via ddas)
+
+ HADOOP-3039. Fixes a problem to do with exceptions in tasks not
+ killing jobs. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3027. Fixes a problem to do with adding a shutdown hook in
+ FileSystem. (Amareshwari Sriramadasu via ddas)
+
+ HADOOP-3056. Fix distcp when the target is an empty directory by
+ making sure the directory is created first. (cdouglas and acmurthy
+ via omalley)
+
+ HADOOP-3070. Protect the trash emptier thread from null pointer
+ exceptions. (Koji Noguchi via omalley)
+
+ HADOOP-3084. Fix HftpFileSystem to work for zero-length files.
+ (cdouglas)
+
+ HADOOP-3107. Fix NPE when fsck invokes getListings. (dhruba)
+
+ HADOOP-3104. Limit MultithreadedMapRunner to have a fixed length queue
+ between the RecordReader and the map threads. (Alejandro Abdelnur via
+ omalley)
+
+ HADOOP-2833. Do not use "Dr. Who" as the default user in JobClient.
+ A valid user name is required. (Tsz Wo (Nicholas), SZE via rangadi)
+
+ HADOOP-3128. Throw RemoteException in setPermissions and setOwner of
+ DistributedFileSystem. (shv via nigel)
+
+Release 0.16.1 - 2008-03-13
+
+ INCOMPATIBLE CHANGES
+
+ HADOOP-2869. Deprecate SequenceFile.setCompressionType in favor of
+ SequenceFile.createWriter, SequenceFileOutputFormat.setCompressionType,
+ and JobConf.setMapOutputCompressionType. (Arun C Murthy via cdouglas)
+ Configuration changes to hadoop-default.xml:
+ deprecated io.seqfile.compression.type
+
+ IMPROVEMENTS
+
+ HADOOP-2371. User guide for file permissions in HDFS.
+ (Robert Chansler via rangadi)
+
+ HADOOP-3098. Allow more characters in user and group names while
+ using -chown and -chgrp commands. (rangadi)
+
+ BUG FIXES
+
+ HADOOP-2789. Race condition in IPC Server Responder that could close
+ connections early. (Raghu Angadi)
+
+ HADOOP-2785. minor. Fix a typo in Datanode block verification
+ (Raghu Angadi)
+
+ HADOOP-2788. minor. Fix help message for chgrp shell command (Raghu Angadi).
+
+ HADOOP-1188. fstime file is updated when a storage directory containing
+ namespace image becomes inaccessible. (shv)
+
+ HADOOP-2787. An application can set a configuration variable named
+ dfs.umask to set the umask that is used by DFS.
+ (Tsz Wo (Nicholas), SZE via dhruba)
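+
+      As an illustrative sketch only (the 022 value is an assumed example,
+      not taken from this entry), such an override would go in
+      hadoop-site.xml:
+
+        <property>
+          <name>dfs.umask</name>
+          <value>022</value>   <!-- assumed example umask value -->
+        </property>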
+
+ HADOOP-2780. The default socket buffer size for DataNodes is 128K.
+ (dhruba)
+
+ HADOOP-2716. Superuser privileges for the Balancer.
+ (Tsz Wo (Nicholas), SZE via shv)
+
+ HADOOP-2754. Filter out .crc files from local file system listing.
+ (Hairong Kuang via shv)
+
+ HADOOP-2733. Fix compiler warnings in test code.
+ (Tsz Wo (Nicholas), SZE via cdouglas)
+
+ HADOOP-2725. Modify distcp to avoid leaving partially copied files at
+ the destination after encountering an error. (Tsz Wo (Nicholas), SZE
+ via cdouglas)
+
+ HADOOP-2391. Cleanup job output directory before declaring a job as
+ SUCCESSFUL. (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2808. Minor fix to FileUtil::copy to mind the overwrite
+ formal. (cdouglas)
+
+ HADOOP-2683. Moving UGI out of the RPC Server.
+ (Tsz Wo (Nicholas), SZE via shv)
+
+ HADOOP-2814. Fix for NPE in datanode in unit test TestDataTransferProtocol.
+ (Raghu Angadi via dhruba)
+
+ HADOOP-2811. Dump of counters in job history does not add comma between
+ groups. (runping via omalley)
+
+ HADOOP-2735. Enables setting TMPDIR for tasks.
+ (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2843. Fix protections on map-side join classes to enable derivation.
+ (cdouglas via omalley)
+
+ HADOOP-2840. Fix gridmix scripts to correctly invoke the java sort through
+ the proper jar. (Mukund Madhugiri via cdouglas)
+
+    HADOOP-2769. TestNNThroughputBenchmark should not use a fixed port for
+ the namenode http port. (omalley)
+
+    HADOOP-2852. Update gridmix benchmark to avoid an artificially long tail.
+ (cdouglas)
+
+ HADOOP-2894. Fix a problem to do with tasktrackers failing to connect to
+ JobTracker upon reinitialization. (Owen O'Malley via ddas).
+
+ HADOOP-2903. Fix exception generated by Metrics while using pushMetric().
+ (girish vaitheeswaran via dhruba)
+
+ HADOOP-2904. Fix to RPC metrics to log the correct host name.
+ (girish vaitheeswaran via dhruba)
+
+ HADOOP-2918. Improve error logging so that dfs writes failure with
+ "No lease on file" can be diagnosed. (dhruba)
+
+ HADOOP-2923. Add SequenceFileAsBinaryInputFormat, which was
+ missed in the commit for HADOOP-2603. (cdouglas via omalley)
+
+ HADOOP-2931. IOException thrown by DFSOutputStream had wrong stack
+ trace in some cases. (Michael Bieniosek via rangadi)
+
+ HADOOP-2883. Write failures and data corruptions on HDFS files.
+ The write timeout is back to what it was on 0.15 release. Also, the
+    datanode flushes the block file buffered output stream before
+ sending a positive ack for the packet back to the client. (dhruba)
+
+ HADOOP-2756. NPE in DFSClient while closing DFSOutputStreams
+ under load. (rangadi)
+
+ HADOOP-2958. Fixed FileBench which broke due to HADOOP-2391 which performs
+ a check for existence of the output directory and a trivial bug in
+    GenericMRLoadGenerator where min/max word lengths were identical since
+ they were looking at the same config variables (Chris Douglas via
+ acmurthy)
+
+ HADOOP-2915. Fixed FileSystem.CACHE so that a username is included
+ in the cache key. (Tsz Wo (Nicholas), SZE via nigel)
+
+ HADOOP-2813. TestDU unit test uses its own directory to run its
+ sequence of tests. (Mahadev Konar via dhruba)
+
+Release 0.16.0 - 2008-02-07
+
+ INCOMPATIBLE CHANGES
+
+ HADOOP-1245. Use the mapred.tasktracker.tasks.maximum value
+ configured on each tasktracker when allocating tasks, instead of
+ the value configured on the jobtracker. InterTrackerProtocol
+ version changed from 5 to 6. (Michael Bieniosek via omalley)
+
+ HADOOP-1843. Removed code from Configuration and JobConf deprecated by
+ HADOOP-785 and a minor fix to Configuration.toString. Specifically the
+ important change is that mapred-default.xml is no longer supported and
+ Configuration no longer supports the notion of default/final resources.
+ (acmurthy)
+
+ HADOOP-1302. Remove deprecated abacus code from the contrib directory.
+ This also fixes a configuration bug in AggregateWordCount, so that the
+ job now works. (enis)
+
+ HADOOP-2288. Enhance FileSystem API to support access control.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2184. RPC Support for user permissions and authentication.
+ (Raghu Angadi via dhruba)
+
+ HADOOP-2185. RPC Server uses any available port if the specified
+ port is zero. Otherwise it uses the specified port. Also combines
+ the configuration attributes for the servers' bind address and
+ port from "x.x.x.x" and "y" to "x.x.x.x:y".
+ Deprecated configuration variables:
+ dfs.info.bindAddress
+ dfs.info.port
+ dfs.datanode.bindAddress
+ dfs.datanode.port
+ dfs.datanode.info.bindAdress
+ dfs.datanode.info.port
+ dfs.secondary.info.bindAddress
+ dfs.secondary.info.port
+ mapred.job.tracker.info.bindAddress
+ mapred.job.tracker.info.port
+ mapred.task.tracker.report.bindAddress
+ tasktracker.http.bindAddress
+ tasktracker.http.port
+ New configuration variables (post HADOOP-2404):
+ dfs.secondary.http.address
+ dfs.datanode.address
+ dfs.datanode.http.address
+ dfs.http.address
+ mapred.job.tracker.http.address
+ mapred.task.tracker.report.address
+ mapred.task.tracker.http.address
+ (Konstantin Shvachko via dhruba)
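+
+      For illustration (the address and port below are assumed examples,
+      not taken from this entry), the combined form configures one
+      "x.x.x.x:y" value per server in hadoop-site.xml:
+
+        <property>
+          <name>dfs.datanode.address</name>
+          <value>0.0.0.0:50010</value>   <!-- assumed example address:port -->
+        </property>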
+
+ HADOOP-2401. Only the current leaseholder can abandon a block for
+ a HDFS file. ClientProtocol version changed from 20 to 21.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2381. Support permission information in FileStatus. Client
+ Protocol version changed from 21 to 22. (Raghu Angadi via dhruba)
+
+ HADOOP-2110. Block report processing creates fewer transient objects.
+ Datanode Protocol version changed from 10 to 11.
+ (Sanjay Radia via dhruba)
+
+ HADOOP-2567. Add FileSystem#getHomeDirectory(), which returns the
+ user's home directory in a FileSystem as a fully-qualified path.
+ FileSystem#getWorkingDirectory() is also changed to return a
+ fully-qualified path, which can break applications that attempt
+ to, e.g., pass LocalFileSystem#getWorkingDir().toString() directly
+ to java.io methods that accept file names. (cutting)
+
+ HADOOP-2514. Change trash feature to maintain a per-user trash
+ directory, named ".Trash" in the user's home directory. The
+ "fs.trash.root" parameter is no longer used. Full source paths
+ are also no longer reproduced within the trash.
+
+ HADOOP-2012. Periodic data verification on Datanodes.
+ (Raghu Angadi via dhruba)
+
+ HADOOP-1707. The DFSClient does not use a local disk file to cache
+ writes to a HDFS file. Changed Data Transfer Version from 7 to 8.
+ (dhruba)
+
+ HADOOP-2652. Fix permission issues for HftpFileSystem. This is an
+ incompatible change since distcp may not be able to copy files
+ from cluster A (compiled with this patch) to cluster B (compiled
+ with previous versions). (Tsz Wo (Nicholas), SZE via dhruba)
+
+ NEW FEATURES
+
+ HADOOP-1857. Ability to run a script when a task fails to capture stack
+ traces. (Amareshwari Sri Ramadasu via ddas)
+
+    HADOOP-2299. Definition of a login interface. A simple implementation for
+ Unix users and groups. (Hairong Kuang via dhruba)
+
+ HADOOP-1652. A utility to balance data among datanodes in a HDFS cluster.
+ (Hairong Kuang via dhruba)
+
+ HADOOP-2085. A library to support map-side joins of consistently
+ partitioned and sorted data sets. (Chris Douglas via omalley)
+
+ HADOOP-2336. Shell commands to modify file permissions. (rangadi)
+
+ HADOOP-1298. Implement file permissions for HDFS.
+ (Tsz Wo (Nicholas) & taton via cutting)
+
+ HADOOP-2447. HDFS can be configured to limit the total number of
+ objects (inodes and blocks) in the file system. (dhruba)
+
+ HADOOP-2487. Added an option to get statuses for all submitted/run jobs.
+ This information can be used to develop tools for analysing jobs.
+ (Amareshwari Sri Ramadasu via acmurthy)
+
+ HADOOP-1873. Implement user permissions for Map/Reduce framework.
+ (Hairong Kuang via shv)
+
+ HADOOP-2532. Add to MapFile a getClosest method that returns the key
+ that comes just before if the key is not present. (stack via tomwhite)
+
+ HADOOP-1883. Add versioning to Record I/O. (Vivek Ratan via ddas)
+
+    HADOOP-2603. Add SequenceFileAsBinaryInputFormat, which reads
+ sequence files as BytesWritable/BytesWritable regardless of the
+ key and value types used to write the file. (cdouglas via omalley)
+
+ HADOOP-2367. Add ability to profile a subset of map/reduce tasks and fetch
+ the result to the local filesystem of the submitting application. Also
+ includes a general IntegerRanges extension to Configuration for setting
+ positive, ranged parameters. (Owen O'Malley via cdouglas)
+
+ IMPROVEMENTS
+
+ HADOOP-2045. Change committer list on website to a table, so that
+ folks can list their organization, timezone, etc. (cutting)
+
+ HADOOP-2058. Facilitate creating new datanodes dynamically in
+ MiniDFSCluster. (Hairong Kuang via dhruba)
+
+ HADOOP-1855. fsck verifies block placement policies and reports
+ violations. (Konstantin Shvachko via dhruba)
+
+    HADOOP-1604. A system administrator can finalize namenode upgrades
+ without running the cluster. (Konstantin Shvachko via dhruba)
+
+ HADOOP-1839. Link-ify the Pending/Running/Complete/Killed grid in
+ jobdetails.jsp to help quickly narrow down and see categorized TIPs'
+ details via jobtasks.jsp. (Amar Kamat via acmurthy)
+
+ HADOOP-1210. Log counters in job history. (Owen O'Malley via ddas)
+
+ HADOOP-1912. Datanode has two new commands COPY and REPLACE. These are
+ needed for supporting data rebalance. (Hairong Kuang via dhruba)
+
+ HADOOP-2086. This patch adds the ability to add dependencies to a job
+ (run via JobControl) after construction. (Adrian Woodhead via ddas)
+
+ HADOOP-1185. Support changing the logging level of a server without
+ restarting the server. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2134. Remove developer-centric requirements from overview.html and
+    keep it end-user focused, specifically sections related to subversion and
+ building Hadoop. (Jim Kellerman via acmurthy)
+
+ HADOOP-1989. Support simulated DataNodes. This helps creating large virtual
+ clusters for testing purposes. (Sanjay Radia via dhruba)
+
+ HADOOP-1274. Support different number of mappers and reducers per
+ TaskTracker to allow administrators to better configure and utilize
+    heterogeneous clusters.
+ Configuration changes to hadoop-default.xml:
+ add mapred.tasktracker.map.tasks.maximum (default value of 2)
+ add mapred.tasktracker.reduce.tasks.maximum (default value of 2)
+ remove mapred.tasktracker.tasks.maximum (deprecated for 0.16.0)
+ (Amareshwari Sri Ramadasu via acmurthy)
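+
+      A minimal hadoop-site.xml sketch of the new per-type limits (the
+      value of 4 is an assumed example for a larger node; 2 is the stated
+      default):
+
+        <property>
+          <name>mapred.tasktracker.map.tasks.maximum</name>
+          <value>4</value>   <!-- assumed example; default is 2 -->
+        </property>
+        <property>
+          <name>mapred.tasktracker.reduce.tasks.maximum</name>
+          <value>2</value>
+        </property>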
+
+ HADOOP-2104. Adds a description to the ant targets. This makes the
+ output of "ant -projecthelp" sensible. (Chris Douglas via ddas)
+
+ HADOOP-2127. Added a pipes sort example to benchmark trivial pipes
+ application versus trivial java application. (omalley via acmurthy)
+
+ HADOOP-2113. A new shell command "dfs -text" to view the contents of
+    a gzipped or SequenceFile. (Chris Douglas via dhruba)
+
+ HADOOP-2207. Add a "package" target for contrib modules that
+ permits each to determine what files are copied into release
+ builds. (stack via cutting)
+
+ HADOOP-1984. Makes the backoff for failed fetches exponential.
+ Earlier, it was a random backoff from an interval.
+ (Amar Kamat via ddas)
+
+ HADOOP-1327. Include website documentation for streaming. (Rob Weltman
+ via omalley)
+
+ HADOOP-2000. Rewrite NNBench to measure namenode performance accurately.
+ It now uses the map-reduce framework for load generation.
+ (Mukund Madhugiri via dhruba)
+
+ HADOOP-2248. Speeds up the framework w.r.t Counters. Also has API
+ updates to the Counters part. (Owen O'Malley via ddas)
+
+ HADOOP-2326. The initial block report at Datanode startup time has
+ a random backoff period. (Sanjay Radia via dhruba)
+
+ HADOOP-2432. HDFS includes the name of the file while throwing
+ "File does not exist" exception. (Jim Kellerman via dhruba)
+
+ HADOOP-2457. Added a 'forrest.home' property to the 'docs' target in
+ build.xml. (acmurthy)
+
+    HADOOP-2149. A new benchmark for three name-node operations: file create,
+ open, and block report, to evaluate the name-node performance
+ for optimizations or new features. (Konstantin Shvachko via shv)
+
+ HADOOP-2466. Change FileInputFormat.computeSplitSize to a protected
+ non-static method to allow sub-classes to provide alternate
+ implementations. (Alejandro Abdelnur via acmurthy)
+
+ HADOOP-2425. Change TextOutputFormat to handle Text specifically for better
+ performance. Make NullWritable implement Comparable. Make TextOutputFormat
+ treat NullWritable like null. (omalley)
+
+ HADOOP-1719. Improves the utilization of shuffle copier threads.
+ (Amar Kamat via ddas)
+
+ HADOOP-2390. Added documentation for user-controls for intermediate
+ map-outputs & final job-outputs and native-hadoop libraries. (acmurthy)
+
+ HADOOP-1660. Add the cwd of the map/reduce task to the java.library.path
+ of the child-jvm to support loading of native libraries distributed via
+ the DistributedCache. (acmurthy)
+
+ HADOOP-2285. Speeds up TextInputFormat. Also includes updates to the
+ Text API. (Owen O'Malley via cdouglas)
+
+ HADOOP-2233. Adds a generic load generator for modeling MR jobs. (cdouglas)
+
+ HADOOP-2369. Adds a set of scripts for simulating a mix of user map/reduce
+ workloads. (Runping Qi via cdouglas)
+
+ HADOOP-2547. Removes use of a 'magic number' in build.xml.
+ (Hrishikesh via nigel)
+
+ HADOOP-2268. Fix org.apache.hadoop.mapred.jobcontrol classes to use the
+ List/Map interfaces rather than concrete ArrayList/HashMap classes
+ internally. (Adrian Woodhead via acmurthy)
+
+ HADOOP-2406. Add a benchmark for measuring read/write performance through
+ the InputFormat interface, particularly with compression. (cdouglas)
+
+ HADOOP-2131. Allow finer-grained control over speculative-execution. Now
+ users can set it for maps and reduces independently.
+ Configuration changes to hadoop-default.xml:
+ deprecated mapred.speculative.execution
+ add mapred.map.tasks.speculative.execution
+ add mapred.reduce.tasks.speculative.execution
+ (Amareshwari Sri Ramadasu via acmurthy)
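+
+      A minimal sketch of the finer-grained control, here disabling
+      speculation for reduces only (the values are assumed examples):
+
+        <property>
+          <name>mapred.map.tasks.speculative.execution</name>
+          <value>true</value>
+        </property>
+        <property>
+          <name>mapred.reduce.tasks.speculative.execution</name>
+          <value>false</value>   <!-- assumed example choice -->
+        </property>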
+
+    HADOOP-1965. Interleave sort/spill in the map-task along with calls to the
+ Mapper.map method. This is done by splitting the 'io.sort.mb' buffer into
+ two and using one half for collecting map-outputs and the other half for
+ sort/spill. (Amar Kamat via acmurthy)
+
+ HADOOP-2464. Unit tests for chmod, chown, and chgrp using DFS.
+ (Raghu Angadi)
+
+ HADOOP-1876. Persist statuses of completed jobs in HDFS so that the
+ JobClient can query and get information about decommissioned jobs and also
+ across JobTracker restarts.
+ Configuration changes to hadoop-default.xml:
+ add mapred.job.tracker.persist.jobstatus.active (default value of false)
+ add mapred.job.tracker.persist.jobstatus.hours (default value of 0)
+ add mapred.job.tracker.persist.jobstatus.dir (default value of
+ /jobtracker/jobsInfo)
+ (Alejandro Abdelnur via acmurthy)
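+
+      An illustrative hadoop-site.xml sketch enabling this persistence (the
+      12-hour retention is an assumed example; the defaults are listed
+      above):
+
+        <property>
+          <name>mapred.job.tracker.persist.jobstatus.active</name>
+          <value>true</value>
+        </property>
+        <property>
+          <name>mapred.job.tracker.persist.jobstatus.hours</name>
+          <value>12</value>   <!-- assumed example retention -->
+        </property>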
+
+ HADOOP-2077. Added version and build information to STARTUP_MSG for all
+ hadoop daemons to aid error-reporting, debugging etc. (acmurthy)
+
+ HADOOP-2398. Additional instrumentation for NameNode and RPC server.
+ Add support for accessing instrumentation statistics via JMX.
+    (Sanjay Radia via dhruba)
+
+ HADOOP-2449. A return of the non-MR version of NNBench.
+ (Sanjay Radia via shv)
+
+ HADOOP-1989. Remove 'datanodecluster' command from bin/hadoop.
+ (Sanjay Radia via shv)
+
+ HADOOP-1742. Improve JavaDoc documentation for ClientProtocol, DFSClient,
+ and FSNamesystem. (Konstantin Shvachko)
+
+ HADOOP-2298. Add Ant target for a binary-only distribution.
+ (Hrishikesh via nigel)
+
+ HADOOP-2509. Add Ant target for Rat report (Apache license header
+ reports). (Hrishikesh via nigel)
+
+ HADOOP-2469. WritableUtils.clone should take a Configuration
+ instead of a JobConf. (stack via omalley)
+
+ HADOOP-2659. Introduce superuser permissions for admin operations.
+ (Tsz Wo (Nicholas), SZE via shv)
+
+ HADOOP-2596. Added a SequenceFile.createWriter api which allows the user
+ to specify the blocksize, replication factor and the buffersize to be
+ used for the underlying HDFS file. (Alejandro Abdelnur via acmurthy)
+
+ HADOOP-2431. Test HDFS File Permissions. (Hairong Kuang via shv)
+
+ HADOOP-2232. Add an option to disable Nagle's algorithm in the IPC stack.
+ (Clint Morgan via cdouglas)
+
+ HADOOP-2342. Created a micro-benchmark for measuring
+ local-file versus hdfs reads. (Owen O'Malley via nigel)
+
+ HADOOP-2529. First version of HDFS User Guide. (Raghu Angadi)
+
+ HADOOP-2690. Add jar-test target to build.xml, separating compilation
+ and packaging of the test classes. (Enis Soztutar via cdouglas)
+
+ OPTIMIZATIONS
+
+ HADOOP-1898. Release the lock protecting the last time of the last stack
+ dump while the dump is happening. (Amareshwari Sri Ramadasu via omalley)
+
+    HADOOP-1900. Makes the heartbeat and task event query intervals
+    dependent on the cluster size. (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2208. Counter update frequency (from TaskTracker to JobTracker) is
+ capped at 1 minute. (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2284. Reduce the number of progress updates during the sorting in
+ the map task. (Amar Kamat via ddas)
+
+ BUG FIXES
+
+ HADOOP-2583. Fixes a bug in the Eclipse plug-in UI to edit locations.
+ Plug-in version is now synchronized with Hadoop version.
+
+ HADOOP-2100. Remove faulty check for existence of $HADOOP_PID_DIR and let
+ 'mkdir -p' check & create it. (Michael Bieniosek via acmurthy)
+
+ HADOOP-1642. Ensure jobids generated by LocalJobRunner are unique to
+    avoid collisions and hence job-failures. (Doug Cutting via acmurthy)
+
+ HADOOP-2096. Close open file-descriptors held by streams while localizing
+ job.xml in the JobTracker and while displaying it on the webui in
+ jobconf.jsp. (Amar Kamat via acmurthy)
+
+ HADOOP-2098. Log start & completion of empty jobs to JobHistory, which
+ also ensures that we close the file-descriptor of the job's history log
+ opened during job-submission. (Amar Kamat via acmurthy)
+
+ HADOOP-2112. Adding back changes to build.xml lost while reverting
+ HADOOP-1622 i.e. http://svn.apache.org/viewvc?view=rev&revision=588771.
+ (acmurthy)
+
+ HADOOP-2089. Fixes the command line argument handling to handle multiple
+ -cacheArchive in Hadoop streaming. (Lohit Vijayarenu via ddas)
+
+ HADOOP-2071. Fix StreamXmlRecordReader to use a BufferedInputStream
+ wrapped over the DFSInputStream since mark/reset aren't supported by
+ DFSInputStream anymore. (Lohit Vijayarenu via acmurthy)
+
+ HADOOP-1348. Allow XML comments inside configuration files.
+ (Rajagopal Natarajan and Enis Soztutar via enis)
+
+ HADOOP-1952. Improve handling of invalid, user-specified classes while
+ configuring streaming jobs such as combiner, input/output formats etc.
+ Now invalid options are caught, logged and jobs are failed early. (Lohit
+ Vijayarenu via acmurthy)
+
+ HADOOP-2151. FileSystem.globPaths validates the list of Paths that
+ it returns. (Lohit Vijayarenu via dhruba)
+
+ HADOOP-2121. Cleanup DFSOutputStream when the stream encountered errors
+ when Datanodes became full. (Raghu Angadi via dhruba)
+
+ HADOOP-1130. The FileSystem.closeAll() method closes all existing
+ DFSClients. (Chris Douglas via dhruba)
+
+ HADOOP-2204. DFSTestUtil.waitReplication was not waiting for all replicas
+ to get created, thus causing unit test failure.
+ (Raghu Angadi via dhruba)
+
+    HADOOP-2078. A zero-size file may have no blocks associated with it.
+ (Konstantin Shvachko via dhruba)
+
+ HADOOP-2212. ChecksumFileSystem.getSumBufferSize might throw
+ java.lang.ArithmeticException. The fix is to initialize bytesPerChecksum
+ to 0. (Michael Bieniosek via ddas)
+
+ HADOOP-2216. Fix jobtasks.jsp to ensure that it first collects the
+ taskids which satisfy the filtering criteria and then use that list to
+    print out only the required task-reports; previously it was oblivious to
+ the filtering and hence used the wrong index into the array of task-reports.
+ (Amar Kamat via acmurthy)
+
+ HADOOP-2272. Fix findbugs target to reflect changes made to the location
+ of the streaming jar file by HADOOP-2207. (Adrian Woodhead via nigel)
+
+ HADOOP-2244. Fixes the MapWritable.readFields to clear the instance
+ field variable every time readFields is called. (Michael Stack via ddas).
+
+ HADOOP-2245. Fixes LocalJobRunner to include a jobId in the mapId. Also,
+ adds a testcase for JobControl. (Adrian Woodhead via ddas).
+
+ HADOOP-2275. Fix erroneous detection of corrupted file when namenode
+ fails to allocate any datanodes for newly allocated block.
+ (Dhruba Borthakur via dhruba)
+
+    HADOOP-2256. Fix a bug in the namenode that could cause it to encounter
+ an infinite loop while deleting excess replicas that were created by
+ block rebalancing. (Hairong Kuang via dhruba)
+
+ HADOOP-2209. SecondaryNamenode process exits if it encounters exceptions
+ that it cannot handle. (Dhruba Borthakur via dhruba)
+
+ HADOOP-2314. Prevent TestBlockReplacement from occasionally getting
+ into an infinite loop. (Hairong Kuang via dhruba)
+
+ HADOOP-2300. This fixes a bug where mapred.tasktracker.tasks.maximum
+ would be ignored even if it was set in hadoop-site.xml.
+ (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2349. Improve code layout in file system transaction logging code.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2368. Fix unit tests on Windows.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2363. This fix allows running multiple instances of the unit test
+    in parallel. The bug was introduced by HADOOP-2185, which changed
+ port-rolling behaviour. (Konstantin Shvachko via dhruba)
+
+ HADOOP-2271. Fix chmod task to be non-parallel. (Adrian Woodhead via
+ omalley)
+
+ HADOOP-2313. Fail the build if building libhdfs fails. (nigel via omalley)
+
+    HADOOP-2359. Remove warning for interrupted exception when closing down
+ minidfs. (dhruba via omalley)
+
+ HADOOP-1841. Prevent slow clients from consuming threads in the NameNode.
+ (dhruba)
+
+ HADOOP-2323. JobTracker.close() should not print stack traces for
+ normal exit. (jimk via cutting)
+
+ HADOOP-2376. Prevents sort example from overriding the number of maps.
+ (Owen O'Malley via ddas)
+
+ HADOOP-2434. FSDatasetInterface read interface causes HDFS reads to occur
+ in 1 byte chunks, causing performance degradation.
+ (Raghu Angadi via dhruba)
+
+ HADOOP-2459. Fix package target so that src/docs/build files are not
+ included in the release. (nigel)
+
+ HADOOP-2215. Fix documentation in cluster_setup.html &
+    mapred_tutorial.html to reflect that mapred.tasktracker.tasks.maximum has
+    been superseded by mapred.tasktracker.{map|reduce}.tasks.maximum.
+ (Amareshwari Sri Ramadasu via acmurthy)
+
+ HADOOP-2352. Remove AC_CHECK_LIB for libz and liblzo to ensure that
+ libhadoop.so doesn't have a dependency on them. (acmurthy)
+
+ HADOOP-2453. Fix the configuration for wordcount-simple example in Hadoop
+ Pipes which currently produces an XML parsing error. (Amareshwari Sri
+ Ramadasu via acmurthy)
+
+ HADOOP-2476. Unit test failure while reading permission bits of local
+ file system (on Windows) fixed. (Raghu Angadi via dhruba)
+
+ HADOOP-2247. Fine-tune the strategies for killing mappers and reducers
+ due to failures while fetching map-outputs. Now the map-completion times
+ and number of currently running reduces are taken into account by the
+ JobTracker before killing the mappers, while the progress made by the
+ reducer and the number of fetch-failures vis-a-vis total number of
+    fetch-attempts are taken into account before the reducer kills itself.
+ (Amar Kamat via acmurthy)
+
+    HADOOP-2452. Fix eclipse plug-in build.xml to refer to the right
+ location where hadoop-*-core.jar is generated. (taton)
+
+ HADOOP-2492. Additional debugging in the rpc server to better
+ diagnose ConcurrentModificationException. (dhruba)
+
+ HADOOP-2344. Enhance the utility for executing shell commands to read the
+ stdout/stderr streams while waiting for the command to finish (to free up
+ the buffers). Also, this patch throws away stderr of the DF utility.
+ @deprecated
+ org.apache.hadoop.fs.ShellCommand for org.apache.hadoop.util.Shell
+ org.apache.hadoop.util.ShellUtil for
+ org.apache.hadoop.util.Shell.ShellCommandExecutor
+ (Amar Kamat via acmurthy)
+
+ HADOOP-2511. Fix a javadoc warning in org.apache.hadoop.util.Shell
+ introduced by HADOOP-2344. (acmurthy)
+
+ HADOOP-2442. Fix TestLocalFileSystemPermission.testLocalFSsetOwner
+ to work on more platforms. (Raghu Angadi via nigel)
+
+ HADOOP-2488. Fix a regression in random read performance.
+ (Michael Stack via rangadi)
+
+ HADOOP-2523. Fix TestDFSShell.testFilePermissions on Windows.
+ (Raghu Angadi via nigel)
+
+ HADOOP-2535. Removed support for deprecated mapred.child.heap.size and
+ fixed some indentation issues in TaskRunner. (acmurthy)
+ Configuration changes to hadoop-default.xml:
+ remove mapred.child.heap.size
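+
+      With the deprecated key gone, the child JVM heap is assumed to be set
+      through mapred.child.java.opts instead (both the property name and the
+      -Xmx value here are assumptions for illustration, not taken from this
+      entry):
+
+        <property>
+          <name>mapred.child.java.opts</name>
+          <value>-Xmx200m</value>   <!-- assumed example heap setting -->
+        </property>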
+
+ HADOOP-2512. Fix error stream handling in Shell. Use exit code to
+ detect shell command errors in RawLocalFileSystem. (Raghu Angadi)
+
+ HADOOP-2446. Fixes TestHDFSServerPorts and TestMRServerPorts so they
+    do not rely on statically configured ports and clean up better. (nigel)
+
+ HADOOP-2537. Make build process compatible with Ant 1.7.0.
+ (Hrishikesh via nigel)
+
+ HADOOP-1281. Ensure running tasks of completed map TIPs (e.g. speculative
+    tasks) are killed as soon as the TIP completes. (acmurthy)
+
+    HADOOP-2571. Suppress a spurious warning in test code. (cdouglas)
+
+    HADOOP-2481. NNBench reports its progress periodically.
+ (Hairong Kuang via dhruba)
+
+ HADOOP-2601. Start name-node on a free port for TestNNThroughputBenchmark.
+ (Konstantin Shvachko)
+
+ HADOOP-2494. Set +x on contrib/*/bin/* in packaged tar bundle.
+ (stack via tomwhite)
+
+ HADOOP-2605. Remove bogus leading slash in task-tracker report bindAddress.
+ (Konstantin Shvachko)
+
+ HADOOP-2620. Trivial. 'bin/hadoop fs -help' did not list chmod, chown, and
+ chgrp. (Raghu Angadi)
+
+ HADOOP-2614. The DFS WebUI accesses are configured to be from the user
+ specified by dfs.web.ugi. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2543. Implement a "no-permission-checking" mode for smooth
+ upgrade from a pre-0.16 install of HDFS.
+ (Hairong Kuang via dhruba)
+
+ HADOOP-290. A DataNode log message now prints the target of a replication
+ request correctly. (dhruba)
+
+ HADOOP-2538. Redirect to a warning, if plaintext parameter is true but
+ the filter parameter is not given in TaskLogServlet.
+ (Michael Bieniosek via enis)
+
+ HADOOP-2582. Prevent 'bin/hadoop fs -copyToLocal' from creating
+ zero-length files when the src does not exist.
+ (Lohit Vijayarenu via cdouglas)
+
+ HADOOP-2189. Incrementing user counters should count as progress. (ddas)
+
+ HADOOP-2649. The NameNode periodically computes replication work for
+ the datanodes. The periodicity of this computation is now configurable.
+ (dhruba)
+
+ HADOOP-2549. Correct disk size computation so that data-nodes could switch
+    to other local drives if the current one is full. (Hairong Kuang via shv)
+
+ HADOOP-2633. Fsck should call name-node methods directly rather than
+ through rpc. (Tsz Wo (Nicholas), SZE via shv)
+
+    HADOOP-2687. Modify a few log messages generated by dfs client to be
+ logged only at INFO level. (stack via dhruba)
+
+ HADOOP-2402. Fix BlockCompressorStream to ensure it buffers data before
+ sending it down to the compressor so that each write call doesn't
+ compress. (Chris Douglas via acmurthy)
+
+ HADOOP-2645. The Metrics initialization code does not throw
+ exceptions when servers are restarted by MiniDFSCluster.
+ (Sanjay Radia via dhruba)
+
+ HADOOP-2691. Fix a race condition that was causing the DFSClient
+ to erroneously remove a good datanode from a pipeline that actually
+ had another datanode that was bad. (dhruba)
+
+ HADOOP-1195. All code in FSNamesystem checks the return value
+ of getDataNode for null before using it. (dhruba)
+
+ HADOOP-2640. Fix a bug in MultiFileSplitInputFormat that was always
+ returning 1 split in some circumstances. (Enis Soztutar via nigel)
+
+ HADOOP-2626. Fix paths with special characters to work correctly
+ with the local filesystem. (Thomas Friol via cutting)
+
+ HADOOP-2646. Fix SortValidator to work with fully-qualified
+ working directories. (Arun C Murthy via nigel)
+
+ HADOOP-2092. Added a ping mechanism to the pipes' task to periodically
+ check if the parent Java task is running, and exit if the parent isn't
+ alive and responding. (Amareshwari Sri Ramadasu via acmurthy)
+
+    HADOOP-2714. TestDecommission failed on Windows because the replication
+ request was timing out. (dhruba)
+
+ HADOOP-2576. Namenode performance degradation over time triggered by
+ large heartbeat interval. (Raghu Angadi)
+
+    HADOOP-2713. TestDatanodeDeath failed on Windows because the replication
+ request was timing out. (dhruba)
+
+ HADOOP-2639. Fixes a problem to do with incorrect maintenance of values
+ for runningMapTasks/runningReduceTasks. (Amar Kamat and Arun Murthy
+ via ddas)
+
+ HADOOP-2723. Fixed the check for checking whether to do user task
+ profiling. (Amareshwari Sri Ramadasu via omalley)
+
+ HADOOP-2734. Link forrest docs to new http://hadoop.apache.org
+ (Doug Cutting via nigel)
+
+ HADOOP-2641. Added Apache license headers to 95 files. (nigel)
+
+ HADOOP-2732. Fix bug in path globbing. (Hairong Kuang via nigel)
+
+    HADOOP-2404. Fix backwards compatibility with hadoop-0.15 configuration
+ files that was broken by HADOOP-2185. (omalley)
+
+ HADOOP-2755. Fix fsck performance degradation because of permissions
+ issue. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-2768. Fix performance regression caused by HADOOP-1707.
+ (dhruba borthakur via nigel)
+
+ HADOOP-3108. Fix NPE in setPermission and setOwner. (shv)
+
+Release 0.15.3 - 2008-01-18
+
+ BUG FIXES
+
+ HADOOP-2562. globPaths supports {ab,cd}. (Hairong Kuang via dhruba)
+
+ HADOOP-2540. fsck reports missing blocks incorrectly. (dhruba)
+
+ HADOOP-2570. "work" directory created unconditionally, and symlinks
+ created from the task cwds.
+
+ HADOOP-2574. Fixed mapred_tutorial.xml to correct minor errors with the
+ WordCount examples. (acmurthy)
+
+Release 0.15.2 - 2008-01-02
+
+ BUG FIXES
+
+ HADOOP-2246. Moved the changelog for HADOOP-1851 from the NEW FEATURES
+ section to the INCOMPATIBLE CHANGES section. (acmurthy)
+
+ HADOOP-2238. Fix TaskGraphServlet so that it sets the content type of
+ the response appropriately. (Paul Saab via enis)
+
+ HADOOP-2129. Fix so that distcp works correctly when source is
+ HDFS but not the default filesystem. HDFS paths returned by the
+ listStatus() method are now fully-qualified. (cutting)
+
+ HADOOP-2378. Fixes a problem where the last task completion event would
+ get created after the job completes. (Alejandro Abdelnur via ddas)
+
+ HADOOP-2228. Checks whether a job with a certain jobId is already running
+ and then tries to create the JobInProgress object.
+ (Johan Oskarsson via ddas)
+
+ HADOOP-2422. dfs -cat multiple files fail with 'Unable to write to
+ output stream'. (Raghu Angadi via dhruba)
+
+ HADOOP-2460. When the namenode encounters ioerrors on writing a
+ transaction log, it stops writing new transactions to that one.
+ (Raghu Angadi via dhruba)
+
+ HADOOP-2227. Use the LocalDirAllocator uniformly for handling all of the
+ temporary storage required for a given task. It also implies that
+ mapred.local.dir.minspacestart is handled by checking if there is enough
+ free-space on any one of the available disks. (Amareshwari Sri Ramadasu
+ via acmurthy)
+
+ HADOOP-2437. Fix the LocalDirAllocator to choose the seed for the
+ round-robin disk selections randomly. This helps in spreading data across
+    multiple partitions much better. (acmurthy)
+
+ HADOOP-2486. When the list of files from the InMemoryFileSystem is obtained
+ for merging, this patch will ensure that only those files whose checksums
+    have also been created (renamed) are returned. (ddas)
+
+ HADOOP-2456. Hardcode English locale to prevent NumberFormatException
+ from occurring when starting the NameNode with certain locales.
+ (Matthias Friedrich via nigel)
+
+ IMPROVEMENTS
+
+ HADOOP-2160. Remove project-level, non-user documentation from
+ releases, since it's now maintained in a separate tree. (cutting)
+
+ HADOOP-1327. Add user documentation for streaming. (cutting)
+
+ HADOOP-2382. Add hadoop-default.html to subversion. (cutting)
+
+ HADOOP-2158. hdfsListDirectory calls FileSystem.listStatus instead
+ of FileSystem.listPaths. This reduces the number of RPC calls on the
+ namenode, thereby improving scalability. (Christian Kunz via dhruba)
+
+Release 0.15.1 - 2007-11-27
+
+ INCOMPATIBLE CHANGES
+
+ HADOOP-713. Reduce CPU usage on namenode while listing directories.
+ FileSystem.listPaths does not return the size of the entire subtree.
+ Introduced a new API ClientProtocol.getContentLength that returns the
+ size of the subtree. (Dhruba Borthakur via dhruba)
+
+ IMPROVEMENTS
+
+ HADOOP-1917. Addition of guides/tutorial for better overall
+ documentation for Hadoop. Specifically:
+      * quickstart.html is targeted towards first-time users and helps them
+        set up a single-node cluster and play with Hadoop.
+      * cluster_setup.html helps admins to configure and set up non-trivial
+        hadoop clusters.
+ * mapred_tutorial.html is a comprehensive Map-Reduce tutorial.
+ (acmurthy)
+
+ BUG FIXES
+
+ HADOOP-2174. Removed the unnecessary Reporter.setStatus call from
+    FSCopyFilesMapper.close which led to an NPE since the reporter isn't valid
+ in the close method. (Chris Douglas via acmurthy)
+
+ HADOOP-2172. Restore performance of random access to local files
+ by caching positions of local input streams, avoiding a system
+ call. (cutting)
+
+ HADOOP-2205. Regenerate the Hadoop website since some of the changes made
+ by HADOOP-1917 weren't correctly copied over to the trunk/docs directory.
+ Also fixed a couple of minor typos and broken links. (acmurthy)
+
+Release 0.15.0 - 2007-11-02
+
+ INCOMPATIBLE CHANGES
+
+ HADOOP-1708. Make files appear in namespace as soon as they are
+ created. (Dhruba Borthakur via dhruba)
+
+ HADOOP-999. A HDFS Client immediately informs the NameNode of a new
+ file creation. ClientProtocol version changed from 14 to 15.
+ (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-932. File locking interfaces and implementations (that were
+ earlier deprecated) are removed. Client Protocol version changed
+ from 15 to 16. (Raghu Angadi via dhruba)
+
+ HADOOP-1621. FileStatus is now a concrete class and FileSystem.listPaths
+ is deprecated and replaced with listStatus. (Chris Douglas via omalley)
+
+ HADOOP-1656. The blockSize of a file is stored persistently in the file
+ inode. (Dhruba Borthakur via dhruba)
+
+ HADOOP-1838. The blocksize of files created with an earlier release is
+ set to the default block size. (Dhruba Borthakur via dhruba)
+
+ HADOOP-785. Add support for 'final' Configuration parameters,
+ removing support for 'mapred-default.xml', and changing
+ 'hadoop-site.xml' to not override other files. Now folks should
+ generally use 'hadoop-site.xml' for all configurations. Values
+ with a 'final' tag may not be overridden by subsequently loaded
+ configuration files, e.g., by jobs. (Arun C. Murthy via cutting)
+
+ HADOOP-1846. DatanodeReport in ClientProtocol can report live
+ datanodes, dead datanodes or all datanodes. Client Protocol version
+ changed from 17 to 18. (Hairong Kuang via dhruba)
+
+ HADOOP-1851. Permit specification of map output compression type
+ and codec, independent of the final output's compression
+ parameters. (Arun C Murthy via cutting)
+
+ HADOOP-1819. Jobtracker cleanups, including binding ports before
+ clearing state directories, so that inadvertently starting a
+ second jobtracker doesn't trash one that's already running. Removed
+    method JobTracker.getTracker() because the static variable that
+ stored the value caused initialization problems.
+ (omalley via cutting)
+
+ NEW FEATURES
+
+ HADOOP-89. A client can access file data even before the creator
+ has closed the file. Introduce a new command "tail" from dfs shell.
+ (Dhruba Borthakur via dhruba)
+
+ HADOOP-1636. Allow configuration of the number of jobs kept in
+ memory by the JobTracker. (Michael Bieniosek via omalley)
+
+ HADOOP-1667. Reorganize CHANGES.txt into sections to make it
+ easier to read. Also remove numbering, to make merging easier.
+ (cutting)
+
+ HADOOP-1610. Add metrics for failed tasks.
+ (Devaraj Das via tomwhite)
+
+ HADOOP-1767. Add "bin/hadoop job -list" sub-command. (taton via cutting)
+
+ HADOOP-1351. Add "bin/hadoop job [-fail-task|-kill-task]" sub-commands
+ to terminate a particular task-attempt. (Enis Soztutar via acmurthy)
+
+ HADOOP-1880. SleepJob : An example job that sleeps at each map and
+ reduce task. (enis)
+
+ HADOOP-1809. Add a link in web site to #hadoop IRC channel. (enis)
+
+ HADOOP-1894. Add percentage graphs and mapred task completion graphs
+    to the Web User Interface. Users not using Firefox may install a plugin
+    in their browsers to see SVG graphics. (enis)
+
+ HADOOP-1914. Introduce a new NamenodeProtocol to allow secondary
+ namenodes and rebalancing processes to communicate with a primary
+ namenode. (Hairong Kuang via dhruba)
+
+ HADOOP-1963. Add a FileSystem implementation for the Kosmos
+ Filesystem (KFS). (Sriram Rao via cutting)
+
+ HADOOP-1822. Allow the specialization and configuration of socket
+ factories. Provide a StandardSocketFactory, and a SocksSocketFactory to
+ allow the use of SOCKS proxies. (taton).
+
+ HADOOP-1968. FileSystem supports wildcard input syntax "{ }".
+ (Hairong Kuang via dhruba)
+
+ HADOOP-2566. Add globStatus method to the FileSystem interface
+ and deprecate globPath and listPath. (Hairong Kuang via hairong)
+
+ OPTIMIZATIONS
+
+ HADOOP-1910. Reduce the number of RPCs that DistributedFileSystem.create()
+ makes to the namenode. (Raghu Angadi via dhruba)
+
+ HADOOP-1565. Reduce memory usage of NameNode by replacing
+ TreeMap in HDFS Namespace with ArrayList.
+ (Dhruba Borthakur via dhruba)
+
+ HADOOP-1743. Change DFS INode from a nested class to standalone
+ class, with specialized subclasses for directories and files, to
+ save memory on the namenode. (Konstantin Shvachko via cutting)
+
+ HADOOP-1759. Change file name in INode from String to byte[],
+ saving memory on the namenode. (Konstantin Shvachko via cutting)
+
+ HADOOP-1766. Save memory in namenode by having BlockInfo extend
+ Block, and replace many uses of Block with BlockInfo.
+ (Konstantin Shvachko via cutting)
+
+ HADOOP-1687. Save memory in namenode by optimizing BlockMap
+ representation. (Konstantin Shvachko via cutting)
+
+ HADOOP-1774. Remove use of INode.parent in Block CRC upgrade.
+ (Raghu Angadi via dhruba)
+
+ HADOOP-1788. Increase the buffer size on the Pipes command socket.
+ (Amareshwari Sri Ramadasu and Christian Kunz via omalley)
+
+ BUG FIXES
+
+ HADOOP-1946. The Datanode code does not need to invoke du on
+ every heartbeat. (Hairong Kuang via dhruba)
+
+ HADOOP-1935. Fix a NullPointerException in internalReleaseCreate.
+ (Dhruba Borthakur)
+
+ HADOOP-1933. The nodes listed in include and exclude files
+ are always listed in the datanode report.
+ (Raghu Angadi via dhruba)
+
+    HADOOP-1953. The job tracker should wait between calls to try and delete
+    the system directory. (Owen O'Malley via devaraj)
+
+ HADOOP-1932. TestFileCreation fails with message saying filestatus.dat
+ is of incorrect size. (Dhruba Borthakur via dhruba)
+
+ HADOOP-1573. Support for 0 reducers in PIPES.
+ (Owen O'Malley via devaraj)
+
+ HADOOP-1500. Fix typographical errors in the DFS WebUI.
+ (Nigel Daley via dhruba)
+
+ HADOOP-1076. Periodic checkpoint can continue even if an earlier
+ checkpoint encountered an error. (Dhruba Borthakur via dhruba)
+
+ HADOOP-1887. The Namenode encounters an ArrayIndexOutOfBoundsException
+ while listing a directory that had a file that was
+ being actively written to. (Dhruba Borthakur via dhruba)
+
+ HADOOP-1904. The Namenode encounters an exception because the
+ list of blocks per datanode-descriptor was corrupted.
+ (Konstantin Shvachko via dhruba)
+
+ HADOOP-1762. The Namenode fsimage does not contain a list of
+ Datanodes. (Raghu Angadi via dhruba)
+
+ HADOOP-1890. Removed debugging prints introduced by HADOOP-1774.
+ (Raghu Angadi via dhruba)
+
+ HADOOP-1763. Too many lost task trackers on large clusters due to
+ insufficient number of RPC handler threads on the JobTracker.
+ (Devaraj Das)
+
+    HADOOP-1463. HDFS reports correct usage statistics for disk space
+ used by HDFS. (Hairong Kuang via dhruba)
+
+ HADOOP-1692. In DFS ant task, don't cache the Configuration.
+ (Chris Douglas via cutting)
+
+ HADOOP-1726. Remove lib/jetty-ext/ant.jar. (omalley)
+
+ HADOOP-1772. Fix hadoop-daemon.sh script to get correct hostname
+ under Cygwin. (Tsz Wo (Nicholas), SZE via cutting)
+
+ HADOOP-1749. Change TestDFSUpgrade to sort files, fixing sporadic
+ test failures. (Enis Soztutar via cutting)
+
+ HADOOP-1748. Fix tasktracker to be able to launch tasks when log
+ directory is relative. (omalley via cutting)
+
+ HADOOP-1775. Fix a NullPointerException and an
+ IllegalArgumentException in MapWritable.
+ (Jim Kellerman via cutting)
+
+ HADOOP-1795. Fix so that jobs can generate output file names with
+    special characters. (Frédéric Bertin via cutting)
+
+ HADOOP-1810. Fix incorrect value type in MRBench (SmallJobs)
+ (Devaraj Das via tomwhite)
+
+ HADOOP-1806. Fix ant task to compile again, also fix default
+ builds to compile ant tasks. (Chris Douglas via cutting)
+
+ HADOOP-1758. Fix escape processing in librecordio to not be
+ quadratic. (Vivek Ratan via cutting)
+
+ HADOOP-1817. Fix MultiFileSplit to read and write the split
+ length, so that it is not always zero in map tasks.
+ (Thomas Friol via cutting)
+
+ HADOOP-1853. Fix contrib/streaming to accept multiple -cacheFile
+ options. (Prachi Gupta via cutting)
+
+ HADOOP-1818. Fix MultiFileInputFormat so that it does not return
+ empty splits when numPaths < numSplits. (Thomas Friol via enis)
+
+ HADOOP-1840. Fix race condition which leads to task's diagnostic
+ messages getting lost. (acmurthy)
+
+ HADOOP-1885. Fix race condition in MiniDFSCluster shutdown.
+ (Chris Douglas via nigel)
+
+ HADOOP-1889. Fix path in EC2 scripts for building your own AMI.
+ (tomwhite)
+
+ HADOOP-1892. Fix a NullPointerException in the JobTracker when
+ trying to fetch a task's diagnostic messages from the JobClient.
+ (Amar Kamat via acmurthy)
+
+ HADOOP-1897. Completely remove about.html page from the web site.
+ (enis)
+
+ HADOOP-1907. Fix null pointer exception when getting task diagnostics
+ in JobClient. (Christian Kunz via omalley)
+
+ HADOOP-1882. Remove spurious asterisks from decimal number displays.
+ (Raghu Angadi via cutting)
+
+ HADOOP-1783. Make S3 FileSystem return Paths fully-qualified with
+ scheme and host. (tomwhite)
+
+ HADOOP-1925. Make pipes' autoconf script look for libsocket and libnsl, so
+ that it can compile under Solaris. (omalley)
+
+ HADOOP-1940. TestDFSUpgradeFromImage must shut down its MiniDFSCluster.
+ (Chris Douglas via nigel)
+
+    HADOOP-1930. Fix the blame for failed fetches on the right host. (Arun C.
+ Murthy via omalley)
+
+ HADOOP-1934. Fix the platform name on Mac to use underscores rather than
+ spaces. (omalley)
+
+ HADOOP-1959. Use "/" instead of File.separator in the StatusHttpServer.
+ (jimk via omalley)
+
+ HADOOP-1626. Improve dfsadmin help messages.
+ (Lohit Vijayarenu via dhruba)
+
+ HADOOP-1695. The SecondaryNamenode waits for the Primary NameNode to
+ start up. (Dhruba Borthakur)
+
+ HADOOP-1983. Have Pipes flush the command socket when progress is sent
+ to prevent timeouts during long computations. (omalley)
+
+    HADOOP-1875. Non-existent directories or read-only directories are
+ filtered from dfs.client.buffer.dir. (Hairong Kuang via dhruba)
+
+ HADOOP-1992. Fix the performance degradation in the sort validator.
+ (acmurthy via omalley)
+
+ HADOOP-1874. Move task-outputs' promotion/discard to a separate thread
+ distinct from the main heartbeat-processing thread. The main upside being
+ that we do not lock-up the JobTracker during HDFS operations, which
+ otherwise may lead to lost tasktrackers if the NameNode is unresponsive.
+ (Devaraj Das via acmurthy)
+
+ HADOOP-2026. Namenode prints out one log line for "Number of transactions"
+ at most once every minute. (Dhruba Borthakur)
+
+ HADOOP-2022. Ensure that status information for successful tasks is correctly
+ recorded at the JobTracker, so that, for example, one may view correct
+ information via taskdetails.jsp. This bug was introduced by HADOOP-1874.
+ (Amar Kamat via acmurthy)
+
+ HADOOP-2031. Correctly maintain the taskid which takes the TIP to
+ completion, failing which the case of lost tasktrackers isn't handled
+ properly i.e. the map TIP is incorrectly left marked as 'complete' and it
+ is never rescheduled elsewhere, leading to hung reduces.
+ (Devaraj Das via acmurthy)
+
+ HADOOP-2018. The source datanode of a data transfer waits for
+ a response from the target datanode before closing the data stream.
+ (Hairong Kuang via dhruba)
+
+ HADOOP-2023. Disable TestLocalDirAllocator on Windows.
+ (Hairong Kuang via nigel)
+
+ HADOOP-2016. Ignore status-updates from FAILED/KILLED tasks at the
+ TaskTracker. This fixes a race-condition which caused the tasks to wrongly
+ remain in the RUNNING state even after being killed by the JobTracker and
+ thus handicap the cleanup of the task's output sub-directory. (acmurthy)
+
+ HADOOP-1771. Fix a NullPointerException in streaming caused by an
+ IOException in MROutputThread. (lohit vijayarenu via nigel)
+
+ HADOOP-2028. Fix distcp so that the log dir does not need to be
+ specified and the destination does not need to exist.
+ (Chris Douglas via nigel)
+
+ HADOOP-2044. The namenode protects all lease manipulations using a
+ sortedLease lock. (Dhruba Borthakur)
+
+ HADOOP-2051. The TaskCommit thread should not die for exceptions other
+ than the InterruptedException. This behavior is there for the other long
+ running threads in the JobTracker. (Arun C Murthy via ddas)
+
+ HADOOP-1973. The FileSystem object would be accessed on the JobTracker
+ through a RPC in the InterTrackerProtocol. The check for the object being
+    null was missing and hence an NPE would be thrown sometimes. This patch fixes
+ that problem. (Amareshwari Sri Ramadasu via ddas)
+
+ HADOOP-2033. The SequenceFile.Writer.sync method was a no-op, which caused
+ very uneven splits for applications like distcp that count on them.
+ (omalley)
+
+ HADOOP-2070. Added a flush method to pipes' DownwardProtocol and call
+ that before waiting for the application to finish to ensure all buffered
+ data is flushed. (Owen O'Malley via acmurthy)
+
+ HADOOP-2080. Fixed calculation of the checksum file size when the values
+ are large. (omalley)
+
+ HADOOP-2048. Change error handling in distcp so that each map copies
+ as much as possible before reporting the error. Also report progress on
+ every copy. (Chris Douglas via omalley)
+
+ HADOOP-2073. Change size of VERSION file after writing contents to it.
+ (Konstantin Shvachko via dhruba)
+
+ HADOOP-2102. Fix the deprecated ToolBase to pass its Configuration object
+    to the superseding ToolRunner to ensure it picks up the appropriate
+ configuration resources. (Dennis Kubes and Enis Soztutar via acmurthy)
+
+    HADOOP-2103. Fix minor javadoc bugs introduced by HADOOP-2046. (Nigel
+ Daley via acmurthy)
+
+ IMPROVEMENTS
+
+ HADOOP-1908. Restructure data node code so that block sending and
+    receiving are separated from data transfer header handling.
+ (Hairong Kuang via dhruba)
+
+ HADOOP-1921. Save the configuration of completed/failed jobs and make them
+ available via the web-ui. (Amar Kamat via devaraj)
+
+ HADOOP-1266. Remove dependency of package org.apache.hadoop.net on
+ org.apache.hadoop.dfs. (Hairong Kuang via dhruba)
+
+ HADOOP-1779. Replace INodeDirectory.getINode() by a getExistingPathINodes()
+ to allow the retrieval of all existing INodes along a given path in a
+ single lookup. This facilitates removal of the 'parent' field in the
+ inode. (Christophe Taton via dhruba)
+
+ HADOOP-1756. Add toString() to some Writable-s. (ab)
+
+ HADOOP-1727. New classes: MapWritable and SortedMapWritable.
+ (Jim Kellerman via ab)
+
+ HADOOP-1651. Improve progress reporting.
+ (Devaraj Das via tomwhite)
+
+ HADOOP-1595. dfsshell can wait for a file to achieve its intended
+ replication target. (Tsz Wo (Nicholas), SZE via dhruba)
+
+ HADOOP-1693. Remove un-needed log fields in DFS replication classes,
+ since the log may be accessed statically. (Konstantin Shvachko via cutting)
+
+ HADOOP-1231. Add generics to Mapper and Reducer interfaces.
+ (tomwhite via cutting)
+
+ HADOOP-1436. Improved command-line APIs, so that all tools need
+ not subclass ToolBase, and generic parameter parser is public.
+ (Enis Soztutar via cutting)
+
+ HADOOP-1703. DFS-internal code cleanups, removing several uses of
+ the obsolete UTF8. (Christophe Taton via cutting)
+
+ HADOOP-1731. Add Hadoop's version to contrib jar file names.
+ (cutting)
+
+ HADOOP-1689. Make shell scripts more portable. All shell scripts
+ now explicitly depend on bash, but do not require that bash be
+ installed in a particular location, as long as it is on $PATH.
+ (cutting)
+
+ HADOOP-1744. Remove many uses of the deprecated UTF8 class from
+ the HDFS namenode. (Christophe Taton via cutting)
+
+ HADOOP-1654. Add IOUtils class, containing generic io-related
+ utility methods. (Enis Soztutar via cutting)
+
+ HADOOP-1158. Change JobTracker to record map-output transmission
+ errors and use them to trigger speculative re-execution of tasks.
+ (Arun C Murthy via cutting)
+
+ HADOOP-1601. Change GenericWritable to use ReflectionUtils for
+ instance creation, avoiding classloader issues, and to implement
+ Configurable. (Enis Soztutar via cutting)
+
+ HADOOP-1750. Log standard output and standard error when forking
+ task processes. (omalley via cutting)
+
+ HADOOP-1803. Generalize build.xml to make files in all
+ src/contrib/*/bin directories executable. (stack via cutting)
+
+ HADOOP-1739. Let OS always choose the tasktracker's umbilical
+ port. Also switch default address for umbilical connections to
+ loopback. (cutting)
+
+ HADOOP-1812. Let OS choose ports for IPC and RPC unit tests. (cutting)
+
+ HADOOP-1825. Create $HADOOP_PID_DIR when it does not exist.
+ (Michael Bieniosek via cutting)
+
+ HADOOP-1425. Replace uses of ToolBase with the Tool interface.
+ (Enis Soztutar via cutting)
+
+ HADOOP-1569. Reimplement DistCP to use the standard FileSystem/URI
+ code in Hadoop so that you can copy from and to all of the supported file
+    systems. (Chris Douglas via omalley)
+
+    HADOOP-1018. Improve documentation w.r.t handling of lost heartbeats between
+ TaskTrackers and JobTracker. (acmurthy)
+
+ HADOOP-1718. Add ant targets for measuring code coverage with clover.
+ (simonwillnauer via nigel)
+
+ HADOOP-1592. Log error messages to the client console when tasks
+ fail. (Amar Kamat via cutting)
+
+ HADOOP-1879. Remove some unneeded casts. (Nilay Vaish via cutting)
+
+ HADOOP-1878. Add space between priority links on job details
+ page. (Thomas Friol via cutting)
+
+ HADOOP-120. In ArrayWritable, prevent creation with null value
+ class, and improve documentation. (Cameron Pope via cutting)
+
+ HADOOP-1926. Add a random text writer example/benchmark so that we can
+ benchmark compression codecs on random data. (acmurthy via omalley)
+
+    HADOOP-1906. Warn the user if they have an obsolete mapred-default.xml
+ file in their configuration directory. (acmurthy via omalley)
+
+ HADOOP-1971. Warn when job does not specify a jar. (enis via cutting)
+
+ HADOOP-1942. Increase the concurrency of transaction logging to
+ edits log. Reduce the number of syncs by double-buffering the changes
+ to the transaction log. (Dhruba Borthakur)
+
+ HADOOP-2046. Improve mapred javadoc. (Arun C. Murthy via cutting)
+
+ HADOOP-2105. Improve overview.html to clarify supported platforms,
+ software pre-requisites for hadoop, how to install them on various
+    platforms and a better general description of hadoop and its utility.
+ (Jim Kellerman via acmurthy)
+
+
+Release 0.14.4 - 2007-11-26
+
+ BUG FIXES
+
+ HADOOP-2140. Add missing Apache Licensing text at the front of several
+ C and C++ files.
+
+ HADOOP-2169. Fix the DT_SONAME field of libhdfs.so to set it to the
+ correct value of 'libhdfs.so', currently it is set to the absolute path of
+ libhdfs.so. (acmurthy)
+
+ HADOOP-2001. Make the job priority updates and job kills synchronized on
+ the JobTracker. Deadlock was seen in the JobTracker because of the lack of
+ this synchronization. (Arun C Murthy via ddas)
+
+
+Release 0.14.3 - 2007-10-19
+
+ BUG FIXES
+
+ HADOOP-2053. Fixed a dangling reference to a memory buffer in the map
+ output sorter. (acmurthy via omalley)
+
+ HADOOP-2036. Fix a NullPointerException in JvmMetrics class. (nigel)
+
+ HADOOP-2043. Release 0.14.2 was compiled with Java 1.6 rather than
+ Java 1.5. (cutting)
+
+
+Release 0.14.2 - 2007-10-09
+
+ BUG FIXES
+
+ HADOOP-1948. Removed spurious error message during block crc upgrade.
+ (Raghu Angadi via dhruba)
+
+    HADOOP-1862. Reduces are getting stuck trying to find map outputs.
+ (Arun C. Murthy via ddas)
+
+ HADOOP-1977. Fixed handling of ToolBase cli options in JobClient.
+ (enis via omalley)
+
+ HADOOP-1972. Fix LzoCompressor to ensure the user has actually asked
+ to finish compression. (arun via omalley)
+
+ HADOOP-1970. Fix deadlock in progress reporting in the task. (Vivek
+ Ratan via omalley)
+
+ HADOOP-1978. Name-node removes edits.new after a successful startup.
+ (Konstantin Shvachko via dhruba)
+
+ HADOOP-1955. The Namenode tries to not pick the same source Datanode for
+ a replication request if the earlier replication request for the same
+ block and that source Datanode had failed.
+ (Raghu Angadi via dhruba)
+
+ HADOOP-1961. The -get option to dfs-shell works when a single filename
+ is specified. (Raghu Angadi via dhruba)
+
+ HADOOP-1997. TestCheckpoint closes the edits file after writing to it,
+ otherwise the rename of this file on Windows fails.
+ (Konstantin Shvachko via dhruba)
+
+Release 0.14.1 - 2007-09-04
+
+ BUG FIXES
+
+ HADOOP-1740. Fix null pointer exception in sorting map outputs. (Devaraj
+ Das via omalley)
+
+ HADOOP-1790. Fix tasktracker to work correctly on multi-homed
+ boxes. (Torsten Curdt via cutting)
+
+ HADOOP-1798. Fix jobtracker to correctly account for failed
+ tasks. (omalley via cutting)
+
+
+Release 0.14.0 - 2007-08-17
+
+ INCOMPATIBLE CHANGES
+
+ 1. HADOOP-1134.
+ CONFIG/API - dfs.block.size must now be a multiple of
+ io.byte.per.checksum, otherwise new files can not be written.
+ LAYOUT - DFS layout version changed from -6 to -7, which will require an
+ upgrade from previous versions.
+ PROTOCOL - Datanode RPC protocol version changed from 7 to 8.
+
+ 2. HADOOP-1283
+ API - deprecated file locking API.
+
+ 3. HADOOP-894
+ PROTOCOL - changed ClientProtocol to fetch parts of block locations.
+
+ 4. HADOOP-1336
+ CONFIG - Enable speculative execution by default.
+
+ 5. HADOOP-1197
+ API - deprecated method for Configuration.getObject, because
+ Configurations should only contain strings.
+
+ 6. HADOOP-1343
+ API - deprecate Configuration.set(String,Object) so that only strings are
+     put in Configurations.
+
+ 7. HADOOP-1207
+ CLI - Fix FsShell 'rm' command to continue when a non-existent file is
+ encountered.
+
+ 8. HADOOP-1473
+ CLI/API - Job, TIP, and Task id formats have changed and are now unique
+ across job tracker restarts.
+
+ 9. HADOOP-1400
+ API - JobClient constructor now takes a JobConf object instead of a
+ Configuration object.
+
+ NEW FEATURES and BUG FIXES
+
+ 1. HADOOP-1197. In Configuration, deprecate getObject() and add
+ getRaw(), which skips variable expansion. (omalley via cutting)
+
+ 2. HADOOP-1343. In Configuration, deprecate set(String,Object) and
+ implement Iterable. (omalley via cutting)
+
+ 3. HADOOP-1344. Add RunningJob#getJobName(). (Michael Bieniosek via cutting)
+
+ 4. HADOOP-1342. In aggregators, permit one to limit the number of
+ unique values per key. (Runping Qi via cutting)
+
+ 5. HADOOP-1340. Set the replication factor of the MD5 file in the filecache
+ to be the same as the replication factor of the original file.
+ (Dhruba Borthakur via tomwhite.)
+
+ 6. HADOOP-1355. Fix null pointer dereference in
+ TaskLogAppender.append(LoggingEvent). (Arun C Murthy via tomwhite.)
+
+ 7. HADOOP-1357. Fix CopyFiles to correctly avoid removing "/".
+ (Arun C Murthy via cutting)
+
+ 8. HADOOP-234. Add pipes facility, which permits writing MapReduce
+ programs in C++.
+
+ 9. HADOOP-1359. Fix a potential NullPointerException in HDFS.
+ (Hairong Kuang via cutting)
+
+ 10. HADOOP-1364. Fix inconsistent synchronization in SequenceFile.
+ (omalley via cutting)
+
+ 11. HADOOP-1379. Add findbugs target to build.xml.
+ (Nigel Daley via cutting)
+
+ 12. HADOOP-1364. Fix various inconsistent synchronization issues.
+ (Devaraj Das via cutting)
+
+ 13. HADOOP-1393. Remove a potential unexpected negative number from
+ uses of random number generator. (omalley via cutting)
+
+ 14. HADOOP-1387. A number of "performance" code-cleanups suggested
+ by findbugs. (Arun C Murthy via cutting)
+
+ 15. HADOOP-1401. Add contrib/hbase javadoc to tree. (stack via cutting)
+
+ 16. HADOOP-894. Change HDFS so that the client only retrieves a limited
+ number of block locations per request from the namenode.
+ (Konstantin Shvachko via cutting)
+
+ 17. HADOOP-1406. Plug a leak in MapReduce's use of metrics.
+ (David Bowen via cutting)
+
+ 18. HADOOP-1394. Implement "performance" code-cleanups in HDFS
+ suggested by findbugs. (Raghu Angadi via cutting)
+
+ 19. HADOOP-1413. Add example program that uses Knuth's dancing links
+ algorithm to solve pentomino problems. (omalley via cutting)
+
+ 20. HADOOP-1226. Change HDFS so that paths it returns are always
+ fully qualified. (Dhruba Borthakur via cutting)
+
+ 21. HADOOP-800. Improvements to HDFS web-based file browser.
+ (Enis Soztutar via cutting)
+
+ 22. HADOOP-1408. Fix a compiler warning by adding a class to replace
+ a generic. (omalley via cutting)
+
+ 23. HADOOP-1376. Modify RandomWriter example so that it can generate
+ data for the Terasort benchmark. (Devaraj Das via cutting)
+
+ 24. HADOOP-1429. Stop logging exceptions during normal IPC server
+ shutdown. (stack via cutting)
+
+ 25. HADOOP-1461. Fix the synchronization of the task tracker to
+ avoid lockups in job cleanup. (Arun C Murthy via omalley)
+
+ 26. HADOOP-1446. Update the TaskTracker metrics while the task is
+ running. (Devaraj via omalley)
+
+ 27. HADOOP-1414. Fix a number of issues identified by FindBugs as
+ "Bad Practice". (Dhruba Borthakur via cutting)
+
+ 28. HADOOP-1392. Fix "correctness" bugs identified by FindBugs in
+ fs and dfs packages. (Raghu Angadi via cutting)
+
+ 29. HADOOP-1412. Fix "dodgy" bugs identified by FindBugs in fs and
+ io packages. (Hairong Kuang via cutting)
+
+ 30. HADOOP-1261. Remove redundant events from HDFS namenode's edit
+ log when a datanode restarts. (Raghu Angadi via cutting)
+
+ 31. HADOOP-1336. Re-enable speculative execution by
+ default. (omalley via cutting)
+
+ 32. HADOOP-1311. Fix a bug in BytesWritable#set() where start offset
+ was ignored. (Dhruba Borthakur via cutting)
+
+ 33. HADOOP-1450. Move checksumming closer to user code, so that
+ checksums are created before data is stored in large buffers and
+ verified after data is read from large buffers, to better catch
+ memory errors. (cutting)
+
+ 34. HADOOP-1447. Add support in contrib/data_join for text inputs.
+ (Senthil Subramanian via cutting)
+
+ 35. HADOOP-1456. Fix TestDecommission assertion failure by setting
+ the namenode to ignore the load on datanodes while allocating
+ replicas. (Dhruba Borthakur via tomwhite)
+
+ 36. HADOOP-1396. Fix FileNotFoundException on DFS block.
+ (Dhruba Borthakur via tomwhite)
+
+ 37. HADOOP-1467. Remove redundant counters from WordCount example.
+ (Owen O'Malley via tomwhite)
+
+ 38. HADOOP-1139. Log HDFS block transitions at INFO level, to better
+ enable diagnosis of problems. (Dhruba Borthakur via cutting)
+
+ 39. HADOOP-1269. Finer grained locking in HDFS namenode.
+ (Dhruba Borthakur via cutting)
+
+ 40. HADOOP-1438. Improve HDFS documentation, correcting typos and
+ making images appear in PDF. Also update copyright date for all
+ docs. (Luke Nezda via cutting)
+
+ 41. HADOOP-1457. Add counters for monitoring task assignments.
+ (Arun C Murthy via tomwhite)
+
+ 42. HADOOP-1472. Fix so that timed-out tasks are counted as failures
+ rather than as killed. (Arun C Murthy via cutting)
+
+ 43. HADOOP-1234. Fix a race condition in file cache that caused
+ tasktracker to not be able to find cached files.
+ (Arun C Murthy via cutting)
+
+ 44. HADOOP-1482. Fix secondary namenode to roll info port.
+ (Dhruba Borthakur via cutting)
+
+ 45. HADOOP-1300. Improve removal of excess block replicas to be
+ rack-aware. Attempts are now made to keep replicas on more
+ racks. (Hairong Kuang via cutting)
+
+ 46. HADOOP-1417. Disable a few FindBugs checks that generate a lot
+ of spurious warnings. (Nigel Daley via cutting)
+
+ 47. HADOOP-1320. Rewrite RandomWriter example to bypass reduce.
+ (Arun C Murthy via cutting)
+
+ 48. HADOOP-1449. Add some examples to contrib/data_join.
+ (Senthil Subramanian via cutting)
+
+ 49. HADOOP-1459. Fix so that, in HDFS, getFileCacheHints() returns
+ hostnames instead of IP addresses. (Dhruba Borthakur via cutting)
+
+ 50. HADOOP-1493. Permit specification of "java.library.path" system
+ property in "mapred.child.java.opts" configuration property.
+ (Enis Soztutar via cutting)
+
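+    For illustration, a job could pass a native-library search path to its
+    child JVMs roughly as follows (the path and heap size shown are
+    hypothetical):
+
+      JobConf conf = new JobConf();
+      // child task JVMs receive both options via mapred.child.java.opts
+      conf.set("mapred.child.java.opts",
+               "-Xmx200m -Djava.library.path=/opt/hadoop/native/lib");
+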
+ 51. HADOOP-1372. Use LocalDirAllocator for HDFS temporary block
+ files, so that disk space, writability, etc. is considered.
+ (Dhruba Borthakur via cutting)
+
+ 52. HADOOP-1193. Pool allocation of compression codecs. This
+ eliminates a memory leak that could cause OutOfMemoryException,
+ and also substantially improves performance.
+ (Arun C Murthy via cutting)
+
+ 53. HADOOP-1492. Fix a NullPointerException handling version
+ mismatch during datanode registration.
+ (Konstantin Shvachko via cutting)
+
+ 54. HADOOP-1442. Fix handling of zero-length input splits.
+ (Senthil Subramanian via cutting)
+
+ 55. HADOOP-1444. Fix HDFS block id generation to check pending
+ blocks for duplicates. (Dhruba Borthakur via cutting)
+
+ 56. HADOOP-1207. Fix FsShell's 'rm' command to not stop when one of
+ the named files does not exist. (Tsz Wo Sze via cutting)
+
+ 57. HADOOP-1475. Clear tasktracker's file cache before it
+ re-initializes, to avoid confusion. (omalley via cutting)
+
+ 58. HADOOP-1505. Remove spurious stacktrace in ZlibFactory
+ introduced in HADOOP-1093. (Michael Stack via tomwhite)
+
+ 59. HADOOP-1484. Permit one to kill jobs from the web ui. Note that
+ this is disabled by default. One must set
+ "webinterface.private.actions" to enable this.
+ (Enis Soztutar via cutting)
+
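+    A minimal sketch of turning the kill action on, assuming the property
+    is set in the configuration read by the web ui:
+
+      Configuration conf = new Configuration();
+      // off by default; true exposes kill links in the job web ui
+      conf.setBoolean("webinterface.private.actions", true);
+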
+ 60. HADOOP-1003. Remove flushing of namenode edit log from primary
+ namenode lock, increasing namenode throughput.
+ (Dhruba Borthakur via cutting)
+
+ 61. HADOOP-1023. Add links to searchable mail archives.
+ (tomwhite via cutting)
+
+ 62. HADOOP-1504. Fix terminate-hadoop-cluster script in contrib/ec2
+ to only terminate Hadoop instances, and not other instances
+ started by the same user. (tomwhite via cutting)
+
+ 63. HADOOP-1462. Improve task progress reporting. Progress reports
+ are no longer blocking since i/o is performed in a separate
+ thread. Reporting during sorting and other phases is also more
+ consistent. (Vivek Ratan via cutting)
+
+ 64. [ intentionally blank ]
+
+ 65. HADOOP-1453. Remove some unneeded calls to FileSystem#exists()
+ when opening files, reducing the namenode load somewhat.
+ (Raghu Angadi via cutting)
+
+ 66. HADOOP-1489. Fix text input truncation bug due to mark/reset.
+ Add a unittest. (Bwolen Yang via cutting)
+
+ 67. HADOOP-1455. Permit specification of arbitrary job options on
+ pipes command line. (Devaraj Das via cutting)
+
+ 68. HADOOP-1501. Better randomize sending of block reports to
+ namenode, to reduce load spikes. (Dhruba Borthakur via cutting)
+
+ 69. HADOOP-1147. Remove @author tags from Java source files.
+
+ 70. HADOOP-1283. Convert most uses of UTF8 in the namenode to be
+ String. (Konstantin Shvachko via cutting)
+
+ 71. HADOOP-1511. Speedup hbase unit tests. (stack via cutting)
+
+ 72. HADOOP-1517. Remove some synchronization in namenode to permit
+ finer grained locking previously added. (Konstantin Shvachko via cutting)
+
+ 73. HADOOP-1512. Fix failing TestTextInputFormat on Windows.
+ (Senthil Subramanian via nigel)
+
+ 74. HADOOP-1518. Add a session id to job metrics, for use by HOD.
+ (David Bowen via cutting)
+
+ 75. HADOOP-1292. Change 'bin/hadoop fs -get' to first copy files to
+ a temporary name, then rename them to their final name, so that
+ failures don't leave partial files. (Tsz Wo Sze via cutting)
+
+ 76. HADOOP-1377. Add support for modification time to FileSystem and
+ implement in HDFS and local implementations. Also, alter access
+ to file properties to be through a new FileStatus interface.
+ (Dhruba Borthakur via cutting)
+
+ 77. HADOOP-1515. Add MultiFileInputFormat, which can pack multiple,
+ typically small, input files into each split. (Enis Soztutar via cutting)
+
+ 78. HADOOP-1514. Make reducers report progress while waiting for map
+ outputs, so they're not killed. (Vivek Ratan via cutting)
+
+ 79. HADOOP-1508. Add an Ant task for FsShell operations. Also add
+ new FsShell commands "touchz", "test" and "stat".
+ (Chris Douglas via cutting)
+
+ 80. HADOOP-1028. Add log messages for server startup and shutdown.
+ (Tsz Wo Sze via cutting)
+
+ 81. HADOOP-1485. Add metrics for monitoring shuffle.
+ (Devaraj Das via cutting)
+
+ 82. HADOOP-1536. Remove file locks from libhdfs tests.
+ (Dhruba Borthakur via nigel)
+
+ 83. HADOOP-1520. Add appropriate synchronization to FSEditsLog.
+ (Dhruba Borthakur via nigel)
+
+ 84. HADOOP-1513. Fix a race condition in directory creation.
+ (Devaraj via omalley)
+
+ 85. HADOOP-1546. Remove spurious column from HDFS web UI.
+ (Dhruba Borthakur via cutting)
+
+ 86. HADOOP-1556. Make LocalJobRunner delete working files at end of
+ job run. (Devaraj Das via tomwhite)
+
+ 87. HADOOP-1571. Add contrib lib directories to root build.xml
+ javadoc classpath. (Michael Stack via tomwhite)
+
+ 88. HADOOP-1554. Log killed tasks to the job history and display them on the
+ web/ui. (Devaraj Das via omalley)
+
+ 89. HADOOP-1533. Add persistent error logging for distcp. The logs are stored
+ into a specified hdfs directory. (Senthil Subramanian via omalley)
+
+ 90. HADOOP-1286. Add support to HDFS for distributed upgrades, which
+ permits coordinated upgrade of datanode data.
+ (Konstantin Shvachko via cutting)
+
+ 91. HADOOP-1580. Improve contrib/streaming so that subprocess exit
+ status is displayed for errors. (John Heidemann via cutting)
+
+ 92. HADOOP-1448. In HDFS, randomize lists of non-local block
+ locations returned to client, so that load is better balanced.
+ (Hairong Kuang via cutting)
+
+ 93. HADOOP-1578. Fix datanode to send its storage id to namenode
+ during registration. (Konstantin Shvachko via cutting)
+
+ 94. HADOOP-1584. Fix a bug in GenericWritable which limited it to
+ 128 types instead of 256. (Espen Amble Kolstad via cutting)
+
+ 95. HADOOP-1473. Make job ids unique across jobtracker restarts.
+ (omalley via cutting)
+
+ 96. HADOOP-1582. Fix hdfslib to return 0 instead of -1 at
+ end-of-file, per C conventions. (Christian Kunz via cutting)
+
+ 97. HADOOP-911. Fix a multithreading bug in libhdfs.
+ (Christian Kunz)
+
+ 98. HADOOP-1486. Fix so that fatal exceptions in namenode cause it
+ to exit. (Dhruba Borthakur via cutting)
+
+ 99. HADOOP-1470. Factor checksum generation and validation out of
+ ChecksumFileSystem so that it can be reused by FileSystems with
+ built-in checksumming. (Hairong Kuang via cutting)
+
+100. HADOOP-1590. Use relative urls in jobtracker jsp pages, so that
+ webapp can be used in non-root contexts. (Thomas Friol via cutting)
+
+101. HADOOP-1596. Fix the parsing of taskids by streaming and improve the
+ error reporting. (omalley)
+
+102. HADOOP-1535. Fix the user-controlled grouping to the reduce function.
+ (Vivek Ratan via omalley)
+
+103. HADOOP-1585. Modify GenericWritable to declare the classes as subtypes
+ of Writable. (Espen Amble Kolstad via omalley)
+
+104. HADOOP-1576. Fix errors in count of completed tasks when
+ speculative execution is enabled. (Arun C Murthy via cutting)
+
+105. HADOOP-1598. Fix license headers: add missing ones and update old ones.
+ (Enis Soztutar via cutting)
+
+106. HADOOP-1547. Provide examples for aggregate library.
+ (Runping Qi via tomwhite)
+
+107. HADOOP-1570. Permit jobs to enable and disable the use of
+ hadoop's native library. (Arun C Murthy via cutting)
+
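+    A sketch of the per-job toggle, assuming the boolean property involved
+    is hadoop.native.lib:
+
+      JobConf conf = new JobConf();
+      // false makes this job fall back to the pure-Java codecs
+      conf.setBoolean("hadoop.native.lib", false);
+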
+108. HADOOP-1433. Add job priority. (Johan Oskarsson via tomwhite)
+
+109. HADOOP-1597. Add status reports and post-upgrade options to HDFS
+ distributed upgrade. (Konstantin Shvachko via cutting)
+
+110. HADOOP-1524. Permit user task logs to appear as they're
+ created. (Michael Bieniosek via cutting)
+
+111. HADOOP-1599. Fix distcp bug on Windows. (Senthil Subramanian via cutting)
+
+112. HADOOP-1562. Add JVM metrics, including GC and logging stats.
+ (David Bowen via cutting)
+
+113. HADOOP-1613. Fix "DFS Health" page to display correct time of
+ last contact. (Dhruba Borthakur via cutting)
+
+114. HADOOP-1134. Add optimized checksum support to HDFS. Checksums
+ are now stored with each block, rather than as parallel files.
+ This reduces the namenode's memory requirements and increases
+ data integrity. (Raghu Angadi via cutting)
+
+115. HADOOP-1400. Make JobClient retry requests, so that clients can
+ survive jobtracker problems. (omalley via cutting)
+
+116. HADOOP-1564. Add unit tests for HDFS block-level checksums.
+ (Dhruba Borthakur via cutting)
+
+117. HADOOP-1620. Reduce the number of abstract FileSystem methods,
+ simplifying implementations. (cutting)
+
+118. HADOOP-1625. Fix a "could not move files" exception in datanode.
+ (Raghu Angadi via cutting)
+
+119. HADOOP-1624. Fix an infinite loop in datanode. (Raghu Angadi via cutting)
+
+120. HADOOP-1084. Switch mapred file cache to use file modification
+ time instead of checksum to detect file changes, as checksums are
+ no longer easily accessed. (Arun C Murthy via cutting)
+
+130. HADOOP-1623. Fix an infinite loop when copying directories.
+ (Dhruba Borthakur via cutting)
+
+131. HADOOP-1603. Fix a bug in namenode initialization where
+ default replication is sometimes reset to one on restart.
+ (Raghu Angadi via cutting)
+
+132. HADOOP-1635. Remove hardcoded keypair name and fix launch-hadoop-cluster
+ to support later versions of ec2-api-tools. (Stu Hood via tomwhite)
+
+133. HADOOP-1638. Fix contrib EC2 scripts to support NAT addressing.
+ (Stu Hood via tomwhite)
+
+134. HADOOP-1632. Fix an IllegalArgumentException in fsck.
+ (Hairong Kuang via cutting)
+
+135. HADOOP-1619. Fix FSInputChecker to not attempt to read past EOF.
+ (Hairong Kuang via cutting)
+
+136. HADOOP-1640. Fix TestDecommission on Windows.
+ (Dhruba Borthakur via cutting)
+
+137. HADOOP-1587. Fix TestSymLink to get required system properties.
+ (Devaraj Das via omalley)
+
+138. HADOOP-1628. Add block CRC protocol unit tests. (Raghu Angadi via omalley)
+
+139. HADOOP-1653. FSDirectory code-cleanups. FSDirectory.INode
+ becomes a static class. (Christophe Taton via dhruba)
+
+140. HADOOP-1066. Restructure documentation to make more user
+ friendly. (Connie Kleinjans and Jeff Hammerbacher via cutting)
+
+141. HADOOP-1551. libhdfs supports setting replication factor and
+ retrieving modification time of files. (Sameer Paranjpye via dhruba)
+
+141. HADOOP-1647. FileSystem.getFileStatus returns valid values for "/".
+ (Dhruba Borthakur via dhruba)
+
+142. HADOOP-1657. Fix NNBench to ensure that the block size is a
+ multiple of io.bytes.per.checksum. (Raghu Angadi via dhruba)
+
+143. HADOOP-1553. Replace the user task output and log capture code with shell
+    redirection instead of copier threads in the TaskTracker. The output size
+    cap is now applied via an in-memory tail, so the retained output should not
+    be large. The tasklog servlet's output is no longer forced into UTF8 and is
+    no longer buffered entirely in memory. (omalley)
+ Configuration changes to hadoop-default.xml:
+ remove mapred.userlog.num.splits
+ remove mapred.userlog.purge.splits
+ change default mapred.userlog.limit.kb to 0 (no limit)
+ change default mapred.userlog.retain.hours to 24
+ Configuration changes to log4j.properties:
+ remove log4j.appender.TLA.noKeepSplits
+ remove log4j.appender.TLA.purgeLogSplits
+ remove log4j.appender.TLA.logsRetainHours
+ URL changes:
+ http://<tasktracker>/tasklog.jsp -> http://<tasktracker>/tasklog with
+ parameters limited to start and end, which may be positive (from
+ start) or negative (from end).
+ Environment:
+ require bash (v2 or later) and tail
+
+144. HADOOP-1659. Fix a job id/job name mixup. (Arun C. Murthy via omalley)
+
+145. HADOOP-1665. With HDFS Trash enabled, when the same file is created
+    and deleted more than once, each succeeding deletion creates a Trash item
+    name suffixed with an integer. (Dhruba Borthakur via dhruba)
+
+146. HADOOP-1666. FsShell object can be used for multiple fs commands.
+ (Dhruba Borthakur via dhruba)
+
+147. HADOOP-1654. Remove performance regression introduced by Block CRC.
+ (Raghu Angadi via dhruba)
+
+148. HADOOP-1680. Improvements to Block CRC upgrade messages.
+ (Raghu Angadi via dhruba)
+
+149. HADOOP-71. Allow Text and SequenceFile Map/Reduce inputs from non-default
+ filesystems. (omalley)
+
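+    A sketch of naming such an input with a fully qualified path, assuming
+    the era's JobConf.addInputPath method (the bucket is hypothetical):
+
+      JobConf conf = new JobConf();
+      // the URI scheme selects the filesystem instead of the default
+      conf.addInputPath(new Path("s3://example-bucket/logs"));
+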
+150. HADOOP-1568. Expose HDFS as an xml/http filesystem to provide
+    cross-version compatibility. (Chris Douglas via omalley)
+
+151. HADOOP-1668. Added an INCOMPATIBILITY section to CHANGES.txt. (nigel)
+
+152. HADOOP-1629. Added an upgrade test for HADOOP-1134.
+ (Raghu Angadi via nigel)
+
+153. HADOOP-1698. Fix performance problems on map output sorting for jobs
+ with large numbers of reduces. (Devaraj Das via omalley)
+
+154. HADOOP-1716. Fix a Pipes wordcount example to remove the 'file:'
+ scheme from its output path. (omalley via cutting)
+
+155. HADOOP-1714. Fix TestDFSUpgradeFromImage to work on Windows.
+ (Raghu Angadi via nigel)
+
+156. HADOOP-1663. Return a non-zero exit code if streaming fails. (Lohit Renu
+ via omalley)
+
+157. HADOOP-1712. Fix an unhandled exception on datanode during block
+ CRC upgrade. (Raghu Angadi via cutting)
+
+158. HADOOP-1717. Fix TestDFSUpgradeFromImage to work on Solaris.
+ (nigel via cutting)
+
+159. HADOOP-1437. Add Eclipse plugin in contrib.
+ (Eugene Hung and Christophe Taton via cutting)
+
+
+Release 0.13.0 - 2007-06-08
+
+ 1. HADOOP-1047. Fix TestReplication to succeed more reliably.
+ (Hairong Kuang via cutting)
+
+ 2. HADOOP-1063. Fix a race condition in MiniDFSCluster test code.
+ (Hairong Kuang via cutting)
+
+ 3. HADOOP-1101. In web ui, split shuffle statistics from reduce
+ statistics, and add some task averages. (Devaraj Das via cutting)
+
+ 4. HADOOP-1071. Improve handling of protocol version mismatch in
+ JobTracker. (Tahir Hashmi via cutting)
+
+ 5. HADOOP-1116. Increase heap size used for contrib unit tests.
+ (Philippe Gassmann via cutting)
+
+ 6. HADOOP-1120. Add contrib/data_join, tools to simplify joining
+ data from multiple sources using MapReduce. (Runping Qi via cutting)
+
+ 7. HADOOP-1064. Reduce log level of some DFSClient messages.
+ (Dhruba Borthakur via cutting)
+
+ 8. HADOOP-1137. Fix StatusHttpServer to work correctly when
+ resources are in a jar file. (Benjamin Reed via cutting)
+
+ 9. HADOOP-1094. Optimize generated Writable implementations for
+ records to not allocate a new BinaryOutputArchive or
+ BinaryInputArchive per call. (Milind Bhandarkar via cutting)
+
+10. HADOOP-1068. Improve error message for clusters with 0 datanodes.
+ (Dhruba Borthakur via tomwhite)
+
+11. HADOOP-1122. Fix divide-by-zero exception in FSNamesystem
+ chooseTarget method. (Dhruba Borthakur via tomwhite)
+
+12. HADOOP-1131. Add a closeAll() static method to FileSystem.
+ (Philippe Gassmann via tomwhite)
+
+13. HADOOP-1085. Improve port selection in HDFS and MapReduce test
+ code. Ports are now selected by the OS during testing rather than
+ by probing for free ports, improving test reliability.
+ (Arun C Murthy via cutting)
+
+14. HADOOP-1153. Fix HDFS daemons to correctly stop their threads.
+ (Konstantin Shvachko via cutting)
+
+15. HADOOP-1146. Add a counter for reduce input keys and rename the
+ "reduce input records" counter to be "reduce input groups".
+ (David Bowen via cutting)
+
+16. HADOOP-1165. In records, replace identical generated toString
+ methods with a method on the base class. (Milind Bhandarkar via cutting)
+
+17. HADOOP-1164. Fix TestReplicationPolicy to specify port zero, so
+ that a free port is automatically selected. (omalley via cutting)
+
+18. HADOOP-1166. Add a NullOutputFormat and use it in the
+ RandomWriter example. (omalley via cutting)
+
+19. HADOOP-1169. Fix a cut/paste error in CopyFiles utility so that
+ S3-based source files are correctly copied. (Michael Stack via cutting)
+
+20. HADOOP-1167. Remove extra synchronization in InMemoryFileSystem.
+ (omalley via cutting)
+
+21. HADOOP-1110. Fix an off-by-one error counting map inputs.
+ (David Bowen via cutting)
+
+22. HADOOP-1178. Fix a NullPointerException during namenode startup.
+ (Dhruba Borthakur via cutting)
+
+23. HADOOP-1011. Fix a ConcurrentModificationException when viewing
+ job history. (Tahir Hashmi via cutting)
+
+24. HADOOP-672. Improve help for fs shell commands.
+ (Dhruba Borthakur via cutting)
+
+25. HADOOP-1170. Improve datanode performance by removing device
+ checks from common operations. (Igor Bolotin via cutting)
+
+26. HADOOP-1090. Fix SortValidator's detection of whether the input
+ file belongs to the sort-input or sort-output directory.
+ (Arun C Murthy via tomwhite)
+
+27. HADOOP-1081. Fix bin/hadoop on Darwin. (Michael Bieniosek via cutting)
+
+28. HADOOP-1045. Add contrib/hbase, a BigTable-like online database.
+ (Jim Kellerman via cutting)
+
+29. HADOOP-1156. Fix a NullPointerException in MiniDFSCluster.
+ (Hairong Kuang via cutting)
+
+30. HADOOP-702. Add tools to help automate HDFS upgrades.
+ (Konstantin Shvachko via cutting)
+
+31. HADOOP-1163. Fix ganglia metrics to aggregate metrics from different
+ hosts properly. (Michael Bieniosek via tomwhite)
+
+32. HADOOP-1194. Use record-level compression style for map output
+ compression. (Arun C Murthy via tomwhite)
+
+33. HADOOP-1187. Improve DFS Scalability: avoid scanning entire list of
+ datanodes in getAdditionalBlocks. (Dhruba Borthakur via tomwhite)
+
+34. HADOOP-1133. Add tool to analyze and debug namenode on a production
+ cluster. (Dhruba Borthakur via tomwhite)
+
+35. HADOOP-1151. Remove spurious printing to stderr in streaming
+ PipeMapRed. (Koji Noguchi via tomwhite)
+
+36. HADOOP-988. Change namenode to use a single map of blocks to metadata.
+ (Raghu Angadi via tomwhite)
+
+37. HADOOP-1203. Change UpgradeUtilities used by DFS tests to use
+ MiniDFSCluster to start and stop NameNode/DataNodes.
+ (Nigel Daley via tomwhite)
+
+38. HADOOP-1217. Add test.timeout property to build.xml, so that
+ long-running unit tests may be automatically terminated.
+ (Nigel Daley via cutting)
+
+39. HADOOP-1149. Improve DFS Scalability: make
+ processOverReplicatedBlock() a no-op if blocks are not
+ over-replicated. (Raghu Angadi via tomwhite)
+
+40. HADOOP-1149. Improve DFS Scalability: optimize getDistance(),
+ contains(), and isOnSameRack() in NetworkTopology.
+ (Hairong Kuang via tomwhite)
+
+41. HADOOP-1218. Make synchronization on TaskTracker's RunningJob
+ object consistent. (Devaraj Das via tomwhite)
+
+42. HADOOP-1219. Ignore progress reports once a task has reported as
+ 'done'. (Devaraj Das via tomwhite)
+
+43. HADOOP-1114. Permit user to specify additional CLASSPATH elements
+ with a HADOOP_CLASSPATH environment variable. (cutting)
+
+44. HADOOP-1198. Remove ipc.client.timeout parameter override from
+ unit test configuration. Using the default is more robust and
+ has almost the same run time. (Arun C Murthy via tomwhite)
+
+45. HADOOP-1211. Remove deprecated constructor and unused static
+ members in DataNode class. (Konstantin Shvachko via tomwhite)
+
+46. HADOOP-1136. Fix ArrayIndexOutOfBoundsException in
+ FSNamesystem$UnderReplicatedBlocks add() method.
+ (Hairong Kuang via tomwhite)
+
+47. HADOOP-978. Add the client name and the address of the node that
+ previously started to create the file to the description of
+ AlreadyBeingCreatedException. (Konstantin Shvachko via tomwhite)
+
+48. HADOOP-1001. Check the type of keys and values generated by the
+ mapper against the types specified in JobConf.
+ (Tahir Hashmi via tomwhite)
+
+49. HADOOP-971. Improve DFS Scalability: Improve name node performance
+ by adding a hostname to datanodes map. (Hairong Kuang via tomwhite)
+
+50. HADOOP-1189. Fix 'No space left on device' exceptions on datanodes.
+ (Raghu Angadi via tomwhite)
+
+51. HADOOP-819. Change LineRecordWriter to not insert a tab between
+ key and value when either is null, and to print nothing when both
+ are null. (Runping Qi via cutting)
+
+52. HADOOP-1204. Rename InputFormatBase to be FileInputFormat, and
+ deprecate InputFormatBase. Also make LineRecordReader easier to
+ extend. (Runping Qi via cutting)
+
+53. HADOOP-1213. Improve logging of errors by IPC server, to
+ consistently include the service name and the call. (cutting)
+
+54. HADOOP-1238. Fix metrics reporting by TaskTracker to correctly
+ track maps_running and reduces_running.
+ (Michael Bieniosek via cutting)
+
+55. HADOOP-1093. Fix a race condition in HDFS where blocks were
+ sometimes erased before they were reported written.
+ (Dhruba Borthakur via cutting)
+
+56. HADOOP-1239. Add a package name to some testjar test classes.
+ (Jim Kellerman via cutting)
+
+57. HADOOP-1241. Fix NullPointerException in processReport when
+ namenode is restarted. (Dhruba Borthakur via tomwhite)
+
+58. HADOOP-1244. Fix stop-dfs.sh to no longer incorrectly specify
+ slaves file for stopping datanode.
+ (Michael Bieniosek via tomwhite)
+
+59. HADOOP-1253. Fix ConcurrentModificationException and
+ NullPointerException in JobControl.
+ (Johan Oskarsson via tomwhite)
+
+60. HADOOP-1256. Fix NameNode so that multiple DataNodeDescriptors
+ can no longer be created on startup. (Hairong Kuang via cutting)
+
+61. HADOOP-1214. Replace streaming classes with new counterparts
+ from Hadoop core. (Runping Qi via tomwhite)
+
+62. HADOOP-1250. Move a chmod utility from streaming to FileUtil.
+ (omalley via cutting)
+
+63. HADOOP-1258. Fix TestCheckpoint test case to wait for
+ MiniDFSCluster to be active. (Nigel Daley via tomwhite)
+
+64. HADOOP-1148. Re-indent all Java source code to consistently use
+ two spaces per indent level. (cutting)
+
+65. HADOOP-1251. Add a method to Reporter to get the map InputSplit.
+ (omalley via cutting)
+
+66. HADOOP-1224. Fix "Browse the filesystem" link to no longer point
+ to dead datanodes. (Enis Soztutar via tomwhite)
+
+67. HADOOP-1154. Fail a streaming task if the threads reading from or
+ writing to the streaming process fail. (Koji Noguchi via tomwhite)
+
+68. HADOOP-968. Move shuffle and sort to run in reduce's child JVM,
+ rather than in TaskTracker. (Devaraj Das via cutting)
+
+69. HADOOP-1111. Add support for client notification of job
+ completion. If the job configuration has a job.end.notification.url
+ property it will make a HTTP GET request to the specified URL.
+ The number of retries and the interval between retries is also
+ configurable. (Alejandro Abdelnur via tomwhite)
+
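+    A sketch of enabling the notification (the endpoint is hypothetical;
+    the $jobId and $jobStatus substitutions and the retry property names
+    are assumptions about this feature):
+
+      JobConf conf = new JobConf();
+      conf.set("job.end.notification.url",
+               "http://example.com/done?id=$jobId&status=$jobStatus");
+      conf.setInt("job.end.retry.attempts", 3);      // assumed knob
+      conf.setInt("job.end.retry.interval", 30000);  // assumed knob, in ms
+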
+70. HADOOP-1275. Fix misspelled job notification property in
+ hadoop-default.xml. (Alejandro Abdelnur via tomwhite)
+
+71. HADOOP-1152. Fix race condition in MapOutputCopier.copyOutput file
+ rename causing possible reduce task hang.
+ (Tahir Hashmi via tomwhite)
+
+72. HADOOP-1050. Distinguish between failed and killed tasks so as to
+ not count a lost tasktracker against the job.
+ (Arun C Murthy via tomwhite)
+
+73. HADOOP-1271. Fix StreamBaseRecordReader to be able to log record
+ data that's not UTF-8. (Arun C Murthy via tomwhite)
+
+74. HADOOP-1190. Fix unchecked warnings in main Hadoop code.
+ (tomwhite)
+
+75. HADOOP-1127. Fix AlreadyBeingCreatedException in namenode for
+ jobs run with speculative execution.
+ (Arun C Murthy via tomwhite)
+
+76. HADOOP-1282. Omnibus HBase patch. Improved tests & configuration.
+ (Jim Kellerman via cutting)
+
+77. HADOOP-1262. Make dfs client try to read from a different replica
+ of the checksum file when a checksum error is detected.
+ (Hairong Kuang via tomwhite)
+
+78. HADOOP-1279. Fix JobTracker to maintain list of recently
+ completed jobs by order of completion, not submission.
+ (Arun C Murthy via cutting)
+
+79. HADOOP-1284. In contrib/streaming, permit flexible specification
+ of field delimiter and fields for partitioning and sorting.
+ (Runping Qi via cutting)
+
+80. HADOOP-1176. Fix a bug where reduce would hang when a map had
+ more than 2GB of output for it. (Arun C Murthy via cutting)
+
+81. HADOOP-1293. Fix contrib/streaming to print more than the first
+ twenty lines of standard error. (Koji Noguchi via cutting)
+
+82. HADOOP-1297. Fix datanode so that requests to remove blocks that
+ do not exist no longer causes block reports to be re-sent every
+ second. (Dhruba Borthakur via cutting)
+
+83. HADOOP-1216. Change MapReduce so that, when numReduceTasks is
+ zero, map outputs are written directly as final output, skipping
+ shuffle, sort and reduce. Use this to implement reduce=NONE
+ option in contrib/streaming. (Runping Qi via cutting)
+
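+    In the Java API this map-only mode amounts to:
+
+      JobConf conf = new JobConf();
+      // zero reduces: map outputs are written directly as final output
+      conf.setNumReduceTasks(0);
+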
+84. HADOOP-1294. Fix unchecked warnings in main Hadoop code under
+ Java 6. (tomwhite)
+
+85. HADOOP-1299. Fix so that RPC will restart after RPC.stopClient()
+ has been called. (Michael Stack via cutting)
+
+86. HADOOP-1278. Improve blacklisting of TaskTrackers by JobTracker,
+ to reduce false positives. (Arun C Murthy via cutting)
+
+87. HADOOP-1290. Move contrib/abacus into mapred/lib/aggregate.
+ (Runping Qi via cutting)
+
+88. HADOOP-1272. Extract inner classes from FSNamesystem into separate
+ classes. (Dhruba Borthakur via tomwhite)
+
+89. HADOOP-1247. Add support to contrib/streaming for aggregate
+ package, formerly called Abacus. (Runping Qi via cutting)
+
+90. HADOOP-1061. Fix bug in listing files in the S3 filesystem.
+ NOTE: this change is not backwards compatible! You should use the
+ MigrationTool supplied to migrate existing S3 filesystem data to
+ the new format. Please backup your data first before upgrading
+ (using 'hadoop distcp' for example). (tomwhite)
+
+91. HADOOP-1304. Make configurable the maximum number of task
+ attempts before a job fails. (Devaraj Das via cutting)
+
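+    For example, assuming the JobConf setters tied to this change:
+
+      JobConf conf = new JobConf();
+      conf.setMaxMapAttempts(4);     // per-map retry budget
+      conf.setMaxReduceAttempts(4);  // per-reduce retry budget
+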
+92. HADOOP-1308. Use generics to restrict types when classes are
+ passed as parameters to JobConf methods. (Michael Bieniosek via cutting)
+
+93. HADOOP-1312. Fix a ConcurrentModificationException in NameNode
+ that killed the heartbeat monitoring thread.
+ (Dhruba Borthakur via cutting)
+
+94. HADOOP-1315. Clean up contrib/streaming, switching it to use core
+ classes more and removing unused code. (Runping Qi via cutting)
+
+95. HADOOP-485. Allow a different comparator for grouping keys in
+ calls to reduce. (Tahir Hashmi via cutting)
+
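+    A sketch of the hook this adds (MyGroupComparator is a hypothetical
+    comparator class):
+
+      JobConf conf = new JobConf();
+      // keys equal under this comparator share one reduce() invocation
+      conf.setOutputValueGroupingComparator(MyGroupComparator.class);
+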
+96. HADOOP-1322. Fix TaskTracker blacklisting to work correctly in
+ one- and two-node clusters. (Arun C Murthy via cutting)
+
+97. HADOOP-1144. Permit one to specify a maximum percentage of tasks
+ that can fail before a job is aborted. The default is zero.
+ (Arun C Murthy via cutting)
+
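+    For example, to tolerate up to 5% failed tasks of each kind before
+    aborting, assuming the JobConf setters added for this feature:
+
+      JobConf conf = new JobConf();
+      conf.setMaxMapTaskFailuresPercent(5);
+      conf.setMaxReduceTaskFailuresPercent(5);
+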
+98. HADOOP-1184. Fix HDFS decommissioning to complete when the only
+ copy of a block is on a decommissioned node. (Dhruba Borthakur via cutting)
+
+99. HADOOP-1263. Change DFSClient to retry certain namenode calls
+ with a random, exponentially increasing backoff time, to avoid
+ overloading the namenode on, e.g., job start. (Hairong Kuang via cutting)
+
+100. HADOOP-1325. First complete, functioning version of HBase.
+ (Jim Kellerman via cutting)
+
+101. HADOOP-1276. Make tasktracker expiry interval configurable.
+ (Arun C Murthy via cutting)
+
+102. HADOOP-1326. Change JobClient#RunJob() to return the job.
+ (omalley via cutting)
+
+103. HADOOP-1270. Randomize the fetch of map outputs, speeding the
+ shuffle. (Arun C Murthy via cutting)
+
+104. HADOOP-1200. Restore disk checking lost in HADOOP-1170.
+ (Hairong Kuang via cutting)
+
+105. HADOOP-1252. Changed MapReduce's allocation of local files to
+ use round-robin among available devices, rather than a hashcode.
+ More care is also taken to not allocate files on full or offline
+ drives. (Devaraj Das via cutting)
+
+106. HADOOP-1324. Change so that an FSError kills only the task that
+ generates it rather than the entire task tracker.
+ (Arun C Murthy via cutting)
+
+107. HADOOP-1310. Fix unchecked warnings in aggregate code. (tomwhite)
+
+108. HADOOP-1255. Fix a bug where the namenode falls into an infinite
+ loop trying to remove a dead node. (Hairong Kuang via cutting)
+
+109. HADOOP-1160. Fix DistributedFileSystem.close() to close the
+ underlying FileSystem, correctly aborting files being written.
+ (Hairong Kuang via cutting)
+
+110. HADOOP-1341. Fix intermittent failures in HBase unit tests
+ caused by deadlock. (Jim Kellerman via cutting)
+
+111. HADOOP-1350. Fix shuffle performance problem caused by forcing
+ chunked encoding of map outputs. (Devaraj Das via cutting)
+
+112. HADOOP-1345. Fix HDFS to correctly retry another replica when a
+ checksum error is encountered. (Hairong Kuang via cutting)
+
+113. HADOOP-1205. Improve synchronization around HDFS block map.
+ (Hairong Kuang via cutting)
+
+114. HADOOP-1353. Fix a potential NullPointerException in namenode.
+ (Dhruba Borthakur via cutting)
+
+115. HADOOP-1354. Fix a potential NullPointerException in FsShell.
+ (Hairong Kuang via cutting)
+
+116. HADOOP-1358. Fix a potential bug when DFSClient calls skipBytes.
+ (Hairong Kuang via cutting)
+
+117. HADOOP-1356. Fix a bug in ValueHistogram. (Runping Qi via cutting)
+
+118. HADOOP-1363. Fix locking bug in JobClient#waitForCompletion().
+ (omalley via cutting)
+
+119. HADOOP-1368. Fix inconsistent synchronization in JobInProgress.
+ (omalley via cutting)
+
+120. HADOOP-1369. Fix inconsistent synchronization in TaskTracker.
+ (omalley via cutting)
+
+121. HADOOP-1361. Fix various calls to skipBytes() to check return
+ value. (Hairong Kuang via cutting)
+
+122. HADOOP-1388. Fix a potential NullPointerException in web ui.
+ (Devaraj Das via cutting)
+
+123. HADOOP-1385. Fix MD5Hash#hashCode() to generally hash to more
+ than 256 values. (omalley via cutting)
+
+124. HADOOP-1386. Fix Path to not permit the empty string as a
+ path, as this has led to accidental file deletion. Instead
+ force applications to use "." to name the default directory.
+ (Hairong Kuang via cutting)
+
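+    A sketch of the resulting usage:
+
+      Path cwd = new Path(".");  // the default directory is named explicitly
+      Path bad = new Path("");   // the empty string is now rejected
+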
+125. HADOOP-1407. Fix integer division bug in JobInProgress which
+ meant failed tasks didn't cause the job to fail.
+ (Arun C Murthy via tomwhite)
+
+126. HADOOP-1427. Fix a typo that caused GzipCodec to incorrectly use
+ a very small input buffer. (Espen Amble Kolstad via cutting)
+
+127. HADOOP-1435. Fix globbing code to no longer use the empty string
+ to indicate the default directory, per HADOOP-1386.
+ (Hairong Kuang via cutting)
+
+128. HADOOP-1411. Make task retry framework handle
+ AlreadyBeingCreatedException when wrapped as a RemoteException.
+ (Hairong Kuang via tomwhite)
+
+129. HADOOP-1242. Improve handling of DFS upgrades.
+ (Konstantin Shvachko via cutting)
+
+130. HADOOP-1332. Fix so that TaskTracker exits reliably during unit
+ tests on Windows. (omalley via cutting)
+
+131. HADOOP-1431. Fix sort progress reporting during map so that it runs
+    only while sorting, and stuck maps are correctly terminated.
+ (Devaraj Das and Arun C Murthy via cutting)
+
+132. HADOOP-1452. Change TaskTracker.MapOutputServlet.doGet.totalRead
+ to a long, permitting map outputs to exceed 2^31 bytes.
+ (omalley via cutting)
+
+133. HADOOP-1443. Fix a bug opening zero-length files in HDFS.
+ (Konstantin Shvachko via cutting)
+
+
+Release 0.12.3 - 2007-04-06
+
+ 1. HADOOP-1162. Fix bug in record CSV and XML serialization of
+ binary values. (Milind Bhandarkar via cutting)
+
+ 2. HADOOP-1123. Fix NullPointerException in LocalFileSystem when
+ trying to recover from a checksum error.
+ (Hairong Kuang & Nigel Daley via tomwhite)
+
+ 3. HADOOP-1177. Fix bug where IOException in MapOutputLocation.getFile
+ was not being logged. (Devaraj Das via tomwhite)
+
+ 4. HADOOP-1175. Fix bugs in JSP for displaying a task's log messages.
+ (Arun C Murthy via cutting)
+
+ 5. HADOOP-1191. Fix map tasks to wait until sort progress thread has
+ stopped before reporting the task done. (Devaraj Das via cutting)
+
+ 6. HADOOP-1192. Fix an integer overflow bug in FSShell's 'dus'
+ command and a performance problem in HDFS's implementation of it.
+ (Hairong Kuang via cutting)
+
+ 7. HADOOP-1105. Fix reducers to make "progress" while iterating
+ through values. (Devaraj Das & Owen O'Malley via tomwhite)
+
+ 8. HADOOP-1179. Make Task Tracker close index file as soon as the read
+ is done when serving get-map-output requests.
+ (Devaraj Das via tomwhite)
+
+
+Release 0.12.2 - 2007-03-23
+
+ 1. HADOOP-1135. Fix bug in block report processing which may cause
+ the namenode to delete blocks. (Dhruba Borthakur via tomwhite)
+
+ 2. HADOOP-1145. Make XML serializer and deserializer classes public
+ in record package. (Milind Bhandarkar via cutting)
+
+ 3. HADOOP-1140. Fix a deadlock in metrics. (David Bowen via cutting)
+
+ 4. HADOOP-1150. Fix streaming -reducer and -mapper to give them
+ defaults. (Owen O'Malley via tomwhite)
+
+
+Release 0.12.1 - 2007-03-17
+
+ 1. HADOOP-1035. Fix a StackOverflowError in FSDataSet.
+ (Raghu Angadi via cutting)
+
+ 2. HADOOP-1053. Fix VInt representation of negative values. Also
+ remove references in generated record code to methods outside of
+ the record package and improve some record documentation.
+ (Milind Bhandarkar via cutting)
+
+ 3. HADOOP-1067. Fix compilation failure when a Checkstyle jar is present
+ in the lib directory. Also remove dependency on a particular Checkstyle
+ version number. (tomwhite)
+
+ 4. HADOOP-1060. Fix an IndexOutOfBoundsException in the JobTracker
+ that could cause jobs to hang. (Arun C Murthy via cutting)
+
+ 5. HADOOP-1077. Fix a race condition fetching map outputs that could
+ hang reduces. (Devaraj Das via cutting)
+
+ 6. HADOOP-1083. Fix so that when a cluster restarts with a missing
+ datanode, its blocks are replicated. (Hairong Kuang via cutting)
+
+ 7. HADOOP-1082. Fix a NullPointerException in ChecksumFileSystem.
+ (Hairong Kuang via cutting)
+
+ 8. HADOOP-1088. Fix record serialization of negative values.
+ (Milind Bhandarkar via cutting)
+
+ 9. HADOOP-1080. Fix bug in bin/hadoop on Windows when native
+ libraries are present. (ab via cutting)
+
+10. HADOOP-1091. Fix a NullPointerException in MetricsRecord.
+ (David Bowen via tomwhite)
+
+11. HADOOP-1092. Fix a NullPointerException in HeartbeatMonitor
+ thread. (Hairong Kuang via tomwhite)
+
+12. HADOOP-1112. Fix a race condition in Hadoop metrics.
+ (David Bowen via tomwhite)
+
+13. HADOOP-1108. Checksummed file system should retry reading if a
+ different replica is found when handling ChecksumException.
+ (Hairong Kuang via tomwhite)
+
+14. HADOOP-1070. Fix a problem with number of racks and datanodes
+ temporarily doubling. (Konstantin Shvachko via tomwhite)
+
+15. HADOOP-1099. Fix NullPointerException in JobInProgress.
+ (Gautam Kowshik via tomwhite)
+
+16. HADOOP-1115. Fix bug where FsShell copyToLocal doesn't
+ copy directories. (Hairong Kuang via tomwhite)
+
+17. HADOOP-1109. Fix NullPointerException in StreamInputFormat.
+ (Koji Noguchi via tomwhite)
+
+18. HADOOP-1117. Fix DFS scalability: when the namenode is
+ restarted it consumes 80% CPU. (Dhruba Borthakur via
+ tomwhite)
+
+19. HADOOP-1089. Make the C++ version of write and read v-int
+ agree with the Java versions. (Milind Bhandarkar via
+ tomwhite)
+
+20. HADOOP-1096. Rename InputArchive and OutputArchive and
+ make them public. (Milind Bhandarkar via tomwhite)
+
+21. HADOOP-1128. Fix missing progress information in map tasks.
+ (Espen Amble Kolstad, Andrzej Bialecki, and Owen O'Malley
+ via tomwhite)
+
+22. HADOOP-1129. Fix DFSClient to not hide IOExceptions in
+ flush method. (Hairong Kuang via tomwhite)
+
+23. HADOOP-1126. Optimize CPU usage for under replicated blocks
+ when cluster restarts. (Hairong Kuang via tomwhite)
+
+
+Release 0.12.0 - 2007-03-02
+
+ 1. HADOOP-975. Separate stdout and stderr from tasks.
+ (Arun C Murthy via cutting)
+
+ 2. HADOOP-982. Add some setters and a toString() method to
+ BytesWritable. (omalley via cutting)
+
+ 3. HADOOP-858. Move contrib/smallJobsBenchmark to src/test, removing
+ obsolete bits. (Nigel Daley via cutting)
+
+ 4. HADOOP-992. Fix MiniMR unit tests to use MiniDFS when specified,
+ rather than the local FS. (omalley via cutting)
+
+ 5. HADOOP-954. Change use of metrics to use callback mechanism.
+ Also rename utility class Metrics to MetricsUtil.
+ (David Bowen & Nigel Daley via cutting)
+
+ 6. HADOOP-893. Improve HDFS client's handling of dead datanodes.
+ The set is no longer reset with each block, but rather is now
+ maintained for the life of an open file. (Raghu Angadi via cutting)
+
+ 7. HADOOP-882. Upgrade to jets3t version 0.5, used by the S3
+ FileSystem. This version supports retries. (Michael Stack via cutting)
+
+ 8. HADOOP-977. Send task's stdout and stderr to JobClient's stdout
+ and stderr respectively, with each line tagged by the task's name.
+ (Arun C Murthy via cutting)
+
+ 9. HADOOP-761. Change unit tests to not use /tmp. (Nigel Daley via cutting)
+
+10. HADOOP-1007. Make names of metrics used in Hadoop unique.
+ (Nigel Daley via cutting)
+
+11. HADOOP-491. Change mapred.task.timeout to be per-job, and make a
+ value of zero mean no timeout. Also change contrib/streaming to
+ disable task timeouts. (Arun C Murthy via cutting)
+
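+    For example, one long-running job could disable the timeout entirely:
+
+      JobConf conf = new JobConf();
+      conf.setLong("mapred.task.timeout", 0);  // zero now means no timeout
+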
+12. HADOOP-1010. Add Reporter.NULL, a Reporter implementation that
+ does nothing. (Runping Qi via cutting)
+
+13. HADOOP-923. In HDFS NameNode, move replication computation to a
+ separate thread, to improve heartbeat processing time.
+ (Dhruba Borthakur via cutting)
+
+14. HADOOP-476. Rewrite contrib/streaming command-line processing,
+ improving parameter validation. (Sanjay Dahiya via cutting)
+
+15. HADOOP-973. Improve error messages in Namenode. This should help
+ to track down a problem that was appearing as a
+ NullPointerException. (Dhruba Borthakur via cutting)
+
+16. HADOOP-649. Fix so that jobs with no tasks are not lost.
+ (Thomas Friol via cutting)
+
+17. HADOOP-803. Reduce memory use by HDFS namenode, phase I.
+ (Raghu Angadi via cutting)
+
+18. HADOOP-1021. Fix MRCaching-based unit tests on Windows.
+ (Nigel Daley via cutting)
+
+19. HADOOP-889. Remove duplicate code from HDFS unit tests.
+ (Milind Bhandarkar via cutting)
+
+20. HADOOP-943. Improve HDFS's fsck command to display the filename
+ for under-replicated blocks. (Dhruba Borthakur via cutting)
+
+21. HADOOP-333. Add validator for sort benchmark output.
+ (Arun C Murthy via cutting)
+
+22. HADOOP-947. Improve performance of datanode decommissioning.
+ (Dhruba Borthakur via cutting)
+
+23. HADOOP-442. Permit one to specify hosts allowed to connect to
+ namenode and jobtracker with include and exclude files. (Wendy
+ Chien via cutting)
+
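+    A sketch of wiring this up, assuming dfs.hosts and dfs.hosts.exclude
+    are the properties that point at the include and exclude files:
+
+      Configuration conf = new Configuration();
+      conf.set("dfs.hosts", "/etc/hadoop/dfs.include");          // hypothetical path
+      conf.set("dfs.hosts.exclude", "/etc/hadoop/dfs.exclude");  // hypothetical path
+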
+24. HADOOP-1017. Cache constructors, for improved performance.
+ (Ron Bodkin via cutting)
+
+25. HADOOP-867. Move split creation out of JobTracker to client.
+ Splits are now saved in a separate file, read by task processes
+ directly, so that user code is no longer required in the
+ JobTracker. (omalley via cutting)
+
+26. HADOOP-1006. Remove obsolete '-local' option from test code.
+ (Gautam Kowshik via cutting)
+
+27. HADOOP-952. Create a public (shared) Hadoop EC2 AMI.
+ The EC2 scripts now support launch of public AMIs.
+ (tomwhite)
+
+28. HADOOP-1025. Remove some obsolete code in ipc.Server. (cutting)
+
+29. HADOOP-997. Implement S3 retry mechanism for failed block
+ transfers. This includes a generic retry mechanism for use
+ elsewhere in Hadoop. (tomwhite)
+
+30. HADOOP-990. Improve HDFS support for full datanode volumes.
+ (Raghu Angadi via cutting)
+
+31. HADOOP-564. Replace uses of "dfs://" URIs with the more standard
+ "hdfs://". (Wendy Chien via cutting)
+
+32. HADOOP-1030. In unit tests, unify setting of ipc.client.timeout.
+ Also increase the value used from one to two seconds, in hopes of
+ making tests complete more reliably. (cutting)
+
+33. HADOOP-654. Stop assigning tasks to a tasktracker if it has
+ failed more than a specified number in the job.
+ (Arun C Murthy via cutting)
+
+34. HADOOP-985. Change HDFS to identify nodes by IP address rather
+ than by DNS hostname. (Raghu Angadi via cutting)
+
+35. HADOOP-248. Optimize location of map outputs to not use random
+ probes. (Devaraj Das via cutting)
+
+36. HADOOP-1029. Fix streaming's input format to correctly seek to
+ the start of splits. (Arun C Murthy via cutting)
+
+37. HADOOP-492. Add per-job and per-task counters. These are
+ incremented via the Reporter interface and available through the
+ web ui and the JobClient API. The mapreduce framework maintains a
+ few basic counters, and applications may add their own. Counters
+ are also passed to the metrics system.
+ (David Bowen via cutting)
+
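+    A minimal sketch of an application-defined counter (the enum and the
+    surrounding map() context are hypothetical):
+
+      // hypothetical counter group, declared alongside the job classes:
+      enum MyCounters { MALFORMED_RECORDS }
+
+      // inside a Mapper.map() implementation, using its Reporter argument:
+      reporter.incrCounter(MyCounters.MALFORMED_RECORDS, 1);
+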
+38. HADOOP-1034. Fix datanode to better log exceptions.
+ (Philippe Gassmann via cutting)
+
+39. HADOOP-878. In contrib/streaming, fix reducer=NONE to work with
+ multiple maps. (Arun C Murthy via cutting)
+
+40. HADOOP-1039. In HDFS's TestCheckpoint, avoid restarting
+ MiniDFSCluster so often, speeding this test. (Dhruba Borthakur via cutting)
+
+41. HADOOP-1040. Update RandomWriter example to use counters and
+ user-defined input and output formats. (omalley via cutting)
+
+42. HADOOP-1027. Fix problems with in-memory merging during shuffle
+ and re-enable this optimization. (Devaraj Das via cutting)
+
+43. HADOOP-1036. Fix exception handling in TaskTracker to keep tasks
+ from being lost. (Arun C Murthy via cutting)
+
+44. HADOOP-1042. Improve the handling of failed map output fetches.
+ (Devaraj Das via cutting)
+
+45. HADOOP-928. Make checksums optional per FileSystem.
+ (Hairong Kuang via cutting)
+
+46. HADOOP-1044. Fix HDFS's TestDecommission to not spuriously fail.
+ (Wendy Chien via cutting)
+
+47. HADOOP-972. Optimize HDFS's rack-aware block placement algorithm.
+ (Hairong Kuang via cutting)
+
+48. HADOOP-1043. Optimize shuffle, increasing parallelism.
+ (Devaraj Das via cutting)
+
+49. HADOOP-940. Improve HDFS's replication scheduling.
+ (Dhruba Borthakur via cutting)
+
+50. HADOOP-1020. Fix a bug in Path resolution, and a problem with unit tests
+ on Windows. (cutting)
+
+51. HADOOP-941. Enhance record facility.
+ (Milind Bhandarkar via cutting)
+
+52. HADOOP-1000. Fix so that log messages in task subprocesses are
+ not written to a task's standard error. (Arun C Murthy via cutting)
+
+53. HADOOP-1037. Fix bin/slaves.sh, which currently only works with
+ /bin/bash, to specify /bin/bash rather than /bin/sh. (cutting)
+
+54. HADOOP-1046. Clean up tmp from partially received stale block files. (ab)
+
+55. HADOOP-1041. Optimize mapred counter implementation. Also group
+ counters by their declaring Enum. (David Bowen via cutting)
+
+56. HADOOP-1032. Permit one to specify jars that will be cached
+ across multiple jobs. (Gautam Kowshik via cutting)
+
+57. HADOOP-1051. Add optional checkstyle task to build.xml. To use
+ this developers must download the (LGPL'd) checkstyle jar
+ themselves. (tomwhite via cutting)
+
+58. HADOOP-1049. Fix a race condition in IPC client.
+ (Devaraj Das via cutting)
+
+60. HADOOP-1056. Check HDFS include/exclude node lists with both IP
+ address and hostname. (Wendy Chien via cutting)
+
+61. HADOOP-994. In HDFS, limit the number of blocks invalidated at
+ once. Large lists were causing datanodes to time out.
+ (Dhruba Borthakur via cutting)
+
+62. HADOOP-432. Add a trash feature, disabled by default. When
+ enabled, the FSShell 'rm' command will move things to a trash
+ directory in the filesystem. In HDFS, a thread periodically
+ checkpoints the trash and removes old checkpoints. (cutting)
+
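+    Enabling it amounts to setting a checkpoint interval, assuming the
+    fs.trash.interval property (in minutes; zero leaves trash disabled):
+
+      Configuration conf = new Configuration();
+      conf.setLong("fs.trash.interval", 60);  // checkpoint and expire hourly
+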
+
+Release 0.11.2 - 2007-02-16
+
+ 1. HADOOP-1009. Fix an infinite loop in the HDFS namenode.
+ (Dhruba Borthakur via cutting)
+
+ 2. HADOOP-1014. Disable in-memory merging during shuffle, as this is
+ causing data corruption. (Devaraj Das via cutting)
+
+
+Release 0.11.1 - 2007-02-09
+
+ 1. HADOOP-976. Make SequenceFile.Metadata public. (Runping Qi via cutting)
+
+ 2. HADOOP-917. Fix a NullPointerException in SequenceFile's merger
+ with large map outputs. (omalley via cutting)
+
+ 3. HADOOP-984. Fix a bug in shuffle error handling introduced by
+ HADOOP-331. If a map output is unavailable, the job tracker is
+ once more informed. (Arun C Murthy via cutting)
+
+ 4. HADOOP-987. Fix a problem in HDFS where blocks were not removed
+ from neededReplications after a replication target was selected.
+ (Hairong Kuang via cutting)
+
+
+Release 0.11.0 - 2007-02-02
+
+ 1. HADOOP-781. Remove methods deprecated in 0.10 that are no longer
+ widely used. (cutting)
+
+ 2. HADOOP-842. Change HDFS protocol so that the open() method is
+ passed the client hostname, to permit the namenode to order block
+ locations on the basis of network topology.
+ (Hairong Kuang via cutting)
+
+ 3. HADOOP-852. Add an ant task to compile record definitions, and
+ use it to compile record unit tests. (Milind Bhandarkar via cutting)
+
+ 4. HADOOP-757. Fix "Bad File Descriptor" exception in HDFS client
+ when an output file is closed twice. (Raghu Angadi via cutting)
+
+ 5. [ intentionally blank ]
+
+ 6. HADOOP-890. Replace dashes in metric names with underscores,
+ for better compatibility with some monitoring systems.
+ (Nigel Daley via cutting)
+
+ 7. HADOOP-801. Add to jobtracker a log of task completion events.
+ (Sanjay Dahiya via cutting)
+
+ 8. HADOOP-855. In HDFS, try to repair files with checksum errors.
+ An exception is still thrown, but corrupt blocks are now removed
+ when they have replicas. (Wendy Chien via cutting)
+
+ 9. HADOOP-886. Reduce number of timer threads created by metrics API
+ by pooling contexts. (Nigel Daley via cutting)
+
+10. HADOOP-897. Add a "javac.args" property to build.xml that permits
+ one to pass arbitrary options to javac. (Milind Bhandarkar via cutting)
+
+11. HADOOP-899. Update libhdfs for changes in HADOOP-871.
+ (Sameer Paranjpye via cutting)
+
+12. HADOOP-905. Remove some dead code from JobClient. (cutting)
+
+13. HADOOP-902. Fix a NullPointerException in HDFS client when
+ closing output streams. (Raghu Angadi via cutting)
+
+14. HADOOP-735. Switch generated record code to use BytesWritable to
+ represent fields of type 'buffer'. (Milind Bhandarkar via cutting)
+
+15. HADOOP-830. Improve mapreduce merge performance by buffering and
+ merging multiple map outputs as they arrive at reduce nodes before
+ they're written to disk. (Devaraj Das via cutting)
+
+16. HADOOP-908. Add a new contrib package, Abacus, that simplifies
+ counting and aggregation, built on MapReduce. (Runping Qi via cutting)
+
+17. HADOOP-901. Add support for recursive renaming to the S3 filesystem.
+ (Tom White via cutting)
+
+18. HADOOP-912. Fix a bug in TaskTracker.isIdle() that was
+ sporadically causing unit test failures. (Arun C Murthy via cutting)
+
+19. HADOOP-909. Fix the 'du' command to correctly compute the size of
+ FileSystem directory trees. (Hairong Kuang via cutting)
+
+20. HADOOP-731. When a checksum error is encountered on a file stored
+ in HDFS, try another replica of the data, if any.
+ (Wendy Chien via cutting)
+
+21. HADOOP-732. Add support to SequenceFile for arbitrary metadata,
+ as a set of attribute value pairs. (Runping Qi via cutting)
+
+22. HADOOP-929. Fix PhasedFileSystem to pass configuration to
+ underlying FileSystem. (Sanjay Dahiya via cutting)
+
+23. HADOOP-935. Fix contrib/abacus to not delete pre-existing output
+ files, but rather to fail in this case. (Runping Qi via cutting)
+
+24. HADOOP-936. More metric renamings, as in HADOOP-890.
+ (Nigel Daley via cutting)
+
+25. HADOOP-856. Fix HDFS's fsck command to not report that
+ non-existent filesystems are healthy. (Milind Bhandarkar via cutting)
+
+26. HADOOP-602. Remove the dependency on Lucene's PriorityQueue
+ utility, by copying it into Hadoop. This facilitates using Hadoop
+ with different versions of Lucene without worrying about CLASSPATH
+ order. (Milind Bhandarkar via cutting)
+
+27. [ intentionally blank ]
+
+28. HADOOP-227. Add support for backup namenodes, which periodically
+ get snapshots of the namenode state. (Dhruba Borthakur via cutting)
+
+29. HADOOP-884. Add scripts in contrib/ec2 to facilitate running
+ Hadoop on an Amazon's EC2 cluster. (Tom White via cutting)
+
+30. HADOOP-937. Change the namenode to request re-registration of
+ datanodes in more circumstances. (Hairong Kuang via cutting)
+
+31. HADOOP-922. Optimize small forward seeks in HDFS. If data is
+ likely already in flight, skip ahead rather than re-opening the
+ block. (Dhruba Borthakur via cutting)
+
+32. HADOOP-961. Add a 'job -events' sub-command that prints job
+ events, including task completions and failures. (omalley via cutting)
+
+33. HADOOP-959. Fix namenode snapshot code added in HADOOP-227 to
+ work on Windows. (Dhruba Borthakur via cutting)
+
+34. HADOOP-934. Fix TaskTracker to catch metrics exceptions that were
+ causing heartbeats to fail. (Arun Murthy via cutting)
+
+35. HADOOP-881. Fix JobTracker web interface to display the correct
+ number of task failures. (Sanjay Dahiya via cutting)
+
+36. HADOOP-788. Change contrib/streaming to subclass TextInputFormat,
+ permitting it to take advantage of native compression facilities.
+ (Sanjay Dahiya via cutting)
+
+37. HADOOP-962. In contrib/ec2: make scripts executable in tar file;
+ add a README; make the environment file use a template.
+ (Tom White via cutting)
+
+38. HADOOP-549. Fix a NullPointerException in TaskReport's
+ serialization. (omalley via cutting)
+
+39. HADOOP-963. Fix remote exceptions to have the stack trace of the
+ caller thread, not the IPC listener thread. (omalley via cutting)
+
+40. HADOOP-967. Change RPC clients to start sending a version header.
+ (omalley via cutting)
+
+41. HADOOP-964. Fix a bug introduced by HADOOP-830 where jobs failed
+ whose comparators and/or i/o types were in the job's jar.
+ (Dennis Kubes via cutting)
+
+42. HADOOP-969. Fix a deadlock in JobTracker. (omalley via cutting)
+
+43. HADOOP-862. Add support for the S3 FileSystem to the CopyFiles
+ tool. (Michael Stack via cutting)
+
+44. HADOOP-965. Fix IsolationRunner so that job's jar can be found.
+ (Dennis Kubes via cutting)
+
+45. HADOOP-309. Fix two NullPointerExceptions in StatusHttpServer.
+ (navychen via cutting)
+
+46. HADOOP-692. Add rack awareness to HDFS's placement of blocks.
+ (Hairong Kuang via cutting)
+
+
+Release 0.10.1 - 2007-01-10
+
+ 1. HADOOP-857. Fix S3 FileSystem implementation to permit its use
+ for MapReduce input and output. (Tom White via cutting)
+
+ 2. HADOOP-863. Reduce logging verbosity introduced by HADOOP-813.
+ (Devaraj Das via cutting)
+
+ 3. HADOOP-815. Fix memory leaks in JobTracker. (Arun C Murthy via cutting)
+
+ 4. HADOOP-600. Fix a race condition in JobTracker.
+ (Arun C Murthy via cutting)
+
+ 5. HADOOP-864. Fix 'bin/hadoop -jar' to operate correctly when
+ hadoop.tmp.dir does not yet exist. (omalley via cutting)
+
+ 6. HADOOP-866. Fix 'dfs -get' command to remove existing crc files,
+ if any. (Milind Bhandarkar via cutting)
+
+ 7. HADOOP-871. Fix a bug in bin/hadoop setting JAVA_LIBRARY_PATH.
+ (Arun C Murthy via cutting)
+
+ 8. HADOOP-868. Decrease the number of open files during map,
+ respecting io.sort.factor. (Devaraj Das via cutting)
+
+ 9. HADOOP-865. Fix S3 FileSystem so that partially created files can
+ be deleted. (Tom White via cutting)
+
+10. HADOOP-873. Pass java.library.path correctly to child processes.
+ (omalley via cutting)
+
+11. HADOOP-851. Add support for the LZO codec. This is much faster
+ than the default, zlib-based compression, but it is only available
+ when the native library is built. (Arun C Murthy via cutting)
+
+12. HADOOP-880. Fix S3 FileSystem to remove directories.
+ (Tom White via cutting)
+
+13. HADOOP-879. Fix InputFormatBase to handle output generated by
+ MapFileOutputFormat. (cutting)
+
+14. HADOOP-659. In HDFS, prioritize replication of blocks based on
+ current replication level. Blocks which are severely
+ under-replicated should be further replicated before blocks which
+ are less under-replicated. (Hairong Kuang via cutting)
+
+15. HADOOP-726. Deprecate FileSystem locking methods. They are not
+    currently usable. Locking should eventually be provided as an
+    independent service. (Raghu Angadi via cutting)
+
+16. HADOOP-758. Fix exception handling during reduce so that root
+ exceptions are not masked by exceptions in cleanups.
+ (Raghu Angadi via cutting)
+
+
+Release 0.10.0 - 2007-01-05
+
+ 1. HADOOP-763. Change DFS namenode benchmark to not use MapReduce.
+ (Nigel Daley via cutting)
+
+ 2. HADOOP-777. Use fully-qualified hostnames for tasktrackers and
+ datanodes. (Mahadev Konar via cutting)
+
+ 3. HADOOP-621. Change 'dfs -cat' to exit sooner when output has been
+ closed. (Dhruba Borthakur via cutting)
+
+ 4. HADOOP-752. Rationalize some synchronization in DFS namenode.
+ (Dhruba Borthakur via cutting)
+
+ 5. HADOOP-629. Fix RPC services to better check the protocol name and
+ version. (omalley via cutting)
+
+ 6. HADOOP-774. Limit the number of invalid blocks returned with
+ heartbeats by the namenode to datanodes. Transmitting and
+ processing very large invalid block lists can tie up both the
+ namenode and datanode for too long. (Dhruba Borthakur via cutting)
+
+ 7. HADOOP-738. Change 'dfs -get' command to not create CRC files by
+ default, adding a -crc option to force their creation.
+ (Milind Bhandarkar via cutting)
+
+ 8. HADOOP-676. Improved exceptions and error messages for common job
+ input specification errors. (Sanjay Dahiya via cutting)
+
+ 9. [Included in 0.9.2 release]
+
+10. HADOOP-756. Add new dfsadmin option to wait for filesystem to be
+ operational. (Dhruba Borthakur via cutting)
+
+11. HADOOP-770. Fix jobtracker web interface to display, on restart,
+ jobs that were running when it was last stopped.
+ (Sanjay Dahiya via cutting)
+
+12. HADOOP-331. Write all map outputs to a single file with an index,
+ rather than to a separate file per reduce task. This should both
+ speed the shuffle and make things more scalable.
+ (Devaraj Das via cutting)
+
+13. HADOOP-818. Fix contrib unit tests to not depend on core unit
+ tests. (omalley via cutting)
+
+14. HADOOP-786. Log common exceptions at debug level.
+    (Sanjay Dahiya via cutting)
+
+15. HADOOP-796. Provide more convenient access to failed task
+ information in the web interface. (Sanjay Dahiya via cutting)
+
+16. HADOOP-764. Reduce some memory allocations in the namenode.
+    (Dhruba Borthakur via cutting)
+
+17. HADOOP-802. Update description of mapred.speculative.execution to
+ mention reduces. (Nigel Daley via cutting)
+
+18. HADOOP-806. Include link to datanodes on front page of namenode
+ web interface. (Raghu Angadi via cutting)
+
+19. HADOOP-618. Make JobSubmissionProtocol public.
+ (Arun C Murthy via cutting)
+
+20. HADOOP-782. Fully remove killed tasks. (Arun C Murthy via cutting)
+
+21. HADOOP-792. Fix 'dfs -mv' to return correct status.
+ (Dhruba Borthakur via cutting)
+
+22. HADOOP-673. Give each task its own working directory again.
+ (Mahadev Konar via cutting)
+
+23. HADOOP-571. Extend the syntax of Path to be a URI, optionally
+    qualified with a scheme and authority. The scheme determines
+    the FileSystem implementation, while the authority determines
+    the FileSystem instance. New FileSystem implementations may be
+    provided by defining an fs.<scheme>.impl property naming the
+    FileSystem implementation class. This permits easy integration
+    of new FileSystem implementations. (cutting)
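+
+    For illustration, a minimal sketch of wiring in a new scheme this
+    way, assuming the Path and FileSystem APIs of this release; the
+    "demo" scheme and the com.example.DemoFileSystem class are
+    hypothetical:
+
+        import org.apache.hadoop.conf.Configuration;
+        import org.apache.hadoop.fs.FileSystem;
+        import org.apache.hadoop.fs.Path;
+
+        public class DemoFsExample {
+          public static void main(String[] args) throws Exception {
+            Configuration conf = new Configuration();
+            // Map the "demo" scheme to a (hypothetical) implementation.
+            conf.set("fs.demo.impl", "com.example.DemoFileSystem");
+            Path p = new Path("demo://server/data");
+            // The scheme in the Path's URI selects the implementation.
+            FileSystem fs = p.getFileSystem(conf);
+            System.out.println(fs.getClass().getName());
+          }
+        }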
+
+24. HADOOP-720. Add an HDFS white paper to website.
+ (Dhruba Borthakur via cutting)
+
+25. HADOOP-794. Fix a divide-by-zero exception when a job specifies
+ zero map tasks. (omalley via cutting)
+
+26. HADOOP-454. Add a 'dfs -dus' command that provides summary disk
+ usage. (Hairong Kuang via cutting)
+
+27. HADOOP-574. Add an Amazon S3 implementation of FileSystem. To
+    use this, one need only specify paths of the form
+    s3://id:secret@bucket/. Alternatively, the AWS access key id and
+    secret can be specified in your config, with the properties
+    fs.s3.awsAccessKeyId and fs.s3.awsSecretAccessKey.
+    (Tom White via cutting)
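+
+    As a hedged sketch of both styles (bucket name and credentials
+    are placeholders; Path.getFileSystem resolving the s3 scheme is
+    assumed, per HADOOP-571 above):
+
+        import org.apache.hadoop.conf.Configuration;
+        import org.apache.hadoop.fs.FileSystem;
+        import org.apache.hadoop.fs.Path;
+
+        public class S3FsExample {
+          public static void main(String[] args) throws Exception {
+            Configuration conf = new Configuration();
+            // Alternative to embedding credentials in an
+            // s3://id:secret@bucket/ URI (values are placeholders).
+            conf.set("fs.s3.awsAccessKeyId", "MY_ACCESS_KEY_ID");
+            conf.set("fs.s3.awsSecretAccessKey", "MY_SECRET_KEY");
+            Path p = new Path("s3://mybucket/input");
+            FileSystem fs = p.getFileSystem(conf);
+            System.out.println(fs.exists(p));
+          }
+        }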
+
+28. HADOOP-824. Rename DFSShell to be FsShell, since it applies
+ generically to all FileSystem implementations. (cutting)
+
+29. HADOOP-813. Fix map output sorting to report progress, so that
+ sorts which take longer than the task timeout do not fail.
+ (Devaraj Das via cutting)
+
+30. HADOOP-825. Fix HDFS daemons when configured with new URI syntax.
+ (omalley via cutting)
+
+31. HADOOP-596. Fix a bug in phase reporting during reduce.
+ (Sanjay Dahiya via cutting)
+
+32. HADOOP-811. Add a utility, MultithreadedMapRunner.
+ (Alejandro Abdelnur via cutting)
+
+33. HADOOP-829. Within HDFS, clearly separate three different
+ representations for datanodes: one for RPCs, one for
+ namenode-internal use, and one for namespace persistence.
+ (Dhruba Borthakur via cutting)
+
+34. HADOOP-823. Fix problem starting datanode when not all configured
+ data directories exist. (Bryan Pendleton via cutting)
+
+35. HADOOP-451. Add a Split interface. CAUTION: This incompatibly
+ changes the InputFormat and RecordReader interfaces. Not only is
+ FileSplit replaced with Split, but a FileSystem parameter is no
+ longer passed in several methods, input validation has changed,
+ etc. (omalley via cutting)
+
+36. HADOOP-814. Optimize locking in namenode. (Dhruba Borthakur via cutting)
+
+37. HADOOP-738. Change 'fs -put' and 'fs -get' commands to accept
+ standard input and output, respectively. Standard i/o is
+ specified by a file named '-'. (Wendy Chien via cutting)
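+
+    Illustrative invocations (file names are hypothetical):
+
+        cat access.log | bin/hadoop fs -put - /logs/access.log
+        bin/hadoop fs -get /logs/access.log - | head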
+
+38. HADOOP-835. Fix a NullPointerException reading record-compressed
+ SequenceFiles. (Hairong Kuang via cutting)
+
+39. HADOOP-836. Fix a MapReduce bug on Windows, where the wrong
+ FileSystem was used. Also add a static FileSystem.getLocal()
+ method and better Path checking in HDFS, to help avoid such issues
+ in the future. (omalley via cutting)
+
+40. HADOOP-837. Improve the RunJar utility to unpack the jar file in
+    hadoop.tmp.dir, rather than in the system temporary directory.
+    (Hairong Kuang via cutting)
+
+41. HADOOP-841. Fix native library to build 32-bit version even when
+ on a 64-bit host, if a 32-bit JVM is used. (Arun C Murthy via cutting)
+
+42. HADOOP-838. Fix tasktracker to pass java.library.path to
+ sub-processes, so that libhadoop.a is found.
+ (Arun C Murthy via cutting)
+
+43. HADOOP-844. Send metrics messages on a fixed-delay schedule
+ instead of a fixed-rate schedule. (David Bowen via cutting)
+
+44. HADOOP-849. Fix OutOfMemory exceptions in TaskTracker due to a
+ file handle leak in SequenceFile. (Devaraj Das via cutting)
+
+45. HADOOP-745. Fix a synchronization bug in the HDFS namenode.
+ (Dhruba Borthakur via cutting)
+
+46. HADOOP-850. Add Writable implementations for variable-length
+ integers. (ab via cutting)
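+
+    The entry does not name the new classes; as a sketch, assuming
+    they are VIntWritable and VLongWritable in org.apache.hadoop.io:
+
+        import java.io.ByteArrayOutputStream;
+        import java.io.DataOutputStream;
+        import org.apache.hadoop.io.VIntWritable;
+        import org.apache.hadoop.io.VLongWritable;
+
+        public class VIntExample {
+          public static void main(String[] args) throws Exception {
+            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+            DataOutputStream out = new DataOutputStream(bytes);
+            new VIntWritable(42).write(out);        // small value, 1 byte
+            new VLongWritable(1L << 40).write(out); // larger value, more bytes
+            System.out.println("serialized size: " + bytes.size());
+          }
+        }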
+
+47. HADOOP-525. Add raw comparators to record types. This greatly
+ improves record sort performance. (Milind Bhandarkar via cutting)
+
+48. HADOOP-628. Fix a problem with 'fs -cat' command, where some
+ characters were replaced with question marks. (Wendy Chien via cutting)
+
+49. HADOOP-804. Reduce verbosity of MapReduce logging.
+ (Sanjay Dahiya via cutting)
+
+50. HADOOP-853. Rename 'site' to 'docs', in preparation for inclusion
+ in releases. (cutting)
+
+51. HADOOP-371. Include contrib jars and site documentation in
+ distributions. Also add contrib and example documentation to
+ distributed javadoc, in separate sections. (Nigel Daley via cutting)
+
+52. HADOOP-846. Report progress during entire map, as sorting of
+ intermediate outputs may happen at any time, potentially causing
+ task timeouts. (Devaraj Das via cutting)
+
+53. HADOOP-840. In task tracker, queue task cleanups and perform them
+ in a separate thread. (omalley & Mahadev Konar via cutting)
+
+54. HADOOP-681. Add to HDFS the ability to decommission nodes. This
+ causes their blocks to be re-replicated on other nodes, so that
+ they may be removed from a cluster. (Dhruba Borthakur via cutting)
+
+55. HADOOP-470. In HDFS web ui, list the datanodes containing each
+ copy of a block. (Hairong Kuang via cutting)
+
+56. HADOOP-700. Change bin/hadoop to only include core jar file on
+ classpath, not example, test, etc. Also rename core jar to
+ hadoop-${version}-core.jar so that it can be more easily
+ identified. (Nigel Daley via cutting)
+
+57. HADOOP-619. Extend InputFormatBase to accept individual files and
+    glob patterns as MapReduce inputs, not just directories. Also
+    change contrib/streaming to use this. (Sanjay Dahiya via cutting)
+
+
+Release 0.9.2 - 2006-12-15
+
+ 1. HADOOP-639. Restructure InterTrackerProtocol to make task
+ accounting more reliable. (Arun C Murthy via cutting)
+
+ 2. HADOOP-827. Turn off speculative execution by default, since it's
+ currently broken. (omalley via cutting)
+
+ 3. HADOOP-791. Fix a deadlock in the task tracker.
+ (Mahadev Konar via cutting)
+
+
+Release 0.9.1 - 2006-12-06
+
+ 1. HADOOP-780. Use ReflectionUtils to instantiate key and value
+ objects. (ab)
+
+ 2. HADOOP-779. Fix contrib/streaming to work correctly with gzipped
+ input files. (Hairong Kuang via cutting)
+
+
+Release 0.9.0 - 2006-12-01
+
+ 1. HADOOP-655. Remove most deprecated code. A few deprecated things
+ remain, notably UTF8 and some methods that are still required.
+ Also cleaned up constructors for SequenceFile, MapFile, SetFile,
+ and ArrayFile a bit. (cutting)
+
+ 2. HADOOP-565. Upgrade to Jetty version 6. (Sanjay Dahiya via cutting)
+
+ 3. HADOOP-682. Fix DFS format command to work correctly when
+ configured with a non-existent directory. (Sanjay Dahiya via cutting)
+
+ 4. HADOOP-645. Fix a bug in contrib/streaming when -reducer is NONE.
+ (Dhruba Borthakur via cutting)
+
+ 5. HADOOP-687. Fix a classpath bug in bin/hadoop that blocked the
+ servers from starting. (Sameer Paranjpye via omalley)
+
+ 6. HADOOP-683. Remove a script dependency on bash, so it works with
+ dash, the new default for /bin/sh on Ubuntu. (James Todd via cutting)
+
+ 7. HADOOP-382. Extend unit tests to run multiple datanodes.
+ (Milind Bhandarkar via cutting)
+
+ 8. HADOOP-604. Fix some synchronization issues and a
+ NullPointerException in DFS datanode. (Raghu Angadi via cutting)
+
+ 9. HADOOP-459. Fix memory leaks and a host of other issues with
+ libhdfs. (Sameer Paranjpye via cutting)
+
+10. HADOOP-694. Fix a NullPointerException in jobtracker.
+ (Mahadev Konar via cutting)
+
+11. HADOOP-637. Fix a memory leak in the IPC server. Direct buffers
+ are not collected like normal buffers, and provided little
+ advantage. (Raghu Angadi via cutting)
+
+12. HADOOP-696. Fix TestTextInputFormat unit test to not rely on the
+ order of directory listings. (Sameer Paranjpye via cutting)
+
+13. HADOOP-611. Add support for iterator-based merging to
+ SequenceFile. (Devaraj Das via cutting)
+
+14. HADOOP-688. Move DFS administrative commands to a separate
+ command named 'dfsadmin'. (Dhruba Borthakur via cutting)
+
+15. HADOOP-708. Fix test-libhdfs to return the correct status, so
+ that failures will break the build. (Nigel Daley via cutting)
+
+16. HADOOP-646. Fix namenode to handle edits files larger than 2GB.
+ (Milind Bhandarkar via cutting)
+
+17. HADOOP-705. Fix a bug in the JobTracker where failed jobs were
+    not completely cleaned up. (Mahadev Konar via cutting)
+
+18. HADOOP-613. Perform final merge while reducing. This removes one
+ sort pass over the data and should consequently significantly
+ decrease overall processing time. (Devaraj Das via cutting)
+
+19. HADOOP-661. Make each job's configuration visible through the web
+ ui. (Arun C Murthy via cutting)
+
+20. HADOOP-489. In MapReduce, separate user logs from system logs.
+ Each task's log output is now available through the web ui. (Arun
+ C Murthy via cutting)
+
+21. HADOOP-712. Fix record io's xml serialization to correctly handle
+ control-characters. (Milind Bhandarkar via cutting)
+
+22. HADOOP-668. Improvements to the web-based DFS browser.
+ (Hairong Kuang via cutting)
+
+23. HADOOP-715. Fix build.xml so that test logs are written in build
+ directory, rather than in CWD. (Arun C Murthy via cutting)
+
+24. HADOOP-538. Add support for building an optional native library,
+ libhadoop.so, that improves the performance of zlib-based
+ compression. To build this, specify -Dcompile.native to Ant.
+ (Arun C Murthy via cutting)
+
+25. HADOOP-610. Fix a problem when the DFS block size is configured
+    to be smaller than the buffer size, typically only when debugging.
+    (Milind Bhandarkar via cutting)
+
+26. HADOOP-695. Fix a NullPointerException in contrib/streaming.
+ (Hairong Kuang via cutting)
+
+27. HADOOP-652. In DFS, when a file is deleted, the block count is
+ now decremented. (Vladimir Krokhmalyov via cutting)
+
+28. HADOOP-725. In DFS, optimize block placement algorithm,
+ previously a performance bottleneck. (Milind Bhandarkar via cutting)
+
+29. HADOOP-723. In MapReduce, fix a race condition during the
+ shuffle, which resulted in FileNotFoundExceptions. (omalley via cutting)
+
+30. HADOOP-447. In DFS, fix getBlockSize(Path) to work with relative
+ paths. (Raghu Angadi via cutting)
+
+31. HADOOP-733. Make exit codes in DFShell consistent and add a unit
+ test. (Dhruba Borthakur via cutting)
+
+32. HADOOP-709. Fix contrib/streaming to work with commands that
+ contain control characters. (Dhruba Borthakur via cutting)
+
+33. HADOOP-677. In IPC, permit a version header to be transmitted
+ when connections are established. This will permit us to change
+ the format of IPC requests back-compatibly in subsequent releases.
+ (omalley via cutting)
+
+34. HADOOP-699. Fix DFS web interface so that filesystem browsing
+ works correctly, using the right port number. Also add support
+ for sorting datanode list by various columns.
+ (Raghu Angadi via cutting)
+
+35. HADOOP-76. Implement speculative reduce. Now when a job is
+    configured for speculative execution, both maps and reduces will
+    execute speculatively. Reduce outputs are written to a temporary
+    location and moved to the final location when reduce is complete.
+    (Sanjay Dahiya via cutting)
+
+36. HADOOP-736. Roll back to Jetty 5.1.4, due to performance problems
+ with Jetty 6.0.1.
+
+37. HADOOP-739. Fix TestIPC to use a different port number, making it
+    more reliable. (Nigel Daley via cutting)
+
+38. HADOOP-749. Fix a NullPointerException in jobfailures.jsp.
+ (omalley via cutting)
+
+39. HADOOP-747. Fix record serialization to work correctly when
+ records are embedded in Maps. (Milind Bhandarkar via cutting)
+
+40. HADOOP-698. Fix HDFS client not to retry the same datanode on
+ read failures. (Milind Bhandarkar via cutting)
+
+41. HADOOP-689. Add GenericWritable, to facilitate polymorphism in
+ MapReduce, SequenceFile, etc. (Feng Jiang via cutting)
+
+42. HADOOP-430. Stop datanode's HTTP server when registration with
+ namenode fails. (Wendy Chien via cutting)
+
+43. HADOOP-750. Fix a potential race condition during mapreduce
+ shuffle. (omalley via cutting)
+
+44. HADOOP-728. Fix contrib/streaming-related issues, including
+ '-reducer NONE'. (Sanjay Dahiya via cutting)
+
+
+Release 0.8.0 - 2006-11-03
+
+ 1. HADOOP-477. Extend contrib/streaming to scan the PATH environment
+    variable when resolving executable program names.
+    (Dhruba Borthakur via cutting)
+
+ 2. HADOOP-583. In DFSClient, reduce the log level of re-connect
+ attempts from 'info' to 'debug', so they are not normally shown.
+ (Konstantin Shvachko via cutting)
+
+ 3. HADOOP-498. Re-implement DFS integrity checker to run server-side,
+ for much improved performance. (Milind Bhandarkar via cutting)
+
+ 4. HADOOP-586. Use the jar name for otherwise un-named jobs.
+ (Sanjay Dahiya via cutting)
+
+ 5. HADOOP-514. Make DFS heartbeat interval configurable.
+ (Milind Bhandarkar via cutting)
+
+ 6. HADOOP-588. Fix logging and accounting of failed tasks.
+ (Sanjay Dahiya via cutting)
+
+ 7. HADOOP-462. Improve command line parsing in DFSShell, so that
+ incorrect numbers of arguments result in informative errors rather
+ than ArrayOutOfBoundsException. (Dhruba Borthakur via cutting)
+
+ 8. HADOOP-561. Fix DFS so that one replica of each block is written
+    locally, if possible. This was the intent, but there was a bug.
+    (Dhruba Borthakur via cutting)
+
+ 9. HADOOP-610. Fix TaskTracker to survive more exceptions, keeping
+ tasks from becoming lost. (omalley via cutting)
+
+10. HADOOP-625. Add a servlet to all http daemons that displays a
+ stack dump, useful for debugging. (omalley via cutting)
+
+11. HADOOP-554. Fix DFSShell to return -1 for errors.
+ (Dhruba Borthakur via cutting)
+
+12. HADOOP-626. Correct the documentation in the NNBench example
+ code, and also remove a mistaken call there.
+ (Nigel Daley via cutting)
+
+13. HADOOP-634. Add missing license to many files.
+ (Nigel Daley via cutting)
+
+14. HADOOP-627. Fix some synchronization problems in MiniMRCluster
+ that sometimes caused unit tests to fail. (Nigel Daley via cutting)
+
+15. HADOOP-563. Improve the NameNode's lease policy so that leases
+    are held for one hour without renewal (instead of one minute).
+    However, another attempt to create the same file will still
+    succeed if the lease has not been renewed within a minute. This
+    prevents communication or scheduling problems from causing a
+    write to fail for up to an hour, barring some other process
+    trying to create the same file. (Dhruba Borthakur via cutting)
+
+16. HADOOP-635. In DFSShell, permit specification of multiple files
+ as the source for file copy and move commands.
+ (Dhruba Borthakur via cutting)
+
+17. HADOOP-641. Change NameNode to request a fresh block report from
+ a re-discovered DataNode, so that no-longer-needed replications
+ are stopped promptly. (Konstantin Shvachko via cutting)
+
+18. HADOOP-642. Change IPC client to specify an explicit connect
+ timeout. (Konstantin Shvachko via cutting)
+
+19. HADOOP-638. Fix an unsynchronized access to TaskTracker's
+ internal state. (Nigel Daley via cutting)
+
+20. HADOOP-624. Fix servlet path to stop a Jetty warning on startup.
+ (omalley via cutting)
+
+21. HADOOP-578. Failed tasks are no longer placed at the end of the
+ task queue. This was originally done to work around other
+ problems that have now been fixed. Re-executing failed tasks
+ sooner causes buggy jobs to fail faster. (Sanjay Dahiya via cutting)
+
+22. HADOOP-658. Update source file headers per Apache policy. (cutting)
+
+23. HADOOP-636. Add MapFile & ArrayFile constructors which accept a
+ Progressable, and pass it down to SequenceFile. This permits
+ reduce tasks which use MapFile to still report progress while
+ writing blocks to the filesystem. (cutting)
+
+24. HADOOP-576. Enable contrib/streaming to use the file cache. Also
+ extend the cache to permit symbolic links to cached items, rather
+ than local file copies. (Mahadev Konar via cutting)
+
+25. HADOOP-482. Fix unit tests to work when a cluster is running on
+ the same machine, removing port conflicts. (Wendy Chien via cutting)
+
+26. HADOOP-90. Permit dfs.name.dir to list multiple directories,
+ where namenode data is to be replicated. (Milind Bhandarkar via cutting)
+
+27. HADOOP-651. Fix DFSck to correctly pass parameters to the servlet
+    on the namenode. (Milind Bhandarkar via cutting)
+
+28. HADOOP-553. Change main() routines of DataNode and NameNode to
+ log exceptions rather than letting the JVM print them to standard
+ error. Also, change the hadoop-daemon.sh script to rotate
+ standard i/o log files. (Raghu Angadi via cutting)
+
+29. HADOOP-399. Fix javadoc warnings. (Nigel Daley via cutting)
+
+30. HADOOP-599. Fix web ui and command line to correctly report DFS
+ filesystem size statistics. Also improve web layout.
+ (Raghu Angadi via cutting)
+
+31. HADOOP-660. Permit specification of junit test output format.
+ (Nigel Daley via cutting)
+
+32. HADOOP-663. Fix a few unit test issues. (Mahadev Konar via cutting)
+
+33. HADOOP-664. Cause entire build to fail if libhdfs tests fail.
+ (Nigel Daley via cutting)
+
+34. HADOOP-633. Keep jobtracker from dying when job initialization
+ throws exceptions. Also improve exception handling in a few other
+ places and add more informative thread names.
+ (omalley via cutting)
+
+35. HADOOP-669. Fix a problem introduced by HADOOP-90 that can cause
+ DFS to lose files. (Milind Bhandarkar via cutting)
+
+36. HADOOP-373. Consistently check the value returned by
+ FileSystem.mkdirs(). (Wendy Chien via cutting)
+
+37. HADOOP-670. Code cleanups in some DFS internals: use generic
+ types, replace Vector with ArrayList, etc.
+ (Konstantin Shvachko via cutting)
+
+38. HADOOP-647. Permit map outputs to use a different compression
+ type than the job output. (omalley via cutting)
+
+39. HADOOP-671. Fix file cache to check for pre-existence before
+    creating. (Mahadev Konar via cutting)
+
+40. HADOOP-665. Extend many DFSShell commands to accept multiple
+ arguments. Now commands like "ls", "rm", etc. will operate on
+ multiple files. (Dhruba Borthakur via cutting)
+
+
+Release 0.7.2 - 2006-10-18
+
+ 1. HADOOP-607. Fix a bug where classes included in job jars were not
+ found by tasks. (Mahadev Konar via cutting)
+
+ 2. HADOOP-609. Add a unit test that checks that classes in job jars
+ can be found by tasks. Also modify unit tests to specify multiple
+ local directories. (Mahadev Konar via cutting)
+
+
+Release 0.7.1 - 2006-10-11
+
+ 1. HADOOP-593. Fix a NullPointerException in the JobTracker.
+ (omalley via cutting)
+
+ 2. HADOOP-592. Fix a NullPointerException in the IPC Server. Also
+ consistently log when stale calls are discarded. (omalley via cutting)
+
+ 3. HADOOP-594. Increase the DFS safe-mode threshold from .95 to
+ .999, so that nearly all blocks must be reported before filesystem
+ modifications are permitted. (Konstantin Shvachko via cutting)
+
+ 4. HADOOP-598. Fix tasks to retry when reporting completion, so that
+ a single RPC timeout won't fail a task. (omalley via cutting)
+
+ 5. HADOOP-597. Fix TaskTracker to not discard map outputs for errors
+ in transmitting them to reduce nodes. (omalley via cutting)
+
+
+Release 0.7.0 - 2006-10-06
+
+ 1. HADOOP-243. Fix rounding in the display of task and job progress
+ so that things are not shown to be 100% complete until they are in
+ fact finished. (omalley via cutting)
+
+ 2. HADOOP-438. Limit the length of absolute paths in DFS, since the
+ file format used to store pathnames has some limitations.
+ (Wendy Chien via cutting)
+
+ 3. HADOOP-530. Improve error messages in SequenceFile when keys or
+ values are of the wrong type. (Hairong Kuang via cutting)
+
+ 4. HADOOP-288. Add a file caching system and use it in MapReduce to
+ cache job jar files on slave nodes. (Mahadev Konar via cutting)
+
+ 5. HADOOP-533. Fix unit test to not modify conf directory.
+ (Hairong Kuang via cutting)
+
+ 6. HADOOP-527. Permit specification of the local address that various
+ Hadoop daemons should bind to. (Philippe Gassmann via cutting)
+
+ 7. HADOOP-542. Updates to contrib/streaming: reformatted source code,
+ on-the-fly merge sort, a fix for HADOOP-540, etc.
+ (Michel Tourn via cutting)
+
+ 8. HADOOP-545. Remove an unused config file parameter.
+ (Philippe Gassmann via cutting)
+
+ 9. HADOOP-548. Add an Ant property "test.output" to build.xml that
+ causes test output to be logged to the console. (omalley via cutting)
+
+10. HADOOP-261. Record an error message when map output is lost.
+ (omalley via cutting)
+
+11. HADOOP-293. Report the full list of task error messages in the
+ web ui, not just the most recent. (omalley via cutting)
+
+12. HADOOP-551. Restore JobClient's console printouts to only include
+ a maximum of one update per one percent of progress.
+ (omalley via cutting)
+
+13. HADOOP-306. Add a "safe" mode to DFS. The name node enters this
+ when less than a specified percentage of file data is complete.
+ Currently safe mode is only used on startup, but eventually it
+ will also be entered when datanodes disconnect and file data
+ becomes incomplete. While in safe mode no filesystem
+ modifications are permitted and block replication is inhibited.
+ (Konstantin Shvachko via cutting)
+
+14. HADOOP-431. Change 'dfs -rm' to not operate recursively and add a
+ new command, 'dfs -rmr' which operates recursively.
+ (Sameer Paranjpye via cutting)
+
+15. HADOOP-263. Include timestamps for job transitions. The web
+ interface now displays the start and end times of tasks and the
+ start times of sorting and reducing for reduce tasks. Also,
+ extend ObjectWritable to handle enums, so that they can be passed
+ as RPC parameters. (Sanjay Dahiya via cutting)
+
+16. HADOOP-556. Contrib/streaming: send keep-alive reports to task
+ tracker every 10 seconds rather than every 100 records, to avoid
+ task timeouts. (Michel Tourn via cutting)
+
+17. HADOOP-547. Fix reduce tasks to ping tasktracker while copying
+ data, rather than only between copies, avoiding task timeouts.
+ (Sanjay Dahiya via cutting)
+
+18. HADOOP-537. Fix src/c++/libhdfs build process to create files in
+ build/, no longer modifying the source tree.
+ (Arun C Murthy via cutting)
+
+19. HADOOP-487. Throw a more informative exception for unknown RPC
+ hosts. (Sameer Paranjpye via cutting)
+
+20. HADOOP-559. Add file name globbing (pattern matching) support to
+ the FileSystem API, and use it in DFSShell ('bin/hadoop dfs')
+ commands. (Hairong Kuang via cutting)
+
+21. HADOOP-508. Fix a bug in FSDataInputStream. Incorrect data was
+ returned after seeking to a random location.
+ (Milind Bhandarkar via cutting)
+
+22. HADOOP-560. Add a "killed" task state. This can be used to
+ distinguish kills from other failures. Task state has also been
+ converted to use an enum type instead of an int, uncovering a bug
+ elsewhere. The web interface is also updated to display killed
+ tasks. (omalley via cutting)
+
+23. HADOOP-423. Normalize Paths containing directories named "." and
+    "..", using the standard Unix interpretation. Also add checks in
+    DFS, prohibiting the use of "." or ".." as directory or file
+    names. (Wendy Chien via cutting)
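+
+    A minimal sketch of the described behavior (the sample path is
+    arbitrary):
+
+        import org.apache.hadoop.fs.Path;
+
+        public class PathNormExample {
+          public static void main(String[] args) {
+            // "." and ".." are resolved with their usual Unix
+            // meaning, so this should print /logs/current.
+            System.out.println(new Path("/logs/./2006/../current"));
+          }
+        }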
+
+24. HADOOP-513. Replace map output handling with a servlet, rather
+    than a JSP page. This fixes an issue where IllegalStateExceptions
+    were logged, sets content-length correctly, and better handles
+    some errors. (omalley via cutting)
+
+25. HADOOP-552. Improved error checking when copying map output files
+ to reduce nodes. (omalley via cutting)
+
+26. HADOOP-566. Fix scripts to work correctly when accessed through
+ relative symbolic links. (Lee Faris via cutting)
+
+27. HADOOP-519. Add positioned read methods to FSInputStream. These
+ permit one to read from a stream without moving its position, and
+ can hence be performed by multiple threads at once on a single
+ stream. Implement an optimized version for DFS and local FS.
+ (Milind Bhandarkar via cutting)
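+
+    A hedged sketch of a positioned read through FSDataInputStream,
+    assuming the read(long, byte[], int, int) form described here;
+    the input path comes from the command line:
+
+        import org.apache.hadoop.conf.Configuration;
+        import org.apache.hadoop.fs.FSDataInputStream;
+        import org.apache.hadoop.fs.FileSystem;
+        import org.apache.hadoop.fs.Path;
+
+        public class PositionedReadExample {
+          public static void main(String[] args) throws Exception {
+            FileSystem fs = FileSystem.get(new Configuration());
+            FSDataInputStream in = fs.open(new Path(args[0]));
+            byte[] buf = new byte[64];
+            // Read at offset 1024 without moving the stream position,
+            // so several threads could share the same stream.
+            int n = in.read(1024L, buf, 0, buf.length);
+            System.out.println("read " + n + " bytes; pos=" + in.getPos());
+            in.close();
+          }
+        }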
+
+28. HADOOP-522. Permit block compression with MapFile and SetFile.
+ Since these formats are always sorted, block compression can
+ provide a big advantage. (cutting)
+
+29. HADOOP-567. Record version and revision information in builds. A
+    package manifest is added to the generated jar file containing
+    version information, and a VersionInfo utility is added that
+    includes further information, including the build date and user,
+    and the subversion revision and repository. A 'bin/hadoop
+    version' command is added to show this information, and it is also
+    added to various web interfaces. (omalley via cutting)
+
+30. HADOOP-568. Fix so that errors while initializing tasks on a
+ tasktracker correctly report the task as failed to the jobtracker,
+ so that it will be rescheduled. (omalley via cutting)
+
+31. HADOOP-550. Disable automatic UTF-8 validation in Text. This
+ permits, e.g., TextInputFormat to again operate on non-UTF-8 data.
+ (Hairong and Mahadev via cutting)
+
+32. HADOOP-343. Fix mapred copying so that a failed tasktracker
+    doesn't cause other copies to slow down. (Sameer Paranjpye via cutting)
+
+33. HADOOP-239. Add a persistent job history mechanism, so that basic
+ job statistics are not lost after 24 hours and/or when the
+ jobtracker is restarted. (Sanjay Dahiya via cutting)
+
+34. HADOOP-506. Ignore heartbeats from stale task trackers.
+ (Sanjay Dahiya via cutting)
+
+35. HADOOP-255. Discard stale, queued IPC calls. Do not process
+ calls whose clients will likely time out before they receive a
+ response. When the queue is full, new calls are now received and
+ queued, and the oldest calls are discarded, so that, when servers
+ get bogged down, they no longer develop a backlog on the socket.
+ This should improve some DFS namenode failure modes.
+ (omalley via cutting)
+
+36. HADOOP-581. Fix datanode to not reset itself on communications
+ errors with the namenode. If a request to the namenode fails, the
+ datanode should retry, not restart. This reduces the load on the
+ namenode, since restarts cause a resend of the block report.
+ (omalley via cutting)
+
+
+Release 0.6.2 - 2006-09-18
+
+ 1. HADOOP-532. Fix a bug reading value-compressed sequence files,
+    where an exception was thrown reporting that the full value had
+    not been read. (omalley via cutting)
+
+ 2. HADOOP-534. Change the default value class in JobConf to be Text
+    instead of the now-deprecated UTF8. This fixes the Grep example
+    program, which was updated to use Text, but relies on this
+    default. (Hairong Kuang via cutting)
+
+
+Release 0.6.1 - 2006-09-13
+
+ 1. HADOOP-520. Fix a bug in libhdfs, where write failures were not
+ correctly returning error codes. (Arun C Murthy via cutting)
+
+ 2. HADOOP-523. Fix a NullPointerException when TextInputFormat is
+ explicitly specified. Also add a test case for this.
+ (omalley via cutting)
+
+ 3. HADOOP-521. Fix another NullPointerException finding the
+ ClassLoader when using libhdfs. (omalley via cutting)
+
+ 4. HADOOP-526. Fix a NullPointerException when attempting to start
+ two datanodes in the same directory. (Milind Bhandarkar via cutting)
+
+ 5. HADOOP-529. Fix a NullPointerException when opening
+ value-compressed sequence files generated by pre-0.6.0 Hadoop.
+ (omalley via cutting)
+
+
+Release 0.6.0 - 2006-09-08
+
+ 1. HADOOP-427. Replace some uses of DatanodeDescriptor in the DFS
+ web UI code with DatanodeInfo, the preferred public class.
+ (Devaraj Das via cutting)
+
+ 2. HADOOP-426. Fix streaming contrib module to work correctly on
+ Solaris. This was causing nightly builds to fail.
+ (Michel Tourn via cutting)
+
+ 3. HADOOP-400. Improvements to task assignment. Tasks are no longer
+ re-run on nodes where they have failed (unless no other node is
+ available). Also, tasks are better load-balanced among nodes.
+ (omalley via cutting)
+
+ 4. HADOOP-324. Fix datanode to not exit when a disk is full, but
+ rather simply to fail writes. (Wendy Chien via cutting)
+
+ 5. HADOOP-434. Change smallJobsBenchmark to use standard Hadoop
+ scripts. (Sanjay Dahiya via cutting)
+
+ 6. HADOOP-453. Fix a bug in Text.setCapacity(). (Sami Siren via cutting)
+
+ 7. HADOOP-450. Change so that input types are determined by the
+ RecordReader rather than specified directly in the JobConf. This
+ facilitates jobs with a variety of input types.
+
+ WARNING: This contains incompatible API changes! The RecordReader
+ interface has two new methods that all user-defined InputFormats
+ must now define. Also, the values returned by TextInputFormat are
+ no longer of class UTF8, but now of class Text.
+
+ 8. HADOOP-436. Fix an error-handling bug in the web ui.
+ (Devaraj Das via cutting)
+
+ 9. HADOOP-455. Fix a bug in Text, where DEL was not permitted.
+ (Hairong Kuang via cutting)
+
+10. HADOOP-456. Change the DFS namenode to keep a persistent record
+ of the set of known datanodes. This will be used to implement a
+ "safe mode" where filesystem changes are prohibited when a
+ critical percentage of the datanodes are unavailable.
+ (Konstantin Shvachko via cutting)
+
+11. HADOOP-322. Add a job control utility. This permits one to
+ specify job interdependencies. Each job is submitted only after
+ the jobs it depends on have successfully completed.
+ (Runping Qi via cutting)
+
+12. HADOOP-176. Fix a bug in IntWritable.Comparator.
+ (Dick King via cutting)
+
+13. HADOOP-421. Replace uses of String in recordio package with Text
+ class, for improved handling of UTF-8 data.
+ (Milind Bhandarkar via cutting)
+
+14. HADOOP-464. Improved error message when job jar not found.
+ (Michel Tourn via cutting)
+
+15. HADOOP-469. Fix /bin/bash specifics that have crept into our
+ /bin/sh scripts since HADOOP-352.
+ (Jean-Baptiste Quenot via cutting)
+
+16. HADOOP-468. Add HADOOP_NICENESS environment variable to set
+ scheduling priority for daemons. (Vetle Roeim via cutting)
+
+17. HADOOP-473. Fix TextInputFormat to correctly handle more EOL
+ formats. Things now work correctly with CR, LF or CRLF.
+ (Dennis Kubes & James White via cutting)
+
+18. HADOOP-461. Make Java 1.5 an explicit requirement. (cutting)
+
+19. HADOOP-54. Add block compression to SequenceFile. One may now
+ specify that blocks of keys and values are compressed together,
+ improving compression for small keys and values.
+ SequenceFile.Writer's constructor is now deprecated and replaced
+ with a factory method. (Arun C Murthy via cutting)
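+
+    A minimal sketch, assuming a createWriter factory overload taking
+    a CompressionType (path and record contents are arbitrary):
+
+        import org.apache.hadoop.conf.Configuration;
+        import org.apache.hadoop.fs.FileSystem;
+        import org.apache.hadoop.fs.Path;
+        import org.apache.hadoop.io.SequenceFile;
+        import org.apache.hadoop.io.Text;
+
+        public class BlockCompressedWrite {
+          public static void main(String[] args) throws Exception {
+            Configuration conf = new Configuration();
+            FileSystem fs = FileSystem.get(conf);
+            // Factory method in place of the deprecated constructor;
+            // BLOCK compresses batches of keys and values together.
+            SequenceFile.Writer writer = SequenceFile.createWriter(
+                fs, conf, new Path("demo.seq"), Text.class, Text.class,
+                SequenceFile.CompressionType.BLOCK);
+            writer.append(new Text("key"), new Text("value"));
+            writer.close();
+          }
+        }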
+
+20. HADOOP-281. Prohibit DFS files that are also directories.
+ (Wendy Chien via cutting)
+
+21. HADOOP-486. Add the job username to JobStatus instances returned
+ by JobClient. (Mahadev Konar via cutting)
+
+22. HADOOP-437. contrib/streaming: Add support for gzipped inputs.
+ (Michel Tourn via cutting)
+
+23. HADOOP-463. Add variable expansion to config files.
+    Configuration property values may now contain variable
+    expressions. A variable is referenced with the syntax
+    '${variable}'. Variable values are found first in the
+    configuration, and then in Java system properties. The default
+    configuration is modified so that temporary directories are now
+    under ${hadoop.tmp.dir}, which is, by default,
+    /tmp/hadoop-${user.name}. (Michel Tourn via cutting)
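+
+    A small sketch of the expansion rules (property names are made up):
+
+        import org.apache.hadoop.conf.Configuration;
+
+        public class ExpansionExample {
+          public static void main(String[] args) {
+            Configuration conf = new Configuration();
+            conf.set("base.dir", "/tmp/hadoop-${user.name}");
+            conf.set("log.dir", "${base.dir}/logs");
+            // Variables are expanded on get(): first against the
+            // configuration itself, then against system properties,
+            // e.g. /tmp/hadoop-alice/logs for user "alice".
+            System.out.println(conf.get("log.dir"));
+          }
+        }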
+
+24. HADOOP-419. Fix a NullPointerException finding the ClassLoader
+ when using libhdfs. (omalley via cutting)
+
+25. HADOOP-460. Fix contrib/smallJobsBenchmark to use Text instead of
+ UTF8. (Sanjay Dahiya via cutting)
+
+26. HADOOP-196. Fix Configuration(Configuration) constructor to work
+ correctly. (Sami Siren via cutting)
+
+27. HADOOP-501. Fix Configuration.toString() to handle URL resources.
+ (Thomas Friol via cutting)
+
+28. HADOOP-499. Reduce the use of Strings in contrib/streaming,
+ replacing them with Text for better performance.
+ (Hairong Kuang via cutting)
+
+29. HADOOP-64. Manage multiple volumes with a single DataNode.
+ Previously DataNode would create a separate daemon per configured
+ volume, each with its own connection to the NameNode. Now all
+ volumes are handled by a single DataNode daemon, reducing the load
+ on the NameNode. (Milind Bhandarkar via cutting)
+
+30. HADOOP-424. Fix MapReduce so that jobs which generate zero splits
+    do not fail. (Frédéric Bertin via cutting)
+
+31. HADOOP-408. Adjust some timeouts and remove some others so that
+ unit tests run faster. (cutting)
+
+32. HADOOP-507. Fix an IllegalAccessException in DFS.
+ (omalley via cutting)
+
+33. HADOOP-320. Fix so that checksum files are correctly copied when
+ the destination of a file copy is a directory.
+ (Hairong Kuang via cutting)
+
+34. HADOOP-286. In DFSClient, avoid pinging the NameNode with
+ renewLease() calls when no files are being written.
+ (Konstantin Shvachko via cutting)
+
+35. HADOOP-312. Close idle IPC connections. All IPC connections were
+ cached forever. Now, after a connection has been idle for more
+ than a configurable amount of time (one second by default), the
+ connection is closed, conserving resources on both client and
+ server. (Devaraj Das via cutting)
+
+36. HADOOP-497. Permit the specification of the network interface and
+ nameserver to be used when determining the local hostname
+ advertised by datanodes and tasktrackers.
+ (Lorenzo Thione via cutting)
+
+37. HADOOP-441. Add a compression codec API and extend SequenceFile
+ to use it. This will permit the use of alternate compression
+ codecs in SequenceFile. (Arun C Murthy via cutting)
+
+38. HADOOP-483. Improvements to libhdfs build and documentation.
+ (Arun C Murthy via cutting)
+
+39. HADOOP-458. Fix a memory corruption bug in libhdfs.
+ (Arun C Murthy via cutting)
+
+40. HADOOP-517. Fix a contrib/streaming bug in end-of-line detection.
+ (Hairong Kuang via cutting)
+
+41. HADOOP-474. Add CompressionCodecFactory, and use it in
+    TextInputFormat and TextOutputFormat. Compressed input files are
+    automatically decompressed when they have the correct extension.
+    Output files will, when output compression is specified, be
+    generated with an appropriate extension. Also add a gzip codec and
+    fix problems with UTF8 text inputs. (omalley via cutting)
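+
+    A minimal sketch of the extension-based lookup (the file name is
+    arbitrary):
+
+        import org.apache.hadoop.conf.Configuration;
+        import org.apache.hadoop.fs.Path;
+        import org.apache.hadoop.io.compress.CompressionCodec;
+        import org.apache.hadoop.io.compress.CompressionCodecFactory;
+
+        public class CodecLookupExample {
+          public static void main(String[] args) {
+            CompressionCodecFactory codecs =
+                new CompressionCodecFactory(new Configuration());
+            // The codec is chosen from the file extension, ".gz" here.
+            CompressionCodec codec = codecs.getCodec(new Path("part-0.gz"));
+            System.out.println(codec == null
+                ? "no codec" : codec.getClass().getName());
+          }
+        }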
+
+
+Release 0.5.0 - 2006-08-04
+
+ 1. HADOOP-352. Fix shell scripts to use /bin/sh instead of
+ /bin/bash, for better portability.
+ (Jean-Baptiste Quenot via cutting)
+
+ 2. HADOOP-313. Permit task state to be saved so that single tasks
+ may be manually re-executed when debugging. (omalley via cutting)
+
+ 3. HADOOP-339. Add method to JobClient API listing jobs that are
+ not yet complete, i.e., that are queued or running.
+ (Mahadev Konar via cutting)
+
+ 4. HADOOP-355. Updates to the streaming contrib module, including
+ API fixes, making reduce optional, and adding an input type for
+ StreamSequenceRecordReader. (Michel Tourn via cutting)
+
+ 5. HADOOP-358. Fix an NPE bug in Path.equals().
+    (Frédéric Bertin via cutting)
+
+ 6. HADOOP-327. Fix ToolBase to not call System.exit() when
+ exceptions are thrown. (Hairong Kuang via cutting)
+
+ 7. HADOOP-359. Permit map output to be compressed.
+ (omalley via cutting)
+
+ 8. HADOOP-341. Permit input URI to CopyFiles to use the HTTP
+ protocol. This lets one, e.g., more easily copy log files into
+ DFS. (Arun C Murthy via cutting)
+
+ 9. HADOOP-361. Remove unix dependencies from streaming contrib
+ module tests, making them pure java. (Michel Tourn via cutting)
+
+10. HADOOP-354. Make public methods to stop DFS daemons.
+ (Barry Kaplan via cutting)
+
+11. HADOOP-252. Add versioning to RPC protocols.
+ (Milind Bhandarkar via cutting)
+
+12. HADOOP-356. Add contrib to "compile" and "test" build targets, so
+ that this code is better maintained. (Michel Tourn via cutting)
+
+13. HADOOP-307. Add smallJobsBenchmark contrib module. This runs
+ lots of small jobs, in order to determine per-task overheads.
+ (Sanjay Dahiya via cutting)
+
+14. HADOOP-342. Add a tool for log analysis: Logalyzer.
+ (Arun C Murthy via cutting)
+
+15. HADOOP-347. Add web-based browsing of DFS content. The namenode
+ redirects browsing requests to datanodes. Content requests are
+ redirected to datanodes where the data is local when possible.
+ (Devaraj Das via cutting)
+
+16. HADOOP-351. Make Hadoop IPC kernel independent of Jetty.
+ (Devaraj Das via cutting)
+
+17. HADOOP-237. Add metric reporting to DFS and MapReduce. With only
+ minor configuration changes, one can now monitor many Hadoop
+ system statistics using Ganglia or other monitoring systems.
+ (Milind Bhandarkar via cutting)
+
+18. HADOOP-376. Fix datanode's HTTP server to scan for a free port.
+ (omalley via cutting)
+
+19. HADOOP-260. Add --config option to shell scripts, specifying an
+ alternate configuration directory. (Milind Bhandarkar via cutting)
+
+20. HADOOP-381. Permit developers to save the temporary files for
+    tasks whose names match a regular expression, to facilitate
+    debugging. (omalley via cutting)
+
+21. HADOOP-344. Fix some Windows-related problems with DF.
+ (Konstantin Shvachko via cutting)
+
+22. HADOOP-380. Fix reduce tasks to poll less frequently for map
+ outputs. (Mahadev Konar via cutting)
+
+23. HADOOP-321. Refactor DatanodeInfo, in preparation for
+ HADOOP-306. (Konstantin Shvachko & omalley via cutting)
+
+24. HADOOP-385. Fix some bugs in record io code generation.
+ (Milind Bhandarkar via cutting)
+
+25. HADOOP-302. Add new Text class to replace UTF8, removing
+ limitations of that class. Also refactor utility methods for
+ writing zero-compressed integers (VInts and VLongs).
+ (Hairong Kuang via cutting)
+
+26. HADOOP-335. Refactor DFS namespace/transaction logging in
+ namenode. (Konstantin Shvachko via cutting)
+
+27. HADOOP-375. Fix handling of the datanode HTTP daemon's port so
+    that multiple datanodes can be run on a single host.
+    (Devaraj Das via cutting)
+
+28. HADOOP-386. When removing excess DFS block replicas, remove those
+ on nodes with the least free space first.
+ (Johan Oskarson via cutting)
+
+29. HADOOP-389. Fix intermittent failures of mapreduce unit tests.
+ Also fix some build dependencies.
+ (Mahadev & Konstantin via cutting)
+
+30. HADOOP-362. Fix a problem where jobs hang when status messages
+    are received out-of-order. (omalley via cutting)
+
+31. HADOOP-394. Change order of DFS shutdown in unit tests to
+ minimize errors logged. (Konstantin Shvachko via cutting)
+
+32. HADOOP-396. Make DatanodeID implement Writable.
+ (Konstantin Shvachko via cutting)
+
+33. HADOOP-377. Permit one to add URL resources to a Configuration.
+ (Jean-Baptiste Quenot via cutting)
+
+34. HADOOP-345. Permit iteration over Configuration key/value pairs.
+ (Michel Tourn via cutting)
+
+35. HADOOP-409. Streaming contrib module: make configuration
+ properties available to commands as environment variables.
+ (Michel Tourn via cutting)
+
+36. HADOOP-369. Add -getmerge option to dfs command that appends all
+ files in a directory into a single local file.
+ (Johan Oskarson via cutting)
+
+37. HADOOP-410. Replace some TreeMaps with HashMaps in DFS, for
+ a 17% performance improvement. (Milind Bhandarkar via cutting)
+
+38. HADOOP-411. Add unit tests for command line parser.
+ (Hairong Kuang via cutting)
+
+39. HADOOP-412. Add MapReduce input formats that support filtering
+ of SequenceFile data, including sampling and regex matching.
+ Also, move JobConf.newInstance() to a new utility class.
+ (Hairong Kuang via cutting)
+
+40. HADOOP-226. Fix fsck command to properly consider replication
+ counts, now that these can vary per file. (Bryan Pendleton via cutting)
+
+41. HADOOP-425. Add a Python MapReduce example, using Jython.
+ (omalley via cutting)
+
+
+Release 0.4.0 - 2006-06-28
+
+ 1. HADOOP-298. Improved progress reports for CopyFiles utility, the
+ distributed file copier. (omalley via cutting)
+
+ 2. HADOOP-299. Fix the task tracker, permitting multiple jobs to
+ more easily execute at the same time. (omalley via cutting)
+
+ 3. HADOOP-250. Add an HTTP user interface to the namenode, running
+ on port 50070. (Devaraj Das via cutting)
+
+ 4. HADOOP-123. Add MapReduce unit tests that run a jobtracker and
+ tasktracker, greatly increasing code coverage.
+ (Milind Bhandarkar via cutting)
+
+ 5. HADOOP-271. Add links from jobtracker's web ui to tasktracker's
+ web ui. Also attempt to log a thread dump of child processes
+ before they're killed. (omalley via cutting)
+
+ 6. HADOOP-210. Change RPC server to use a selector instead of a
+ thread per connection. This should make it easier to scale to
+ larger clusters. Note that this incompatibly changes the RPC
+ protocol: clients and servers must both be upgraded to the new
+ version to ensure correct operation. (Devaraj Das via cutting)
+
+ 7. HADOOP-311. Change DFS client to retry failed reads, so that a
+ single read failure will not alone cause failure of a task.
+ (omalley via cutting)
+
+ 8. HADOOP-314. Remove the "append" phase when reducing. Map output
+ files are now directly passed to the sorter, without first
+ appending them into a single file. Now, the first third of reduce
+ progress is "copy" (transferring map output to reduce nodes), the
+ middle third is "sort" (sorting map output) and the last third is
+ "reduce" (generating output). Long-term, the "sort" phase will
+ also be removed. (omalley via cutting)
+
+ 9. HADOOP-316. Fix a potential deadlock in the jobtracker.
+ (omalley via cutting)
+
+10. HADOOP-319. Fix FileSystem.close() to remove the FileSystem
+ instance from the cache. (Hairong Kuang via cutting)
+
+11. HADOOP-135. Fix potential deadlock in JobTracker by acquiring
+ locks in a consistent order. (omalley via cutting)
+
+12. HADOOP-278. Check for existence of input directories before
+ starting MapReduce jobs, making it easier to debug this common
+ error. (omalley via cutting)
+
+13. HADOOP-304. Improve error message for
+ UnregisterdDatanodeException to include expected node name.
+ (Konstantin Shvachko via cutting)
+
+14. HADOOP-305. Fix TaskTracker to ask for new tasks as soon as a
+ task is finished, rather than waiting for the next heartbeat.
+ This improves performance when tasks are short.
+ (Mahadev Konar via cutting)
+
+15. HADOOP-59. Add support for generic command line options. One may
+ now specify the filesystem (-fs), the MapReduce jobtracker (-jt),
+ a config file (-conf) or any configuration property (-D). The
+ "dfs", "fsck", "job", and "distcp" commands currently support
+ this, with more to be added. (Hairong Kuang via cutting)
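+
+    Illustrative invocations (host names and paths are hypothetical):
+
+        bin/hadoop dfs -fs namenode.example.com:8020 -ls /
+        bin/hadoop dfs -conf my-hadoop-site.xml \
+            -D dfs.replication=2 -put local.txt /data/local.txt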
+
+16. HADOOP-296. Permit specification of the amount of reserved space
+ on a DFS datanode. One may specify both the percentage free and
+ the number of bytes. (Johan Oskarson via cutting)
+
+17. HADOOP-325. Fix a problem initializing RPC parameter classes, and
+ remove the workaround used to initialize classes.
+ (omalley via cutting)
+
+18. HADOOP-328. Add an option to the "distcp" command to ignore read
+ errors while copying. (omalley via cutting)
+
+19. HADOOP-27. Don't allocate tasks to trackers whose local free
+ space is too low. (Johan Oskarson via cutting)
+
+20. HADOOP-318. Keep slow DFS output from causing task timeouts.
+ This incompatibly changes some public interfaces, adding a
+ parameter to OutputFormat.getRecordWriter() and the new method
+ Reporter.progress(), but it makes lots of tasks succeed that were
+ previously failing. (Milind Bhandarkar via cutting)
+
+
+Release 0.3.2 - 2006-06-09
+
+ 1. HADOOP-275. Update the streaming contrib module to use log4j for
+ its logging. (Michel Tourn via cutting)
+
+ 2. HADOOP-279. Provide defaults for log4j logging parameters, so
+ that things still work reasonably when Hadoop-specific system
+ properties are not provided. (omalley via cutting)
+
+ 3. HADOOP-280. Fix a typo in AllTestDriver which caused the wrong
+ test to be run when "DistributedFSCheck" was specified.
+ (Konstantin Shvachko via cutting)
+
+ 4. HADOOP-240. DFS's mkdirs() implementation no longer logs a warning
+ when the directory already exists. (Hairong Kuang via cutting)
+
+ 5. HADOOP-285. Fix DFS datanodes to be able to re-join the cluster
+ after the connection to the namenode is lost. (omalley via cutting)
+
+ 6. HADOOP-277. Fix a race condition when creating directories.
+ (Sameer Paranjpye via cutting)
+
+ 7. HADOOP-289. Improved exception handling in DFS datanode.
+ (Konstantin Shvachko via cutting)
+
+ 8. HADOOP-292. Fix client-side logging to go to standard error
+ rather than standard output, so that it can be distinguished from
+ application output. (omalley via cutting)
+
+ 9. HADOOP-294. Fixed bug where conditions for retrying after errors
+ in the DFS client were reversed. (omalley via cutting)
+
+
+Release 0.3.1 - 2006-06-05
+
+ 1. HADOOP-272. Fix a bug in bin/hadoop setting log
+ parameters. (omalley & cutting)
+
+ 2. HADOOP-274. Change applications to log to standard output rather
+ than to a rolling log file like daemons. (omalley via cutting)
+
+ 3. HADOOP-262. Fix reduce tasks to report progress while they're
+ waiting for map outputs, so that they do not time out.
+ (Mahadev Konar via cutting)
+
+ 4. HADOOP-245 and HADOOP-246. Improvements to record io package.
+ (Mahadev Konar via cutting)
+
+ 5. HADOOP-276. Add logging config files to jar file so that they're
+ always found. (omalley via cutting)
+
+
+Release 0.3.0 - 2006-06-02
+
+ 1. HADOOP-208. Enhance MapReduce web interface, adding new pages
+ for failed tasks, and tasktrackers. (omalley via cutting)
+
+ 2. HADOOP-204. Tweaks to metrics package. (David Bowen via cutting)
+
+ 3. HADOOP-209. Add a MapReduce-based file copier. This will
+ copy files within or between file systems in parallel.
+ (Milind Bhandarkar via cutting)
+
+ 4. HADOOP-146. Fix DFS to check when randomly generating a new block
+ id that no existing blocks already have that id.
+ (Milind Bhandarkar via cutting)
+
+ 5. HADOOP-180. Make a daemon thread that does the actual task
+    cleanups, so that the main offerService thread in the TaskTracker
+    doesn't get stuck and miss its heartbeat window. This was killing
+    many task trackers as big jobs finished (300+ tasks/node).
+    (omalley via cutting)
+
+ 6. HADOOP-200. Avoid transmitting entire list of map task names to
+ reduce tasks. Instead just transmit the number of map tasks and
+ henceforth refer to them by number when collecting map output.
+ (omalley via cutting)
+
+ 7. HADOOP-219. Fix a NullPointerException when handling a checksum
+ exception under SequenceFile.Sorter.sort(). (cutting & stack)
+
+ 8. HADOOP-212. Permit alteration of the file block size in DFS. The
+ default block size for new files may now be specified in the
+ configuration with the dfs.block.size property. The block size
+ may also be specified when files are opened.
+ (omalley via cutting)
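+
+    A minimal sketch, assuming the long-valued dfs.block.size property
+    is read from the client configuration (the path is arbitrary):
+
+        import org.apache.hadoop.conf.Configuration;
+        import org.apache.hadoop.fs.FileSystem;
+        import org.apache.hadoop.fs.Path;
+
+        public class BlockSizeExample {
+          public static void main(String[] args) throws Exception {
+            Configuration conf = new Configuration();
+            // Default block size for newly created files: 64 MB.
+            conf.setLong("dfs.block.size", 64L * 1024 * 1024);
+            FileSystem fs = FileSystem.get(conf);
+            fs.create(new Path("/demo/file.dat")).close();
+          }
+        }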
+
+ 9. HADOOP-218. Avoid accessing configuration while looping through
+ tasks in JobTracker. (Mahadev Konar via cutting)
+
+10. HADOOP-161. Add hashCode() method to DFS's Block.
+ (Milind Bhandarkar via cutting)
+
+11. HADOOP-115. Map output types may now be specified. These are also
+ used as reduce input types, thus permitting reduce input types to
+ differ from reduce output types. (Runping Qi via cutting)
+
+12. HADOOP-216. Add task progress to task status page.
+    (Bryan Pendleton via cutting)
+
+13. HADOOP-233. Add web server to task tracker that shows running
+ tasks and logs. Also add log access to job tracker web interface.
+ (omalley via cutting)
+
+14. HADOOP-205. Incorporate pending tasks into tasktracker load
+ calculations. (Mahadev Konar via cutting)
+
+15. HADOOP-247. Fix sort progress to better handle exceptions.
+ (Mahadev Konar via cutting)
+
+16. HADOOP-195. Improve performance of the transfer of map outputs to
+ reduce nodes by performing multiple transfers in parallel, each on
+ a separate socket. (Sameer Paranjpye via cutting)
+
+17. HADOOP-251. Fix task processes to be tolerant of failed progress
+ reports to their parent process. (omalley via cutting)
+
+18. HADOOP-325. Improve the FileNotFound exceptions thrown by
+ LocalFileSystem to include the name of the file.
+ (Benjamin Reed via cutting)
+
+19. HADOOP-254. Use HTTP to transfer map output data to reduce
+ nodes. This, together with HADOOP-195, greatly improves the
+ performance of these transfers. (omalley via cutting)
+
+20. HADOOP-163. Cause datanodes that are unable to either read or
+    write data to exit, so that the namenode will no longer target
+    them for new blocks and will replicate their data on other nodes.
+    (Hairong Kuang via cutting)
+
+21. HADOOP-222. Add a -setrep option to the dfs commands that alters
+ file replication levels. (Johan Oskarson via cutting)
+
+22. HADOOP-75. In DFS, only check for a complete file when the file
+ is closed, rather than as each block is written.
+ (Milind Bhandarkar via cutting)
+
+23. HADOOP-124. Change DFS so that datanodes are identified by a
+ persistent ID rather than by host and port. This solves a number
+ of filesystem integrity problems, when, e.g., datanodes are
+ restarted. (Konstantin Shvachko via cutting)
+
+24. HADOOP-256. Add a C API for DFS. (Arun C Murthy via cutting)
+
+25. HADOOP-211. Switch to using Jakarta Commons logging internally,
+    configured to use log4j by default. (Arun C Murthy and cutting)
+
+26. HADOOP-265. Tasktracker now fails to start if it does not have a
+ writable local directory for temporary files. In this case, it
+ logs a message to the JobTracker and exits. (Hairong Kuang via cutting)
+
+27. HADOOP-270. Fix potential deadlock in datanode shutdown.
+ (Hairong Kuang via cutting)
+
+
+Release 0.2.1 - 2006-05-12
+
+ 1. HADOOP-199. Fix reduce progress (broken by HADOOP-182).
+ (omalley via cutting)
+
+ 2. HADOOP-201. Fix 'bin/hadoop dfs -report'. (cutting)
+
+ 3. HADOOP-207. Fix JDK 1.4 incompatibility introduced by HADOOP-96.
+ System.getenv() does not work in JDK 1.4. (Hairong Kuang via cutting)
+
+
+Release 0.2.0 - 2006-05-05
+
+ 1. Fix HADOOP-126. 'bin/hadoop dfs -cp' now correctly copies .crc
+ files. (Konstantin Shvachko via cutting)
+
+ 2. Fix HADOOP-51. Change DFS to support per-file replication counts.
+ (Konstantin Shvachko via cutting)
+
+ 3. Fix HADOOP-131. Add scripts to start/stop dfs and mapred daemons.
+ Use these in start/stop-all scripts. (Chris Mattmann via cutting)
+
+ 4. Stop using ssh options by default that are not yet in widely used
+ versions of ssh. Folks can still enable their use by uncommenting
+ a line in conf/hadoop-env.sh. (cutting)
+
+ 5. Fix HADOOP-92. Show information about all attempts to run each
+    task in the web ui. (Mahadev Konar via cutting)
+
+ 6. Fix HADOOP-128. Improved DFS error handling. (Owen O'Malley via cutting)
+
+ 7. Fix HADOOP-129. Replace uses of java.io.File with new class named
+ Path. This fixes bugs where java.io.File methods were called
+ directly when FileSystem methods were desired, and reduces the
+ likelihood of such bugs in the future. It also makes the handling
+ of pathnames more consistent between local and dfs FileSystems and
+ between Windows and Unix. java.io.File-based methods are still
+ available for back-compatibility, but are deprecated and will be
+ removed once 0.2 is released. (cutting)
+
+ 8. Change dfs.data.dir and mapred.local.dir to be comma-separated
+    lists of directories, no longer space-separated. This fixes
+    several bugs on Windows. (cutting)
+
+ 9. Fix HADOOP-144. Use mapred task id for dfs client id, to
+ facilitate debugging. (omalley via cutting)
+
+10. Fix HADOOP-143. Do not line-wrap stack-traces in web ui.
+ (omalley via cutting)
+
+11. Fix HADOOP-118. In DFS, improve clean up of abandoned file
+ creations. (omalley via cutting)
+
+12. Fix HADOOP-138. Stop multiple tasks in a single heartbeat, rather
+ than one per heartbeat. (Stefan via cutting)
+
+13. Fix HADOOP-139. Remove a potential deadlock in
+ LocalFileSystem.lock(). (Igor Bolotin via cutting)
+
+14. Fix HADOOP-134. Don't hang jobs when the tasktracker is
+ misconfigured to use an un-writable local directory. (omalley via cutting)
+
+15. Fix HADOOP-115. Correct an error message. (Stack via cutting)
+
+16. Fix HADOOP-133. Retry pings from child to parent, in case of
+    (local) communication problems. Also log exit status, so that one
+    can distinguish patricide from other deaths. (omalley via cutting)
+
+17. Fix HADOOP-142. Avoid re-running a task on a host where it has
+ previously failed. (omalley via cutting)
+
+18. Fix HADOOP-148. Maintain a task failure count for each
+ tasktracker and display it in the web ui. (omalley via cutting)
+
+19. Fix HADOOP-151. Close a potential socket leak, where new IPC
+ connection pools were created per configuration instance that RPCs
+ use. Now a global RPC connection pool is used again, as
+ originally intended. (cutting)
+
+20. Fix HADOOP-69. Don't throw a NullPointerException when getting
+    hints for a non-existing file split. (Bryan Pendleton via cutting)
+
+21. Fix HADOOP-157. When a task that writes dfs files (e.g., a reduce
+ task) failed and was retried, it would fail again and again,
+ eventually failing the job. The problem was that dfs did not yet
+ know that the failed task had abandoned the files, and would not
+ yet let another task create files with the same names. Dfs now
+ retries when creating a file long enough for locks on abandoned
+ files to expire. (omalley via cutting)
+
+22. Fix HADOOP-150. Improved task names that include job
+ names. (omalley via cutting)
+
+23. Fix HADOOP-162. Fix ConcurrentModificationException when
+ releasing file locks. (omalley via cutting)
+
+24. Fix HADOOP-132. Initial check-in of new Metrics API, including
+ implementations for writing metric data to a file and for sending
+ it to Ganglia. (David Bowen via cutting)
+
+25. Fix HADOOP-160. Remove some unneeded synchronization around
+ time-consuming operations in the TaskTracker. (omalley via cutting)
+
+26. Fix HADOOP-166. RPCs failed when passed subclasses of a declared
+ parameter type. This is fixed by changing ObjectWritable to store
+ both the declared type and the instance type for Writables. Note
+ that this incompatibly changes the format of ObjectWritable and
+ will render unreadable any ObjectWritables stored in files.
+ Nutch only uses ObjectWritable in intermediate files, so this
+ should not be a problem for Nutch. (Stefan & cutting)
+
+27. Fix HADOOP-168. MapReduce RPC protocol methods should all declare
+ IOException, so that timeouts are handled appropriately.
+ (omalley via cutting)
+
+28. Fix HADOOP-169. Don't fail a reduce task if a call to the
+ jobtracker to locate map outputs fails. (omalley via cutting)
+
+29. Fix HADOOP-170. Permit FileSystem clients to examine and modify
+ the replication count of individual files. Also fix a few
+ replication-related bugs. (Konstantin Shvachko via cutting)
+
+30. Permit specification of higher replication levels for job
+ submission files (job.xml and job.jar). This helps with large
+ clusters, since these files are read by every node. (cutting)
+
+31. HADOOP-173. Optimize allocation of tasks with local data. (cutting)
+
+32. HADOOP-167. Reduce the number of Configurations and JobConfs
+ created. (omalley via cutting)
+
+33. NUTCH-256. Change FileSystem#createNewFile() to create a .crc
+ file. The lack of a .crc file was causing warnings. (cutting)
+
+34. HADOOP-174. Change JobClient to not abort job until it has failed
+ to contact the job tracker for five attempts, not just one as
+ before. (omalley via cutting)
+
+35. HADOOP-177. Change MapReduce web interface to page through tasks.
+ Previously, when jobs had more than a few thousand tasks they
+ could crash web browsers. (Mahadev Konar via cutting)
+
+36. HADOOP-178. In DFS, piggyback blockwork requests from datanodes
+ on heartbeat responses from namenode. This reduces the volume of
+ RPC traffic. Also move startup delay in blockwork from datanode
+ to namenode. This fixes a problem where restarting the namenode
+ triggered a lot of unneeded replication. (Hairong Kuang via cutting)
+
+37. HADOOP-183. If the DFS namenode is restarted with different
+ minimum and/or maximum replication counts, existing files'
+ replication counts are now automatically adjusted to be within the
+ newly configured bounds. (Hairong Kuang via cutting)
+
+38. HADOOP-186. Better error handling in TaskTracker's top-level
+ loop. Also improve calculation of time to send next heartbeat.
+ (omalley via cutting)
+
+39. HADOOP-187. Add two MapReduce examples/benchmarks. One creates
+ files containing random data. The second sorts the output of the
+ first. (omalley via cutting)
+
+40. HADOOP-185. Fix so that, when a task tracker times out making the
+ RPC asking for a new task to run, the job tracker does not think
+ that it is actually running the task returned. (omalley via cutting)
+
+41. HADOOP-190. If a child process hangs after it has reported
+ completion, its output should not be lost. (Stack via cutting)
+
+42. HADOOP-184. Re-structure some test code to better support testing
+ on a cluster. (Mahadev Konar via cutting)
+
+43. HADOOP-191. Add streaming package, Hadoop's first contrib module.
+ This permits folks to easily submit MapReduce jobs whose map and
+ reduce functions are implemented by shell commands. Use
+ 'bin/hadoop jar build/hadoop-streaming.jar' to get details.
+ (Michel Tourn via cutting)
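+ For illustration, a streaming job is submitted along these lines
+ (flag names as used by the streaming contrib; indicative only,
+ with hypothetical input/output paths):
+ bin/hadoop jar build/hadoop-streaming.jar \
+ -input in-dir -output out-dir \
+ -mapper /bin/cat -reducer /usr/bin/wc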
+
+44. HADOOP-189. Fix MapReduce in standalone configuration to
+ correctly handle job jar files that contain a lib directory with
+ nested jar files. (cutting)
+
+45. HADOOP-65. Initial version of record I/O framework that enables
+ the specification of record types and generates marshalling code
+ in both Java and C++. Generated Java code implements
+ WritableComparable, but is not yet otherwise used by
+ Hadoop. (Milind Bhandarkar via cutting)
+
+46. HADOOP-193. Add a MapReduce-based FileSystem benchmark.
+ (Konstantin Shvachko via cutting)
+
+47. HADOOP-194. Add a MapReduce-based FileSystem checker. This reads
+ every block in every file in the filesystem. (Konstantin Shvachko
+ via cutting)
+
+48. HADOOP-182. Fix so that lost task trackers do not change the
+ status of reduce tasks or completed jobs. Also fixes the progress
+ meter so that failed tasks are subtracted. (omalley via cutting)
+
+49. HADOOP-96. Logging improvements. Log files are now separate from
+ standard output and standard error files. Logs are now rolled.
+ Logging of all DFS state changes can be enabled, to facilitate
+ debugging. (Hairong Kuang via cutting)
+
+
+Release 0.1.1 - 2006-04-08
+
+ 1. Added CHANGES.txt, logging all significant changes to Hadoop. (cutting)
+
+ 2. Fix MapReduceBase.close() to throw IOException, as declared in the
+ Closeable interface. This permits subclasses which override this
+ method to throw that exception. (cutting)
+
+ 3. Fix HADOOP-117. Pathnames were mistakenly transposed in
+ JobConf.getLocalFile() causing many mapred temporary files to not
+ be removed. (Raghavendra Prabhu via cutting)
+
+ 4. Fix HADOOP-116. Clean up job submission files when jobs complete.
+ (cutting)
+
+ 5. Fix HADOOP-125. Fix handling of absolute paths on Windows. (cutting)
+
+Release 0.1.0 - 2006-04-01
+
+ 1. The first release of Hadoop.
+
diff --git a/core/lib/hadoop-0.20.0/LICENSE.txt b/core/lib/hadoop-0.20.0/LICENSE.txt
new file mode 100644
index 0000000000..59bcdbc978
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/LICENSE.txt
@@ -0,0 +1,244 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+
+APACHE HADOOP SUBCOMPONENTS:
+
+The Apache Hadoop project contains subcomponents with separate copyright
+notices and license terms. Your use of the source code for these
+subcomponents is subject to the terms and conditions of the following
+licenses.
+
+For the org.apache.hadoop.util.bloom.* classes:
+
+/**
+ *
+ * Copyright (c) 2005, European Commission project OneLab under contract
+ * 034819 (http://www.one-lab.org)
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ * - Neither the name of the University Catholique de Louvain - UCL
+ * nor the names of its contributors may be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/core/lib/hadoop-0.20.0/NOTICE.txt b/core/lib/hadoop-0.20.0/NOTICE.txt
new file mode 100644
index 0000000000..62fc5816c9
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/NOTICE.txt
@@ -0,0 +1,2 @@
+This product includes software developed by The Apache Software
+Foundation (http://www.apache.org/).
diff --git a/core/lib/hadoop-0.20.0/README.txt b/core/lib/hadoop-0.20.0/README.txt
new file mode 100644
index 0000000000..148cd31c86
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/README.txt
@@ -0,0 +1,31 @@
+For the latest information about Hadoop, please visit our website at:
+
+ http://hadoop.apache.org/core/
+
+and our wiki, at:
+
+ http://wiki.apache.org/hadoop/
+
+This distribution includes cryptographic software. The country in
+which you currently reside may have restrictions on the import,
+possession, use, and/or re-export to another country, of
+encryption software. BEFORE using any encryption software, please
+check your country's laws, regulations and policies concerning the
+import, possession, or use, and re-export of encryption software, to
+see if this is permitted. See <http://www.wassenaar.org/> for more
+information.
+
+The U.S. Government Department of Commerce, Bureau of Industry and
+Security (BIS), has classified this software as Export Commodity
+Control Number (ECCN) 5D002.C.1, which includes information security
+software using or performing cryptographic functions with asymmetric
+algorithms. The form and manner of this Apache Software Foundation
+distribution makes it eligible for export under the License Exception
+ENC Technology Software Unrestricted (TSU) exception (see the BIS
+Export Administration Regulations, Section 740.13) for both object
+code and source code.
+
+The following provides more details on the included cryptographic
+software:
+ Hadoop Core uses the SSL libraries from the Jetty project written
+ by mortbay.org.
diff --git a/core/lib/hadoop-0.20.0/bin/hadoop b/core/lib/hadoop-0.20.0/bin/hadoop
new file mode 100755
index 0000000000..273549f138
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/hadoop
@@ -0,0 +1,289 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# The Hadoop command script
+#
+# Environment Variables
+#
+# JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
+#
+# HADOOP_CLASSPATH Extra Java CLASSPATH entries.
+#
+# HADOOP_HEAPSIZE The maximum amount of heap to use, in MB.
+# Default is 1000.
+#
+# HADOOP_OPTS Extra Java runtime options.
+#
+# HADOOP_NAMENODE_OPTS These options are added to HADOOP_OPTS
+# HADOOP_CLIENT_OPTS when the respective command is run.
+# HADOOP_{COMMAND}_OPTS etc HADOOP_JT_OPTS applies to JobTracker
+# for e.g. HADOOP_CLIENT_OPTS applies to
+# more than one command (fs, dfs, fsck,
+# dfsadmin etc)
+#
+# HADOOP_CONF_DIR Alternate conf dir. Default is ${HADOOP_HOME}/conf.
+#
+# HADOOP_ROOT_LOGGER The root appender. Default is INFO,console
+#
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+ echo "Usage: hadoop [--config confdir] COMMAND"
+ echo "where COMMAND is one of:"
+ echo " namenode -format format the DFS filesystem"
+ echo " secondarynamenode run the DFS secondary namenode"
+ echo " namenode run the DFS namenode"
+ echo " datanode run a DFS datanode"
+ echo " dfsadmin run a DFS admin client"
+ echo " mradmin run a Map-Reduce admin client"
+ echo " fsck run a DFS filesystem checking utility"
+ echo " fs run a generic filesystem user client"
+ echo " balancer run a cluster balancing utility"
+ echo " jobtracker run the MapReduce job Tracker node"
+ echo " pipes run a Pipes job"
+ echo " tasktracker run a MapReduce task Tracker node"
+ echo " job manipulate MapReduce jobs"
+ echo " queue get information regarding JobQueues"
+ echo " version print the version"
+ echo " jar run a jar file"
+ echo " distcp copy file or directories recursively"
+ echo " archive -archiveName NAME * create a hadoop archive"
+ echo " daemonlog get/set the log level for each daemon"
+ echo " or"
+ echo " CLASSNAME run the class named CLASSNAME"
+ echo "Most commands print help when invoked w/o parameters."
+ exit 1
+fi
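+
+# illustrative invocations, per the usage text above:
+#   bin/hadoop fs -ls /     # generic filesystem client
+#   bin/hadoop version      # print the version
+#   bin/hadoop jar job.jar  # run a jar file (job.jar is hypothetical)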
+
+# get arguments
+COMMAND=$1
+shift
+
+if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
+ . "${HADOOP_CONF_DIR}/hadoop-env.sh"
+fi
+
+# some Java parameters
+if [ "$JAVA_HOME" != "" ]; then
+ #echo "run java in $JAVA_HOME"
+ JAVA_HOME=$JAVA_HOME
+fi
+
+if [ "$JAVA_HOME" = "" ]; then
+ echo "Error: JAVA_HOME is not set."
+ exit 1
+fi
+
+JAVA=$JAVA_HOME/bin/java
+JAVA_HEAP_MAX=-Xmx1000m
+
+# check envvars which might override default args
+if [ "$HADOOP_HEAPSIZE" != "" ]; then
+ #echo "run with heapsize $HADOOP_HEAPSIZE"
+ JAVA_HEAP_MAX="-Xmx""$HADOOP_HEAPSIZE""m"
+ #echo $JAVA_HEAP_MAX
+fi
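+# (e.g. HADOOP_HEAPSIZE=2000 yields JAVA_HEAP_MAX=-Xmx2000m below)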
+
+# CLASSPATH initially contains $HADOOP_CONF_DIR
+CLASSPATH="${HADOOP_CONF_DIR}"
+CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
+
+# for developers, add Hadoop classes to CLASSPATH
+if [ -d "$HADOOP_HOME/build/classes" ]; then
+ CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/classes
+fi
+if [ -d "$HADOOP_HOME/build/webapps" ]; then
+ CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build
+fi
+if [ -d "$HADOOP_HOME/build/test/classes" ]; then
+ CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/test/classes
+fi
+if [ -d "$HADOOP_HOME/build/tools" ]; then
+ CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/tools
+fi
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# for releases, add core hadoop jar & webapps to CLASSPATH
+if [ -d "$HADOOP_HOME/webapps" ]; then
+ CLASSPATH=${CLASSPATH}:$HADOOP_HOME
+fi
+for f in $HADOOP_HOME/hadoop-*-core.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+# add libs to CLASSPATH
+for f in $HADOOP_HOME/lib/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+if [ -d "$HADOOP_HOME/build/ivy/lib/Hadoop/common" ]; then
+for f in $HADOOP_HOME/build/ivy/lib/Hadoop/common/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+fi
+
+for f in $HADOOP_HOME/lib/jsp-2.1/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+for f in $HADOOP_HOME/hadoop-*-tools.jar; do
+ TOOL_PATH=${TOOL_PATH}:$f;
+done
+for f in $HADOOP_HOME/build/hadoop-*-tools.jar; do
+ TOOL_PATH=${TOOL_PATH}:$f;
+done
+
+# add user-specified CLASSPATH last
+if [ "$HADOOP_CLASSPATH" != "" ]; then
+ CLASSPATH=${CLASSPATH}:${HADOOP_CLASSPATH}
+fi
+
+# default log directory & file
+if [ "$HADOOP_LOG_DIR" = "" ]; then
+ HADOOP_LOG_DIR="$HADOOP_HOME/logs"
+fi
+if [ "$HADOOP_LOGFILE" = "" ]; then
+ HADOOP_LOGFILE='hadoop.log'
+fi
+
+# default policy file for service-level authorization
+if [ "$HADOOP_POLICYFILE" = "" ]; then
+ HADOOP_POLICYFILE="hadoop-policy.xml"
+fi
+
+# restore ordinary behaviour
+unset IFS
+
+# figure out which class to run
+if [ "$COMMAND" = "namenode" ] ; then
+ CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS"
+elif [ "$COMMAND" = "secondarynamenode" ] ; then
+ CLASS='org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode'
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_SECONDARYNAMENODE_OPTS"
+elif [ "$COMMAND" = "datanode" ] ; then
+ CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode'
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS"
+elif [ "$COMMAND" = "fs" ] ; then
+ CLASS=org.apache.hadoop.fs.FsShell
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "dfs" ] ; then
+ CLASS=org.apache.hadoop.fs.FsShell
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "dfsadmin" ] ; then
+ CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "mradmin" ] ; then
+ CLASS=org.apache.hadoop.mapred.tools.MRAdmin
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "fsck" ] ; then
+ CLASS=org.apache.hadoop.hdfs.tools.DFSck
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "balancer" ] ; then
+ CLASS=org.apache.hadoop.hdfs.server.balancer.Balancer
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS"
+elif [ "$COMMAND" = "jobtracker" ] ; then
+ CLASS=org.apache.hadoop.mapred.JobTracker
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
+elif [ "$COMMAND" = "tasktracker" ] ; then
+ CLASS=org.apache.hadoop.mapred.TaskTracker
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
+elif [ "$COMMAND" = "job" ] ; then
+ CLASS=org.apache.hadoop.mapred.JobClient
+elif [ "$COMMAND" = "queue" ] ; then
+ CLASS=org.apache.hadoop.mapred.JobQueueClient
+elif [ "$COMMAND" = "pipes" ] ; then
+ CLASS=org.apache.hadoop.mapred.pipes.Submitter
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "version" ] ; then
+ CLASS=org.apache.hadoop.util.VersionInfo
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "jar" ] ; then
+ CLASS=org.apache.hadoop.util.RunJar
+elif [ "$COMMAND" = "distcp" ] ; then
+ CLASS=org.apache.hadoop.tools.DistCp
+ CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "daemonlog" ] ; then
+ CLASS=org.apache.hadoop.log.LogLevel
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "archive" ] ; then
+ CLASS=org.apache.hadoop.tools.HadoopArchives
+ CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "sampler" ] ; then
+ CLASS=org.apache.hadoop.mapred.lib.InputSampler
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+else
+ CLASS=$COMMAND
+fi
+
+# cygwin path translation
+if $cygwin; then
+ CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+ HADOOP_HOME=`cygpath -w "$HADOOP_HOME"`
+ HADOOP_LOG_DIR=`cygpath -w "$HADOOP_LOG_DIR"`
+ TOOL_PATH=`cygpath -p -w "$TOOL_PATH"`
+fi
+# setup 'java.library.path' for native-hadoop code if necessary
+JAVA_LIBRARY_PATH=''
+if [ -d "${HADOOP_HOME}/build/native" -o -d "${HADOOP_HOME}/lib/native" ]; then
+ JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} -Xmx32m org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"`
+
+ if [ -d "$HADOOP_HOME/build/native" ]; then
+ JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib
+ fi
+
+ if [ -d "${HADOOP_HOME}/lib/native" ]; then
+ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
+ else
+ JAVA_LIBRARY_PATH=${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
+ fi
+ fi
+fi
+
+# cygwin path translation
+if $cygwin; then
+ JAVA_LIBRARY_PATH=`cygpath -p "$JAVA_LIBRARY_PATH"`
+fi
+
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.dir=$HADOOP_LOG_DIR"
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.file=$HADOOP_LOGFILE"
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.home.dir=$HADOOP_HOME"
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.id.str=$HADOOP_IDENT_STRING"
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.root.logger=${HADOOP_ROOT_LOGGER:-INFO,console}"
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+fi
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.policy.file=$HADOOP_POLICYFILE"
+
+# run it
+exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
diff --git a/core/lib/hadoop-0.20.0/bin/hadoop-config.sh b/core/lib/hadoop-0.20.0/bin/hadoop-config.sh
new file mode 100755
index 0000000000..1f9d52da79
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/hadoop-config.sh
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# included in all the hadoop scripts with source command
+# should not be executable directly
+# also should not be passed any arguments, since we need original $*
+
+# resolve links - $0 may be a softlink
+
+this="$0"
+while [ -h "$this" ]; do
+ ls=`ls -ld "$this"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ this="$link"
+ else
+ this=`dirname "$this"`/"$link"
+ fi
+done
+
+# convert relative path to absolute path
+bin=`dirname "$this"`
+script=`basename "$this"`
+bin=`cd "$bin"; pwd`
+this="$bin/$script"
+
+# the root of the Hadoop installation
+export HADOOP_HOME=`dirname "$this"`/..
+
+#check to see if the conf dir is given as an optional argument
+if [ $# -gt 1 ]
+then
+ if [ "--config" = "$1" ]
+ then
+ shift
+ confdir=$1
+ shift
+ HADOOP_CONF_DIR=$confdir
+ fi
+fi
+
+# Allow alternate conf dir location.
+HADOOP_CONF_DIR="${HADOOP_CONF_DIR:-$HADOOP_HOME/conf}"
+
+# check whether the slaves file or the masters file is specified on
+# the command line
+if [ $# -gt 1 ]
+then
+ if [ "--hosts" = "$1" ]
+ then
+ shift
+ slavesfile=$1
+ shift
+ export HADOOP_SLAVES="${HADOOP_CONF_DIR}/$slavesfile"
+ fi
+fi
diff --git a/core/lib/hadoop-0.20.0/bin/hadoop-daemon.sh b/core/lib/hadoop-0.20.0/bin/hadoop-daemon.sh
new file mode 100755
index 0000000000..e10390a9e2
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/hadoop-daemon.sh
@@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Runs a Hadoop command as a daemon.
+#
+# Environment Variables
+#
+# HADOOP_CONF_DIR Alternate conf dir. Default is ${HADOOP_HOME}/conf.
+# HADOOP_LOG_DIR Where log files are stored. PWD by default.
+# HADOOP_MASTER host:path where hadoop code should be rsync'd from
+# HADOOP_PID_DIR The pid files are stored. /tmp by default.
+# HADOOP_IDENT_STRING A string representing this instance of hadoop. $USER by default
+# HADOOP_NICENESS The scheduling priority for daemons. Defaults to 0.
+##
+
+usage="Usage: hadoop-daemon.sh [--config ] [--hosts hostlistfile] (start|stop) "
+
+# if no args specified, show usage
+if [ $# -le 1 ]; then
+ echo $usage
+ exit 1
+fi
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+# get arguments
+startStop=$1
+shift
+command=$1
+shift
+
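+# hadoop_rotate_log <log> [num]: rotate <log> through numbered backups,
+# moving <log> to <log>.1, <log>.1 to <log>.2, and so on, keeping at
+# most <num> old logs (default 5)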
+hadoop_rotate_log ()
+{
+ log=$1;
+ num=5;
+ if [ -n "$2" ]; then
+ num=$2
+ fi
+ if [ -f "$log" ]; then # rotate logs
+ while [ $num -gt 1 ]; do
+ prev=`expr $num - 1`
+ [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num"
+ num=$prev
+ done
+ mv "$log" "$log.$num";
+ fi
+}
+
+if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
+ . "${HADOOP_CONF_DIR}/hadoop-env.sh"
+fi
+
+# get log directory
+if [ "$HADOOP_LOG_DIR" = "" ]; then
+ export HADOOP_LOG_DIR="$HADOOP_HOME/logs"
+fi
+mkdir -p "$HADOOP_LOG_DIR"
+
+if [ "$HADOOP_PID_DIR" = "" ]; then
+ HADOOP_PID_DIR=/tmp
+fi
+
+if [ "$HADOOP_IDENT_STRING" = "" ]; then
+ export HADOOP_IDENT_STRING="$USER"
+fi
+
+# some variables
+export HADOOP_LOGFILE=hadoop-$HADOOP_IDENT_STRING-$command-$HOSTNAME.log
+export HADOOP_ROOT_LOGGER="INFO,DRFA"
+log=$HADOOP_LOG_DIR/hadoop-$HADOOP_IDENT_STRING-$command-$HOSTNAME.out
+pid=$HADOOP_PID_DIR/hadoop-$HADOOP_IDENT_STRING-$command.pid
+
+# Set default scheduling priority
+if [ "$HADOOP_NICENESS" = "" ]; then
+ export HADOOP_NICENESS=0
+fi
+
+case $startStop in
+
+ (start)
+
+ mkdir -p "$HADOOP_PID_DIR"
+
+ if [ -f $pid ]; then
+ if kill -0 `cat $pid` > /dev/null 2>&1; then
+ echo $command running as process `cat $pid`. Stop it first.
+ exit 1
+ fi
+ fi
+
+ if [ "$HADOOP_MASTER" != "" ]; then
+ echo rsync from $HADOOP_MASTER
+ rsync -a -e ssh --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' $HADOOP_MASTER/ "$HADOOP_HOME"
+ fi
+
+ hadoop_rotate_log $log
+ echo starting $command, logging to $log
+ cd "$HADOOP_HOME"
+ nohup nice -n $HADOOP_NICENESS "$HADOOP_HOME"/bin/hadoop --config $HADOOP_CONF_DIR $command "$@" > "$log" 2>&1 < /dev/null &
+ echo $! > $pid
+ sleep 1; head "$log"
+ ;;
+
+ (stop)
+
+ if [ -f $pid ]; then
+ if kill -0 `cat $pid` > /dev/null 2>&1; then
+ echo stopping $command
+ kill `cat $pid`
+ else
+ echo no $command to stop
+ fi
+ else
+ echo no $command to stop
+ fi
+ ;;
+
+ (*)
+ echo $usage
+ exit 1
+ ;;
+
+esac
+
+
diff --git a/core/lib/hadoop-0.20.0/bin/hadoop-daemons.sh b/core/lib/hadoop-0.20.0/bin/hadoop-daemons.sh
new file mode 100755
index 0000000000..894d8ab11a
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/hadoop-daemons.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Run a Hadoop command on all slave hosts.
+
+usage="Usage: hadoop-daemons.sh [--config confdir] [--hosts hostlistfile] [start|stop] command args..."
+
+# if no args specified, show usage
+if [ $# -le 1 ]; then
+ echo $usage
+ exit 1
+fi
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. $bin/hadoop-config.sh
+
+exec "$bin/slaves.sh" --config $HADOOP_CONF_DIR cd "$HADOOP_HOME" \; "$bin/hadoop-daemon.sh" --config $HADOOP_CONF_DIR "$@"
diff --git a/core/lib/hadoop-0.20.0/bin/rcc b/core/lib/hadoop-0.20.0/bin/rcc
new file mode 100755
index 0000000000..a39745be6f
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/rcc
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# The Hadoop record compiler
+#
+# Environment Variables
+#
+# JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
+#
+# HADOOP_OPTS Extra Java runtime options.
+#
+# HADOOP_CONF_DIR Alternate conf dir. Default is ${HADOOP_HOME}/conf.
+#
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
+ . "${HADOOP_CONF_DIR}/hadoop-env.sh"
+fi
+
+# some Java parameters
+if [ "$JAVA_HOME" != "" ]; then
+ #echo "run java in $JAVA_HOME"
+ JAVA_HOME=$JAVA_HOME
+fi
+
+if [ "$JAVA_HOME" = "" ]; then
+ echo "Error: JAVA_HOME is not set."
+ exit 1
+fi
+
+JAVA=$JAVA_HOME/bin/java
+JAVA_HEAP_MAX=-Xmx1000m
+
+# CLASSPATH initially contains $HADOOP_CONF_DIR
+CLASSPATH="${HADOOP_CONF_DIR}"
+CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
+
+# for developers, add Hadoop classes to CLASSPATH
+if [ -d "$HADOOP_HOME/build/classes" ]; then
+ CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/classes
+fi
+if [ -d "$HADOOP_HOME/build/webapps" ]; then
+ CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build
+fi
+if [ -d "$HADOOP_HOME/build/test/classes" ]; then
+ CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/test/classes
+fi
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# for releases, add core hadoop jar & webapps to CLASSPATH
+if [ -d "$HADOOP_HOME/webapps" ]; then
+ CLASSPATH=${CLASSPATH}:$HADOOP_HOME
+fi
+for f in $HADOOP_HOME/hadoop-*-core.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+# add libs to CLASSPATH
+for f in $HADOOP_HOME/lib/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+for f in $HADOOP_HOME/lib/jetty-ext/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+# restore ordinary behaviour
+unset IFS
+
+CLASS='org.apache.hadoop.record.compiler.generated.Rcc'
+
+# cygwin path translation
+if expr `uname` : 'CYGWIN*' > /dev/null; then
+ CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+fi
+
+# run it
+exec "$JAVA" $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
diff --git a/core/lib/hadoop-0.20.0/bin/slaves.sh b/core/lib/hadoop-0.20.0/bin/slaves.sh
new file mode 100755
index 0000000000..fc9f720be7
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/slaves.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Run a shell command on all slave hosts.
+#
+# Environment Variables
+#
+# HADOOP_SLAVES File naming remote hosts.
+# Default is ${HADOOP_CONF_DIR}/slaves.
+# HADOOP_CONF_DIR Alternate conf dir. Default is ${HADOOP_HOME}/conf.
+# HADOOP_SLAVE_SLEEP Seconds to sleep between spawning remote commands.
+# HADOOP_SSH_OPTS Options passed to ssh when running remote commands.
+##
+
+usage="Usage: slaves.sh [--config confdir] command..."
+
+# if no args specified, show usage
+if [ $# -le 0 ]; then
+ echo $usage
+ exit 1
+fi
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+# If the slaves file is specified in the command line,
+# then it takes precedence over the definition in
+# hadoop-env.sh. Save it here.
+HOSTLIST=$HADOOP_SLAVES
+
+if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
+ . "${HADOOP_CONF_DIR}/hadoop-env.sh"
+fi
+
+if [ "$HOSTLIST" = "" ]; then
+ if [ "$HADOOP_SLAVES" = "" ]; then
+ export HOSTLIST="${HADOOP_CONF_DIR}/slaves"
+ else
+ export HOSTLIST="${HADOOP_SLAVES}"
+ fi
+fi
+
+for slave in `cat "$HOSTLIST"|sed "s/#.*$//;/^$/d"`; do
+ ssh $HADOOP_SSH_OPTS $slave $"${@// /\\ }" \
+ 2>&1 | sed "s/^/$slave: /" &
+ if [ "$HADOOP_SLAVE_SLEEP" != "" ]; then
+ sleep $HADOOP_SLAVE_SLEEP
+ fi
+done
+
+wait
diff --git a/core/lib/hadoop-0.20.0/bin/start-all.sh b/core/lib/hadoop-0.20.0/bin/start-all.sh
new file mode 100755
index 0000000000..b1eefc8fbe
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/start-all.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Start all hadoop daemons. Run this on master node.
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+# start dfs daemons
+"$bin"/start-dfs.sh --config $HADOOP_CONF_DIR
+
+# start mapred daemons
+"$bin"/start-mapred.sh --config $HADOOP_CONF_DIR
diff --git a/core/lib/hadoop-0.20.0/bin/start-balancer.sh b/core/lib/hadoop-0.20.0/bin/start-balancer.sh
new file mode 100755
index 0000000000..e8c93f90ca
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/start-balancer.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+# Start balancer daemon.
+
+"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start balancer $@
diff --git a/core/lib/hadoop-0.20.0/bin/start-dfs.sh b/core/lib/hadoop-0.20.0/bin/start-dfs.sh
new file mode 100755
index 0000000000..bda2035a2b
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/start-dfs.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Start hadoop dfs daemons.
+# Optionally upgrade or rollback dfs state.
+# Run this on master node.
+
+usage="Usage: start-dfs.sh [-upgrade|-rollback]"
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+# get arguments
+if [ $# -ge 1 ]; then
+ nameStartOpt=$1
+ shift
+ case $nameStartOpt in
+ (-upgrade)
+ ;;
+ (-rollback)
+ dataStartOpt=$nameStartOpt
+ ;;
+ (*)
+ echo $usage
+ exit 1
+ ;;
+ esac
+fi
+
+# start dfs daemons
+# note: the namenode is started first; datanodes will log connection
+# errors until the namenode is up
+"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start namenode $nameStartOpt
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR start datanode $dataStartOpt
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR --hosts masters start secondarynamenode
diff --git a/core/lib/hadoop-0.20.0/bin/start-mapred.sh b/core/lib/hadoop-0.20.0/bin/start-mapred.sh
new file mode 100755
index 0000000000..b64c8f51d8
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/start-mapred.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Start hadoop map reduce daemons. Run this on master node.
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+# start mapred daemons
+# start jobtracker first to minimize connection errors at startup
+"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start jobtracker
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR start tasktracker
diff --git a/core/lib/hadoop-0.20.0/bin/stop-all.sh b/core/lib/hadoop-0.20.0/bin/stop-all.sh
new file mode 100755
index 0000000000..033f2fe8d8
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/stop-all.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Stop all hadoop daemons. Run this on master node.
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+"$bin"/stop-mapred.sh --config $HADOOP_CONF_DIR
+"$bin"/stop-dfs.sh --config $HADOOP_CONF_DIR
diff --git a/core/lib/hadoop-0.20.0/bin/stop-balancer.sh b/core/lib/hadoop-0.20.0/bin/stop-balancer.sh
new file mode 100755
index 0000000000..483a9c2549
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/stop-balancer.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+# Stop balancer daemon.
+# Run this on the machine where the balancer is running
+
+"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR stop balancer
diff --git a/core/lib/hadoop-0.20.0/bin/stop-dfs.sh b/core/lib/hadoop-0.20.0/bin/stop-dfs.sh
new file mode 100755
index 0000000000..14fe61d17e
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/stop-dfs.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Stop hadoop DFS daemons. Run this on master node.
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR stop namenode
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR stop datanode
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR --hosts masters stop secondarynamenode
+
diff --git a/core/lib/hadoop-0.20.0/bin/stop-mapred.sh b/core/lib/hadoop-0.20.0/bin/stop-mapred.sh
new file mode 100755
index 0000000000..aa51c1f87b
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/bin/stop-mapred.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Stop hadoop map reduce daemons. Run this on master node.
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hadoop-config.sh
+
+"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR stop jobtracker
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR stop tasktracker
+
diff --git a/core/lib/hadoop-0.20.0/build.xml b/core/lib/hadoop-0.20.0/build.xml
new file mode 100644
index 0000000000..68932d7d9f
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/build.xml
@@ -0,0 +1,1796 @@
+<!-- build.xml: Apache Ant build script for Hadoop (1,796 lines in the
+     original file). The XML element markup was lost in extraction;
+     only the message strings embedded in the build targets survive:
+       "Tests failed!"
+       "Clover not found. Code coverage reports disabled."
+       "Clover not found. Please specify -Dclover.home=<base of clover
+        installation> on the command line."
+       "You need Apache Ivy 2.0 or later from http://ant.apache.org/
+        It could not be loaded from ${ivy_repo_url}"
+       "Reports generated: ${build.ivy.report.dir}"
+       "Not found: ${hadoop.jar}. Please run the target 'jar' in the
+        main build file" -->
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/Pipes.hh b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/Pipes.hh
new file mode 100644
index 0000000000..9a785d966a
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/Pipes.hh
@@ -0,0 +1,258 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef HADOOP_PIPES_HH
+#define HADOOP_PIPES_HH
+
+#ifdef SWIG
+%module (directors="1") HadoopPipes
+%include "std_string.i"
+%feature("director") Mapper;
+%feature("director") Reducer;
+%feature("director") Partitioner;
+%feature("director") RecordReader;
+%feature("director") RecordWriter;
+%feature("director") Factory;
+#else
+#include <string>
+#endif
+
+namespace HadoopPipes {
+
+/**
+ * This header defines the interface between application code and the
+ * foreign-code bridge to Hadoop Map/Reduce.
+ */
+
+/**
+ * A JobConf defines the properties for a job.
+ */
+class JobConf {
+public:
+ virtual bool hasKey(const std::string& key) const = 0;
+ virtual const std::string& get(const std::string& key) const = 0;
+ virtual int getInt(const std::string& key) const = 0;
+ virtual float getFloat(const std::string& key) const = 0;
+  virtual bool getBoolean(const std::string& key) const = 0;
+ virtual ~JobConf() {}
+};
+
+/**
+ * Task context provides the information about the task and job.
+ */
+class TaskContext {
+public:
+ /**
+ * Counter to keep track of a property and its value.
+ */
+ class Counter {
+ private:
+ int id;
+ public:
+ Counter(int counterId) : id(counterId) {}
+ Counter(const Counter& counter) : id(counter.id) {}
+
+ int getId() const { return id; }
+ };
+
+ /**
+ * Get the JobConf for the current task.
+ */
+ virtual const JobConf* getJobConf() = 0;
+
+ /**
+ * Get the current key.
+ * @return the current key
+ */
+ virtual const std::string& getInputKey() = 0;
+
+ /**
+ * Get the current value.
+ * @return the current value
+ */
+ virtual const std::string& getInputValue() = 0;
+
+ /**
+ * Generate an output record
+ */
+ virtual void emit(const std::string& key, const std::string& value) = 0;
+
+ /**
+ * Mark your task as having made progress without changing the status
+ * message.
+ */
+ virtual void progress() = 0;
+
+ /**
+ * Set the status message and call progress.
+ */
+ virtual void setStatus(const std::string& status) = 0;
+
+ /**
+ * Register a counter with the given group and name.
+ */
+ virtual Counter*
+ getCounter(const std::string& group, const std::string& name) = 0;
+
+ /**
+ * Increment the value of the counter with the given amount.
+ */
+ virtual void incrementCounter(const Counter* counter, uint64_t amount) = 0;
+
+ virtual ~TaskContext() {}
+};
+
+class MapContext: public TaskContext {
+public:
+
+ /**
+ * Access the InputSplit of the mapper.
+ */
+ virtual const std::string& getInputSplit() = 0;
+
+ /**
+ * Get the name of the key class of the input to this task.
+ */
+ virtual const std::string& getInputKeyClass() = 0;
+
+ /**
+ * Get the name of the value class of the input to this task.
+ */
+ virtual const std::string& getInputValueClass() = 0;
+
+};
+
+class ReduceContext: public TaskContext {
+public:
+ /**
+ * Advance to the next value.
+ */
+ virtual bool nextValue() = 0;
+};
+
+class Closable {
+public:
+ virtual void close() {}
+ virtual ~Closable() {}
+};
+
+/**
+ * The application's mapper class to do map.
+ */
+class Mapper: public Closable {
+public:
+ virtual void map(MapContext& context) = 0;
+};
+
+/**
+ * The application's reducer class to do reduce.
+ */
+class Reducer: public Closable {
+public:
+ virtual void reduce(ReduceContext& context) = 0;
+};
+
+/**
+ * User code to decide where each key should be sent.
+ */
+class Partitioner {
+public:
+ virtual int partition(const std::string& key, int numOfReduces) = 0;
+ virtual ~Partitioner() {}
+};
+
+/**
+ * For applications that want to read the input directly for the map function
+ * they can define RecordReaders in C++.
+ */
+class RecordReader: public Closable {
+public:
+ virtual bool next(std::string& key, std::string& value) = 0;
+
+ /**
+ * The progress of the record reader through the split as a value between
+ * 0.0 and 1.0.
+ */
+ virtual float getProgress() = 0;
+};
+
+/**
+ * An object to write key/value pairs as they are emitted from the reduce.
+ */
+class RecordWriter: public Closable {
+public:
+ virtual void emit(const std::string& key,
+ const std::string& value) = 0;
+};
+
+/**
+ * A factory to create the necessary application objects.
+ */
+class Factory {
+public:
+ virtual Mapper* createMapper(MapContext& context) const = 0;
+ virtual Reducer* createReducer(ReduceContext& context) const = 0;
+
+ /**
+ * Create a combiner, if this application has one.
+ * @return the new combiner or NULL, if one is not needed
+ */
+ virtual Reducer* createCombiner(MapContext& context) const {
+ return NULL;
+ }
+
+ /**
+ * Create an application partitioner object.
+ * @return the new partitioner or NULL, if the default partitioner should be
+ * used.
+ */
+ virtual Partitioner* createPartitioner(MapContext& context) const {
+ return NULL;
+ }
+
+ /**
+ * Create an application record reader.
+ * @return the new RecordReader or NULL, if the Java RecordReader should be
+ * used.
+ */
+ virtual RecordReader* createRecordReader(MapContext& context) const {
+ return NULL;
+ }
+
+ /**
+ * Create an application record writer.
+ * @return the new RecordWriter or NULL, if the Java RecordWriter should be
+ * used.
+ */
+ virtual RecordWriter* createRecordWriter(ReduceContext& context) const {
+ return NULL;
+ }
+
+ virtual ~Factory() {}
+};
+
+/**
+ * Run the assigned task in the framework.
+ * The user's main function should set the various functions using the
+ * set* functions above and then call this.
+ * @return true, if the task succeeded.
+ */
+bool runTask(const Factory& factory);
+
+}
+
+#endif
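
The interfaces above are the complete application-facing API. As a rough sketch of how they fit together (modeled on the word-count example that ships with Hadoop Pipes, with illustrative class names, and assuming the program links against the libhadooppipes and libhadooputils archives in this tree):

    #include <stdint.h>
    #include <string>
    #include <vector>
    #include "hadoop/Pipes.hh"
    #include "hadoop/TemplateFactory.hh"
    #include "hadoop/StringUtils.hh"

    // Emits (word, "1") for every whitespace-separated token in the input value.
    class WordCountMapper : public HadoopPipes::Mapper {
    public:
      WordCountMapper(HadoopPipes::TaskContext& context) {}
      void map(HadoopPipes::MapContext& context) {
        std::vector<std::string> words =
            HadoopUtils::splitString(context.getInputValue(), " ");
        for (size_t i = 0; i < words.size(); ++i) {
          context.emit(words[i], "1");
        }
      }
    };

    // Sums the counts gathered for each key and emits the total.
    class WordCountReducer : public HadoopPipes::Reducer {
    public:
      WordCountReducer(HadoopPipes::TaskContext& context) {}
      void reduce(HadoopPipes::ReduceContext& context) {
        int32_t sum = 0;
        while (context.nextValue()) {
          sum += HadoopUtils::toInt(context.getInputValue());
        }
        context.emit(context.getInputKey(), HadoopUtils::toString(sum));
      }
    };

    int main(int argc, char** argv) {
      // runTask drives the map or reduce loop and returns true on success.
      bool ok = HadoopPipes::runTask(
          HadoopPipes::TemplateFactory<WordCountMapper, WordCountReducer>());
      return ok ? 0 : 1;
    }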
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/SerialUtils.hh b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/SerialUtils.hh
new file mode 100644
index 0000000000..16cbab65b2
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/SerialUtils.hh
@@ -0,0 +1,169 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef HADOOP_SERIAL_UTILS_HH
+#define HADOOP_SERIAL_UTILS_HH
+
+#include <string>
+
+namespace HadoopUtils {
+
+ /**
+ * A simple exception class that records a message for the user.
+ */
+ class Error {
+ private:
+ std::string error;
+ public:
+
+ /**
+ * Create an error object with the given message.
+ */
+ Error(const std::string& msg);
+
+ /**
+     * Construct an error object with the given message that was created at
+     * the given file, line, and function.
+ */
+ Error(const std::string& msg,
+ const std::string& file, int line, const std::string& function);
+
+ /**
+ * Get the error message.
+ */
+ const std::string& getMessage() const;
+ };
+
+ /**
+ * Check to make sure that the condition is true, and throw an exception
+ * if it is not. The exception will contain the message and a description
+ * of the source location.
+ */
+ #define HADOOP_ASSERT(CONDITION, MESSAGE) \
+ { \
+ if (!(CONDITION)) { \
+ throw HadoopUtils::Error((MESSAGE), __FILE__, __LINE__, \
+ __PRETTY_FUNCTION__); \
+ } \
+ }
+
+ /**
+ * An interface for an input stream.
+ */
+ class InStream {
+ public:
+ /**
+     * Read len bytes from the stream into the buffer.
+     * @param buf the buffer to read into
+     * @param len the number of bytes to read
+ * @throws Error if there are problems reading
+ */
+ virtual void read(void *buf, size_t len) = 0;
+ virtual ~InStream() {}
+ };
+
+ /**
+ * An interface for an output stream.
+ */
+ class OutStream {
+ public:
+ /**
+ * Write the given buffer to the stream.
+ * @param buf the data to write
+ * @param len the number of bytes to write
+ * @throws Error if there are problems writing
+ */
+ virtual void write(const void *buf, size_t len) = 0;
+ /**
+ * Flush the data to the underlying store.
+ */
+ virtual void flush() = 0;
+ virtual ~OutStream() {}
+ };
+
+ /**
+ * A class to read a file as a stream.
+ */
+ class FileInStream : public InStream {
+ public:
+ FileInStream();
+ bool open(const std::string& name);
+ bool open(FILE* file);
+ void read(void *buf, size_t buflen);
+ bool skip(size_t nbytes);
+ bool close();
+ virtual ~FileInStream();
+ private:
+ /**
+     * The file to read from.
+ */
+ FILE *mFile;
+ /**
+     * Is this class responsible for closing the FILE*?
+ */
+ bool isOwned;
+ };
+
+ /**
+ * A class to write a stream to a file.
+ */
+ class FileOutStream: public OutStream {
+ public:
+
+ /**
+ * Create a stream that isn't bound to anything.
+ */
+ FileOutStream();
+
+ /**
+ * Create the given file, potentially overwriting an existing file.
+ */
+ bool open(const std::string& name, bool overwrite);
+ bool open(FILE* file);
+ void write(const void* buf, size_t len);
+ bool advance(size_t nbytes);
+ void flush();
+ bool close();
+ virtual ~FileOutStream();
+ private:
+ FILE *mFile;
+ bool isOwned;
+ };
+
+ /**
+ * A stream that reads from a string.
+ */
+ class StringInStream: public InStream {
+ public:
+ StringInStream(const std::string& str);
+ virtual void read(void *buf, size_t buflen);
+ private:
+ const std::string& buffer;
+ std::string::const_iterator itr;
+ };
+
+ void serializeInt(int32_t t, OutStream& stream);
+ int32_t deserializeInt(InStream& stream);
+ void serializeLong(int64_t t, OutStream& stream);
+ int64_t deserializeLong(InStream& stream);
+ void serializeFloat(float t, OutStream& stream);
+ float deserializeFloat(InStream& stream);
+ void serializeString(const std::string& t, OutStream& stream);
+ void deserializeString(std::string& t, InStream& stream);
+}
+
+#endif
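
As a quick illustration of how these declarations compose, here is a hypothetical round trip that writes an int and a string to a file and reads them back; the file name and values are made up, and the wire format is whatever libhadooputils implements:

    #include <stdint.h>
    #include <string>
    #include "hadoop/SerialUtils.hh"

    int main() {
      // Write a (count, name) pair; the second open() argument requests overwrite.
      HadoopUtils::FileOutStream out;
      out.open("pair.bin", true);
      HadoopUtils::serializeInt(42, out);
      HadoopUtils::serializeString("alice", out);
      out.close();

      // Read the pair back in the same order it was written.
      HadoopUtils::FileInStream in;
      in.open("pair.bin");
      int32_t count = HadoopUtils::deserializeInt(in);
      std::string name;
      HadoopUtils::deserializeString(name, in);
      in.close();

      // HADOOP_ASSERT throws HadoopUtils::Error with source-location details.
      HADOOP_ASSERT(count == 42 && name == "alice", "round trip failed");
      return 0;
    }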
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/StringUtils.hh b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/StringUtils.hh
new file mode 100644
index 0000000000..4720172725
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/StringUtils.hh
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef HADOOP_STRING_UTILS_HH
+#define HADOOP_STRING_UTILS_HH
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+namespace HadoopUtils {
+
+ /**
+ * Convert an integer to a string.
+ */
+ std::string toString(int32_t x);
+
+ /**
+ * Convert a string to an integer.
+ * @throws Error if the string is not a valid integer
+ */
+ int32_t toInt(const std::string& val);
+
+ /**
+ * Convert the string to a float.
+ * @throws Error if the string is not a valid float
+ */
+ float toFloat(const std::string& val);
+
+ /**
+ * Convert the string to a boolean.
+ * @throws Error if the string is not a valid boolean value
+ */
+ bool toBool(const std::string& val);
+
+ /**
+ * Get the current time in the number of milliseconds since 1970.
+ */
+ uint64_t getCurrentMillis();
+
+ /**
+   * Split a string into "words". Multiple delimiters are treated as a single
+ * word break, so no zero-length words are returned.
+ * @param str the string to split
+ * @param separator a list of characters that divide words
+ */
+  std::vector<std::string> splitString(const std::string& str,
+ const char* separator);
+
+ /**
+   * Quote a string to avoid "\", non-printable characters, and the
+   * delimiters.
+   * @param str the string to quote
+   * @param delimiters the set of characters to always quote
+   */
+  std::string quoteString(const std::string& str,
+                           const char* delimiters);
+
+ /**
+ * Unquote the given string to return the original string.
+ * @param str the string to unquote
+ */
+ std::string unquoteString(const std::string& str);
+
+}
+
+#endif
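
For example, a minimal sketch of splitString feeding the converters (the input literal is made up):

    #include <stdint.h>
    #include <string>
    #include <vector>
    #include "hadoop/StringUtils.hh"

    int main() {
      // Repeated separators count as one break, so no empty tokens appear.
      std::vector<std::string> parts =
          HadoopUtils::splitString("10  20 30", " ");
      int32_t sum = 0;
      for (size_t i = 0; i < parts.size(); ++i) {
        sum += HadoopUtils::toInt(parts[i]);  // throws Error on a bad token
      }
      return sum == 60 ? 0 : 1;               // exits 0: 10 + 20 + 30 == 60
    }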
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/TemplateFactory.hh b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/TemplateFactory.hh
new file mode 100644
index 0000000000..22e10ae56f
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/include/hadoop/TemplateFactory.hh
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef HADOOP_PIPES_TEMPLATE_FACTORY_HH
+#define HADOOP_PIPES_TEMPLATE_FACTORY_HH
+
+namespace HadoopPipes {
+
+  template <class mapper, class reducer>
+  class TemplateFactory2: public Factory {
+  public:
+    Mapper* createMapper(MapContext& context) const {
+      return new mapper(context);
+    }
+    Reducer* createReducer(ReduceContext& context) const {
+      return new reducer(context);
+    }
+  };
+
+  template <class mapper, class reducer, class partitioner>
+  class TemplateFactory3: public TemplateFactory2<mapper, reducer> {
+  public:
+    Partitioner* createPartitioner(MapContext& context) const {
+      return new partitioner(context);
+    }
+  };
+
+  template <class mapper, class reducer>
+  class TemplateFactory3<mapper, reducer, void>
+    : public TemplateFactory2<mapper, reducer> {
+  };
+
+  template <class mapper, class reducer, class partitioner, class combiner>
+  class TemplateFactory4
+    : public TemplateFactory3<mapper, reducer, partitioner>{
+  public:
+    Reducer* createCombiner(MapContext& context) const {
+      return new combiner(context);
+    }
+  };
+
+  template <class mapper, class reducer, class partitioner>
+  class TemplateFactory4<mapper, reducer, partitioner, void>
+    : public TemplateFactory3<mapper, reducer, partitioner>{
+  };
+
+  template <class mapper, class reducer, class partitioner,
+            class combiner, class recordReader>
+  class TemplateFactory5
+    : public TemplateFactory4<mapper, reducer, partitioner, combiner>{
+  public:
+    RecordReader* createRecordReader(MapContext& context) const {
+      return new recordReader(context);
+    }
+  };
+
+  template <class mapper, class reducer, class partitioner, class combiner>
+  class TemplateFactory5<mapper, reducer, partitioner, combiner, void>
+    : public TemplateFactory4<mapper, reducer, partitioner, combiner>{
+  };
+
+  template <class mapper, class reducer, class partitioner=void,
+            class combiner=void, class recordReader=void,
+            class recordWriter=void>
+  class TemplateFactory
+    : public TemplateFactory5<mapper, reducer, partitioner, combiner,
+                              recordReader>{
+  public:
+    RecordWriter* createRecordWriter(ReduceContext& context) const {
+      return new recordWriter(context);
+    }
+  };
+
+  template <class mapper, class reducer, class partitioner,
+            class combiner, class recordReader>
+  class TemplateFactory<mapper, reducer, partitioner, combiner, recordReader,
+                        void>
+    : public TemplateFactory5<mapper, reducer, partitioner, combiner,
+                              recordReader>{
+  };
+
+}
+
+#endif
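
The void specializations are the point of this header: every optional slot of TemplateFactory defaults to void, and a create* override is generated only when a real class is supplied, so the framework keeps its defaults for the rest. A hypothetical factory that adds a combiner (reusing the reducer class, as word count can) while keeping the default partitioner would be instantiated like this; WordCountMapper and WordCountReducer are the illustrative classes from the Pipes.hh sketch above:

    // Slots are mapper, reducer, partitioner, combiner, recordReader,
    // recordWriter. The void in the partitioner slot selects the
    // TemplateFactory3<mapper, reducer, void> specialization, so
    // createPartitioner is never overridden and the default partitioner
    // is used.
    HadoopPipes::TemplateFactory<WordCountMapper, WordCountReducer,
                                 void, WordCountReducer> factory;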
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/lib/libhadooppipes.a b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/lib/libhadooppipes.a
new file mode 100644
index 0000000000..be303140cb
Binary files /dev/null and b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/lib/libhadooppipes.a differ
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/lib/libhadooputils.a b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/lib/libhadooputils.a
new file mode 100644
index 0000000000..8a0aded98e
Binary files /dev/null and b/core/lib/hadoop-0.20.0/c++/Linux-amd64-64/lib/libhadooputils.a differ
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/Pipes.hh b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/Pipes.hh
new file mode 100644
index 0000000000..9a785d966a
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/Pipes.hh
@@ -0,0 +1,258 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef HADOOP_PIPES_HH
+#define HADOOP_PIPES_HH
+
+#ifdef SWIG
+%module (directors="1") HadoopPipes
+%include "std_string.i"
+%feature("director") Mapper;
+%feature("director") Reducer;
+%feature("director") Partitioner;
+%feature("director") RecordReader;
+%feature("director") RecordWriter;
+%feature("director") Factory;
+#else
+#include <string>
+#endif
+
+namespace HadoopPipes {
+
+/**
+ * This header defines the interface between application code and the
+ * foreign-code bridge to Hadoop Map/Reduce.
+ */
+
+/**
+ * A JobConf defines the properties for a job.
+ */
+class JobConf {
+public:
+ virtual bool hasKey(const std::string& key) const = 0;
+ virtual const std::string& get(const std::string& key) const = 0;
+ virtual int getInt(const std::string& key) const = 0;
+ virtual float getFloat(const std::string& key) const = 0;
+  virtual bool getBoolean(const std::string& key) const = 0;
+ virtual ~JobConf() {}
+};
+
+/**
+ * Task context provides the information about the task and job.
+ */
+class TaskContext {
+public:
+ /**
+ * Counter to keep track of a property and its value.
+ */
+ class Counter {
+ private:
+ int id;
+ public:
+ Counter(int counterId) : id(counterId) {}
+ Counter(const Counter& counter) : id(counter.id) {}
+
+ int getId() const { return id; }
+ };
+
+ /**
+ * Get the JobConf for the current task.
+ */
+ virtual const JobConf* getJobConf() = 0;
+
+ /**
+ * Get the current key.
+ * @return the current key
+ */
+ virtual const std::string& getInputKey() = 0;
+
+ /**
+ * Get the current value.
+ * @return the current value
+ */
+ virtual const std::string& getInputValue() = 0;
+
+ /**
+ * Generate an output record
+ */
+ virtual void emit(const std::string& key, const std::string& value) = 0;
+
+ /**
+ * Mark your task as having made progress without changing the status
+ * message.
+ */
+ virtual void progress() = 0;
+
+ /**
+ * Set the status message and call progress.
+ */
+ virtual void setStatus(const std::string& status) = 0;
+
+ /**
+ * Register a counter with the given group and name.
+ */
+ virtual Counter*
+ getCounter(const std::string& group, const std::string& name) = 0;
+
+ /**
+ * Increment the value of the counter with the given amount.
+ */
+ virtual void incrementCounter(const Counter* counter, uint64_t amount) = 0;
+
+ virtual ~TaskContext() {}
+};
+
+class MapContext: public TaskContext {
+public:
+
+ /**
+ * Access the InputSplit of the mapper.
+ */
+ virtual const std::string& getInputSplit() = 0;
+
+ /**
+ * Get the name of the key class of the input to this task.
+ */
+ virtual const std::string& getInputKeyClass() = 0;
+
+ /**
+ * Get the name of the value class of the input to this task.
+ */
+ virtual const std::string& getInputValueClass() = 0;
+
+};
+
+class ReduceContext: public TaskContext {
+public:
+ /**
+ * Advance to the next value.
+ */
+ virtual bool nextValue() = 0;
+};
+
+class Closable {
+public:
+ virtual void close() {}
+ virtual ~Closable() {}
+};
+
+/**
+ * The application's mapper class to do map.
+ */
+class Mapper: public Closable {
+public:
+ virtual void map(MapContext& context) = 0;
+};
+
+/**
+ * The application's reducer class to do reduce.
+ */
+class Reducer: public Closable {
+public:
+ virtual void reduce(ReduceContext& context) = 0;
+};
+
+/**
+ * User code to decide where each key should be sent.
+ */
+class Partitioner {
+public:
+ virtual int partition(const std::string& key, int numOfReduces) = 0;
+ virtual ~Partitioner() {}
+};
+
+/**
+ * For applications that want to read the input directly for the map function
+ * they can define RecordReaders in C++.
+ */
+class RecordReader: public Closable {
+public:
+ virtual bool next(std::string& key, std::string& value) = 0;
+
+ /**
+ * The progress of the record reader through the split as a value between
+ * 0.0 and 1.0.
+ */
+ virtual float getProgress() = 0;
+};
+
+/**
+ * An object to write key/value pairs as they are emitted from the reduce.
+ */
+class RecordWriter: public Closable {
+public:
+ virtual void emit(const std::string& key,
+ const std::string& value) = 0;
+};
+
+/**
+ * A factory to create the necessary application objects.
+ */
+class Factory {
+public:
+ virtual Mapper* createMapper(MapContext& context) const = 0;
+ virtual Reducer* createReducer(ReduceContext& context) const = 0;
+
+ /**
+ * Create a combiner, if this application has one.
+ * @return the new combiner or NULL, if one is not needed
+ */
+ virtual Reducer* createCombiner(MapContext& context) const {
+ return NULL;
+ }
+
+ /**
+ * Create an application partitioner object.
+ * @return the new partitioner or NULL, if the default partitioner should be
+ * used.
+ */
+ virtual Partitioner* createPartitioner(MapContext& context) const {
+ return NULL;
+ }
+
+ /**
+ * Create an application record reader.
+ * @return the new RecordReader or NULL, if the Java RecordReader should be
+ * used.
+ */
+ virtual RecordReader* createRecordReader(MapContext& context) const {
+ return NULL;
+ }
+
+ /**
+ * Create an application record writer.
+ * @return the new RecordWriter or NULL, if the Java RecordWriter should be
+ * used.
+ */
+ virtual RecordWriter* createRecordWriter(ReduceContext& context) const {
+ return NULL;
+ }
+
+ virtual ~Factory() {}
+};
+
+/**
+ * Run the assigned task in the framework.
+ * The user's main function should set the various functions using the
+ * set* functions above and then call this.
+ * @return true, if the task succeeded.
+ */
+bool runTask(const Factory& factory);
+
+}
+
+#endif
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/SerialUtils.hh b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/SerialUtils.hh
new file mode 100644
index 0000000000..16cbab65b2
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/SerialUtils.hh
@@ -0,0 +1,169 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef HADOOP_SERIAL_UTILS_HH
+#define HADOOP_SERIAL_UTILS_HH
+
+#include <string>
+
+namespace HadoopUtils {
+
+ /**
+ * A simple exception class that records a message for the user.
+ */
+ class Error {
+ private:
+ std::string error;
+ public:
+
+ /**
+ * Create an error object with the given message.
+ */
+ Error(const std::string& msg);
+
+ /**
+     * Construct an error object with the given message that was created at
+     * the given file, line, and function.
+ */
+ Error(const std::string& msg,
+ const std::string& file, int line, const std::string& function);
+
+ /**
+ * Get the error message.
+ */
+ const std::string& getMessage() const;
+ };
+
+ /**
+ * Check to make sure that the condition is true, and throw an exception
+ * if it is not. The exception will contain the message and a description
+ * of the source location.
+ */
+ #define HADOOP_ASSERT(CONDITION, MESSAGE) \
+ { \
+ if (!(CONDITION)) { \
+ throw HadoopUtils::Error((MESSAGE), __FILE__, __LINE__, \
+ __PRETTY_FUNCTION__); \
+ } \
+ }
+
+ /**
+ * An interface for an input stream.
+ */
+ class InStream {
+ public:
+ /**
+     * Read len bytes from the stream into the buffer.
+     * @param buf the buffer to read into
+     * @param len the number of bytes to read
+ * @throws Error if there are problems reading
+ */
+ virtual void read(void *buf, size_t len) = 0;
+ virtual ~InStream() {}
+ };
+
+ /**
+ * An interface for an output stream.
+ */
+ class OutStream {
+ public:
+ /**
+ * Write the given buffer to the stream.
+ * @param buf the data to write
+ * @param len the number of bytes to write
+ * @throws Error if there are problems writing
+ */
+ virtual void write(const void *buf, size_t len) = 0;
+ /**
+ * Flush the data to the underlying store.
+ */
+ virtual void flush() = 0;
+ virtual ~OutStream() {}
+ };
+
+ /**
+ * A class to read a file as a stream.
+ */
+ class FileInStream : public InStream {
+ public:
+ FileInStream();
+ bool open(const std::string& name);
+ bool open(FILE* file);
+ void read(void *buf, size_t buflen);
+ bool skip(size_t nbytes);
+ bool close();
+ virtual ~FileInStream();
+ private:
+ /**
+     * The file to read from.
+ */
+ FILE *mFile;
+ /**
+     * Is this class responsible for closing the FILE*?
+ */
+ bool isOwned;
+ };
+
+ /**
+ * A class to write a stream to a file.
+ */
+ class FileOutStream: public OutStream {
+ public:
+
+ /**
+ * Create a stream that isn't bound to anything.
+ */
+ FileOutStream();
+
+ /**
+ * Create the given file, potentially overwriting an existing file.
+ */
+ bool open(const std::string& name, bool overwrite);
+ bool open(FILE* file);
+ void write(const void* buf, size_t len);
+ bool advance(size_t nbytes);
+ void flush();
+ bool close();
+ virtual ~FileOutStream();
+ private:
+ FILE *mFile;
+ bool isOwned;
+ };
+
+ /**
+ * A stream that reads from a string.
+ */
+ class StringInStream: public InStream {
+ public:
+ StringInStream(const std::string& str);
+ virtual void read(void *buf, size_t buflen);
+ private:
+ const std::string& buffer;
+ std::string::const_iterator itr;
+ };
+
+ void serializeInt(int32_t t, OutStream& stream);
+ int32_t deserializeInt(InStream& stream);
+ void serializeLong(int64_t t, OutStream& stream);
+ int64_t deserializeLong(InStream& stream);
+ void serializeFloat(float t, OutStream& stream);
+ float deserializeFloat(InStream& stream);
+ void serializeString(const std::string& t, OutStream& stream);
+ void deserializeString(std::string& t, InStream& stream);
+}
+
+#endif
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/StringUtils.hh b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/StringUtils.hh
new file mode 100644
index 0000000000..4720172725
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/StringUtils.hh
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef HADOOP_STRING_UTILS_HH
+#define HADOOP_STRING_UTILS_HH
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+namespace HadoopUtils {
+
+ /**
+ * Convert an integer to a string.
+ */
+ std::string toString(int32_t x);
+
+ /**
+ * Convert a string to an integer.
+ * @throws Error if the string is not a valid integer
+ */
+ int32_t toInt(const std::string& val);
+
+ /**
+ * Convert the string to a float.
+ * @throws Error if the string is not a valid float
+ */
+ float toFloat(const std::string& val);
+
+ /**
+ * Convert the string to a boolean.
+ * @throws Error if the string is not a valid boolean value
+ */
+ bool toBool(const std::string& val);
+
+ /**
+ * Get the current time in the number of milliseconds since 1970.
+ */
+ uint64_t getCurrentMillis();
+
+ /**
+   * Split a string into "words". Multiple delimiters are treated as a single
+ * word break, so no zero-length words are returned.
+ * @param str the string to split
+ * @param separator a list of characters that divide words
+ */
+  std::vector<std::string> splitString(const std::string& str,
+ const char* separator);
+
+ /**
+   * Quote a string to avoid "\", non-printable characters, and the
+   * delimiters.
+   * @param str the string to quote
+   * @param delimiters the set of characters to always quote
+   */
+  std::string quoteString(const std::string& str,
+                           const char* delimiters);
+
+ /**
+ * Unquote the given string to return the original string.
+ * @param str the string to unquote
+ */
+ std::string unquoteString(const std::string& str);
+
+}
+
+#endif
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/TemplateFactory.hh b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/TemplateFactory.hh
new file mode 100644
index 0000000000..22e10ae56f
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/include/hadoop/TemplateFactory.hh
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef HADOOP_PIPES_TEMPLATE_FACTORY_HH
+#define HADOOP_PIPES_TEMPLATE_FACTORY_HH
+
+namespace HadoopPipes {
+
+  template <class mapper, class reducer>
+  class TemplateFactory2: public Factory {
+  public:
+    Mapper* createMapper(MapContext& context) const {
+      return new mapper(context);
+    }
+    Reducer* createReducer(ReduceContext& context) const {
+      return new reducer(context);
+    }
+  };
+
+  template <class mapper, class reducer, class partitioner>
+  class TemplateFactory3: public TemplateFactory2<mapper, reducer> {
+  public:
+    Partitioner* createPartitioner(MapContext& context) const {
+      return new partitioner(context);
+    }
+  };
+
+  template <class mapper, class reducer>
+  class TemplateFactory3<mapper, reducer, void>
+    : public TemplateFactory2<mapper, reducer> {
+  };
+
+  template <class mapper, class reducer, class partitioner, class combiner>
+  class TemplateFactory4
+    : public TemplateFactory3<mapper, reducer, partitioner>{
+  public:
+    Reducer* createCombiner(MapContext& context) const {
+      return new combiner(context);
+    }
+  };
+
+  template <class mapper, class reducer, class partitioner>
+  class TemplateFactory4<mapper, reducer, partitioner, void>
+    : public TemplateFactory3<mapper, reducer, partitioner>{
+  };
+
+  template <class mapper, class reducer, class partitioner,
+            class combiner, class recordReader>
+  class TemplateFactory5
+    : public TemplateFactory4<mapper, reducer, partitioner, combiner>{
+  public:
+    RecordReader* createRecordReader(MapContext& context) const {
+      return new recordReader(context);
+    }
+  };
+
+  template <class mapper, class reducer, class partitioner, class combiner>
+  class TemplateFactory5<mapper, reducer, partitioner, combiner, void>
+    : public TemplateFactory4<mapper, reducer, partitioner, combiner>{
+  };
+
+  template <class mapper, class reducer, class partitioner=void,
+            class combiner=void, class recordReader=void,
+            class recordWriter=void>
+  class TemplateFactory
+    : public TemplateFactory5<mapper, reducer, partitioner, combiner,
+                              recordReader>{
+  public:
+    RecordWriter* createRecordWriter(ReduceContext& context) const {
+      return new recordWriter(context);
+    }
+  };
+
+  template <class mapper, class reducer, class partitioner,
+            class combiner, class recordReader>
+  class TemplateFactory<mapper, reducer, partitioner, combiner, recordReader,
+                        void>
+    : public TemplateFactory5<mapper, reducer, partitioner, combiner,
+                              recordReader>{
+  };
+
+}
+
+#endif
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhadooppipes.a b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhadooppipes.a
new file mode 100644
index 0000000000..73debcec92
Binary files /dev/null and b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhadooppipes.a differ
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhadooputils.a b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhadooputils.a
new file mode 100644
index 0000000000..6753169b8a
Binary files /dev/null and b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhadooputils.a differ
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.la b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.la
new file mode 100644
index 0000000000..b6ce94229f
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.la
@@ -0,0 +1,41 @@
+# libhdfs.la - a libtool library file
+# Generated by ltmain.sh (GNU libtool) 2.2
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# The name that we can dlopen(3).
+dlname='libhdfs.so.0'
+
+# Names of this library.
+library_names='libhdfs.so.0.0.0 libhdfs.so.0 libhdfs.so'
+
+# The name of the static archive.
+old_library=''
+
+# Linker flags that can not go in dependency_libs.
+inherited_linker_flags=''
+
+# Libraries that this one depends upon.
+dependency_libs=' -L/home/hadoopqa/tools/java/latest1.6-32/jre/lib/i386/server -ljvm -ldl -lpthread'
+
+# Names of additional weak libraries provided by this library
+weak_library_names=''
+
+# Version information for libhdfs.
+current=0
+age=0
+revision=0
+
+# Is this an already installed library?
+installed=yes
+
+# Should we warn about portability when linking against -modules?
+shouldnotlink=no
+
+# Files to dlopen/dlpreopen
+dlopen=''
+dlpreopen=''
+
+# Directory that this library needs to be installed in:
+libdir='/home/ndaley/hadoop/branch-0.20/build/c++/Linux-i386-32/lib'
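
The metadata above shows that libhdfs links against the JVM's libjvm plus libdl and libpthread. Below is a rough sketch of a client of its C API; hdfs.h is not part of this diff, so the calls are assumed from the same 0.20 release, and the path and message are made up:

    #include <fcntl.h>
    #include <string.h>
    #include "hdfs.h"

    int main() {
      // Connect to whatever fs.default.name the client configuration names.
      hdfsFS fs = hdfsConnect("default", 0);
      if (fs == NULL) return 1;

      // Create a file and write a short message into it.
      hdfsFile f = hdfsOpenFile(fs, "/tmp/hello.txt",
                                O_WRONLY | O_CREAT, 0, 0, 0);
      if (f == NULL) return 1;
      const char* msg = "hello from libhdfs\n";
      hdfsWrite(fs, f, (void*) msg, strlen(msg));
      hdfsCloseFile(fs, f);
      return hdfsDisconnect(fs);
    }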
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so
new file mode 100644
index 0000000000..358d582d43
Binary files /dev/null and b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so differ
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so.0 b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so.0
new file mode 100644
index 0000000000..358d582d43
Binary files /dev/null and b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so.0 differ
diff --git a/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so.0.0.0 b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so.0.0.0
new file mode 100644
index 0000000000..358d582d43
Binary files /dev/null and b/core/lib/hadoop-0.20.0/c++/Linux-i386-32/lib/libhdfs.so.0.0.0 differ
diff --git a/core/lib/hadoop-0.20.0/conf/capacity-scheduler.xml b/core/lib/hadoop-0.20.0/conf/capacity-scheduler.xml
new file mode 100644
index 0000000000..d22a3964b4
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/capacity-scheduler.xml
@@ -0,0 +1,156 @@
+<?xml version="1.0"?>
+
+<configuration>
+
+  <property>
+    <name>mapred.capacity-scheduler.queue.default.guaranteed-capacity</name>
+    <value>100</value>
+    <description>Percentage of the number of slots in the cluster that are
+      guaranteed to be available for jobs in this queue.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.queue.default.reclaim-time-limit</name>
+    <value>300</value>
+    <description>The amount of time, in seconds, before which
+      resources distributed to other queues will be reclaimed.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.queue.default.supports-priority</name>
+    <value>false</value>
+    <description>If true, priorities of jobs will be taken into
+      account in scheduling decisions.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.queue.default.minimum-user-limit-percent</name>
+    <value>100</value>
+    <description>Each queue enforces a limit on the percentage of resources
+      allocated to a user at any given time, if there is competition for them.
+      This user limit can vary between a minimum and maximum value. The former
+      depends on the number of users who have submitted jobs, and the latter is
+      set to this property value. For example, suppose the value of this
+      property is 25. If two users have submitted jobs to a queue, no single
+      user can use more than 50% of the queue resources. If a third user submits
+      a job, no single user can use more than 33% of the queue resources. With 4
+      or more users, no user can use more than 25% of the queue's resources. A
+      value of 100 implies no user limits are imposed.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.queue.default.maximum-initialized-jobs-per-user</name>
+    <value>2</value>
+    <description>The maximum number of jobs to be pre-initialized for a user
+      of the job queue.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.reclaimCapacity.interval</name>
+    <value>5</value>
+    <description>The time interval, in seconds, at which the scheduler
+      periodically determines whether capacity needs to be reclaimed for
+      any queue.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.default-reclaim-time-limit</name>
+    <value>300</value>
+    <description>The amount of time, in seconds, before which
+      resources distributed to other queues will be reclaimed by default
+      in a job queue.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.default-supports-priority</name>
+    <value>false</value>
+    <description>If true, priorities of jobs will be taken into
+      account in scheduling decisions by default in a job queue.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.task.default-pmem-percentage-in-vmem</name>
+    <value>-1</value>
+    <description>If mapred.task.maxpmem is set to -1, this configuration will
+      be used to calculate a job's physical memory requirements as a percentage
+      of the job's virtual memory requirements set via mapred.task.maxvmem. This
+      property thus provides a default value of physical memory for jobs that
+      don't explicitly specify physical memory requirements.
+
+      If not explicitly set to a valid value, the scheduler will not consider
+      physical memory for scheduling even if virtual memory based scheduling is
+      enabled (by setting valid values for both mapred.task.default.maxvmem and
+      mapred.task.limit.maxvmem).
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.task.limit.maxpmem</name>
+    <value>-1</value>
+    <description>Configuration that provides an upper limit on the maximum
+      physical memory that can be specified by a job. The job configuration
+      mapred.task.maxpmem should be less than this value. If not, the job will
+      be rejected by the scheduler.
+
+      If it is set to -1, the scheduler will not consider physical memory for
+      scheduling even if virtual memory based scheduling is enabled (by setting
+      valid values for both mapred.task.default.maxvmem and
+      mapred.task.limit.maxvmem).
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.default-minimum-user-limit-percent</name>
+    <value>100</value>
+    <description>The percentage of the resources limited to a particular user
+      for the job queue at any given point of time by default.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.default-maximum-initialized-jobs-per-user</name>
+    <value>2</value>
+    <description>The maximum number of jobs to be pre-initialized for a user
+      of the job queue.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.init-poll-interval</name>
+    <value>5000</value>
+    <description>The amount of time in milliseconds which is used to poll
+      the job queues for jobs to initialize.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.capacity-scheduler.init-worker-threads</name>
+    <value>5</value>
+    <description>Number of worker threads used by the initialization poller
+      to initialize jobs in a set of queues. If the number equals the number
+      of job queues, each thread initializes jobs in one queue. If it is
+      smaller, a thread is assigned a set of queues. If it is greater, the
+      number of threads used equals the number of job queues.
+    </description>
+  </property>
+
+</configuration>
diff --git a/core/lib/hadoop-0.20.0/conf/configuration.xsl b/core/lib/hadoop-0.20.0/conf/configuration.xsl
new file mode 100644
index 0000000000..377cdbeb93
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/configuration.xsl
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+<xsl:output method="html"/>
+<xsl:template match="configuration">
+<html>
+<body>
+<table border="1">
+<tr>
+ <td>name</td>
+ <td>value</td>
+ <td>description</td>
+</tr>
+<xsl:for-each select="property">
+<tr>
+  <td><a name="{name}"><xsl:value-of select="name"/></a></td>
+  <td><xsl:value-of select="value"/></td>
+  <td><xsl:value-of select="description"/></td>
+</tr>
+</xsl:for-each>
+</table>
+</body>
+</html>
+</xsl:template>
+</xsl:stylesheet>
diff --git a/core/lib/hadoop-0.20.0/conf/core-site.xml b/core/lib/hadoop-0.20.0/conf/core-site.xml
new file mode 100644
index 0000000000..970c8fe0e8
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/core-site.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+</configuration>
diff --git a/core/lib/hadoop-0.20.0/conf/hadoop-env.sh b/core/lib/hadoop-0.20.0/conf/hadoop-env.sh
new file mode 100644
index 0000000000..ada5bef1c7
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/hadoop-env.sh
@@ -0,0 +1,54 @@
+# Set Hadoop-specific environment variables here.
+
+# The only required environment variable is JAVA_HOME. All others are
+# optional. When running a distributed configuration it is best to
+# set JAVA_HOME in this file, so that it is correctly defined on
+# remote nodes.
+
+# The java implementation to use. Required.
+# export JAVA_HOME=/usr/lib/j2sdk1.5-sun
+
+# Extra Java CLASSPATH elements. Optional.
+# export HADOOP_CLASSPATH=
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+# export HADOOP_HEAPSIZE=2000
+
+# Extra Java runtime options. Empty by default.
+# export HADOOP_OPTS=-server
+
+# Command specific options appended to HADOOP_OPTS when specified
+export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS"
+export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS"
+export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS"
+export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS"
+export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS"
+# export HADOOP_TASKTRACKER_OPTS=
+# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
+# export HADOOP_CLIENT_OPTS
+
+# Extra ssh options. Empty by default.
+# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR"
+
+# Where log files are stored. $HADOOP_HOME/logs by default.
+# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs
+
+# File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default.
+# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves
+
+# host:path where hadoop code should be rsync'd from. Unset by default.
+# export HADOOP_MASTER=master:/home/$USER/src/hadoop
+
+# Seconds to sleep between slave commands. Unset by default. This
+# can be useful in large clusters, where, e.g., slave rsyncs can
+# otherwise arrive faster than the master can service them.
+# export HADOOP_SLAVE_SLEEP=0.1
+
+# The directory where pid files are stored. /tmp by default.
+# export HADOOP_PID_DIR=/var/hadoop/pids
+
+# A string representing this instance of hadoop. $USER by default.
+# export HADOOP_IDENT_STRING=$USER
+
+# The scheduling priority for daemon processes. See 'man nice'.
+# export HADOOP_NICENESS=10
diff --git a/core/lib/hadoop-0.20.0/conf/hadoop-metrics.properties b/core/lib/hadoop-0.20.0/conf/hadoop-metrics.properties
new file mode 100644
index 0000000000..d04dffc438
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/hadoop-metrics.properties
@@ -0,0 +1,40 @@
+# Configuration of the "dfs" context for null
+dfs.class=org.apache.hadoop.metrics.spi.NullContext
+
+# Configuration of the "dfs" context for file
+#dfs.class=org.apache.hadoop.metrics.file.FileContext
+#dfs.period=10
+#dfs.fileName=/tmp/dfsmetrics.log
+
+# Configuration of the "dfs" context for ganglia
+# dfs.class=org.apache.hadoop.metrics.ganglia.GangliaContext
+# dfs.period=10
+# dfs.servers=localhost:8649
+
+
+# Configuration of the "mapred" context for null
+mapred.class=org.apache.hadoop.metrics.spi.NullContext
+
+# Configuration of the "mapred" context for file
+#mapred.class=org.apache.hadoop.metrics.file.FileContext
+#mapred.period=10
+#mapred.fileName=/tmp/mrmetrics.log
+
+# Configuration of the "mapred" context for ganglia
+# mapred.class=org.apache.hadoop.metrics.ganglia.GangliaContext
+# mapred.period=10
+# mapred.servers=localhost:8649
+
+
+# Configuration of the "jvm" context for null
+jvm.class=org.apache.hadoop.metrics.spi.NullContext
+
+# Configuration of the "jvm" context for file
+#jvm.class=org.apache.hadoop.metrics.file.FileContext
+#jvm.period=10
+#jvm.fileName=/tmp/jvmmetrics.log
+
+# Configuration of the "jvm" context for ganglia
+# jvm.class=org.apache.hadoop.metrics.ganglia.GangliaContext
+# jvm.period=10
+# jvm.servers=localhost:8649
diff --git a/core/lib/hadoop-0.20.0/conf/hadoop-policy.xml b/core/lib/hadoop-0.20.0/conf/hadoop-policy.xml
new file mode 100644
index 0000000000..ef48f2bbed
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/hadoop-policy.xml
@@ -0,0 +1,97 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+  <property>
+    <name>security.client.protocol.acl</name>
+    <value>*</value>
+    <description>ACL for ClientProtocol, which is used by user code
+    via the DistributedFileSystem.
+    The ACL is a comma-separated list of user and group names. The user and
+    group list is separated by a blank. E.g. "alice,bob users,wheel".
+    A special value of "*" means all users are allowed.</description>
+  </property>
+
+  <property>
+    <name>security.client.datanode.protocol.acl</name>
+    <value>*</value>
+    <description>ACL for ClientDatanodeProtocol, the client-to-datanode protocol
+    for block recovery.
+    The ACL is a comma-separated list of user and group names. The user and
+    group list is separated by a blank. E.g. "alice,bob users,wheel".
+    A special value of "*" means all users are allowed.</description>
+  </property>
+
+  <property>
+    <name>security.datanode.protocol.acl</name>
+    <value>*</value>
+    <description>ACL for DatanodeProtocol, which is used by datanodes to
+    communicate with the namenode.
+    The ACL is a comma-separated list of user and group names. The user and
+    group list is separated by a blank. E.g. "alice,bob users,wheel".
+    A special value of "*" means all users are allowed.</description>
+  </property>
+
+  <property>
+    <name>security.inter.datanode.protocol.acl</name>
+    <value>*</value>
+    <description>ACL for InterDatanodeProtocol, the inter-datanode protocol
+    for updating generation timestamp.
+    The ACL is a comma-separated list of user and group names. The user and
+    group list is separated by a blank. E.g. "alice,bob users,wheel".
+    A special value of "*" means all users are allowed.</description>
+  </property>
+
+  <property>
+    <name>security.namenode.protocol.acl</name>
+    <value>*</value>
+    <description>ACL for NamenodeProtocol, the protocol used by the secondary
+    namenode to communicate with the namenode.
+    The ACL is a comma-separated list of user and group names. The user and
+    group list is separated by a blank. E.g. "alice,bob users,wheel".
+    A special value of "*" means all users are allowed.</description>
+  </property>
+
+  <property>
+    <name>security.inter.tracker.protocol.acl</name>
+    <value>*</value>
+    <description>ACL for InterTrackerProtocol, used by the tasktrackers to
+    communicate with the jobtracker.
+    The ACL is a comma-separated list of user and group names. The user and
+    group list is separated by a blank. E.g. "alice,bob users,wheel".
+    A special value of "*" means all users are allowed.</description>
+  </property>
+
+  <property>
+    <name>security.job.submission.protocol.acl</name>
+    <value>*</value>
+    <description>ACL for JobSubmissionProtocol, used by job clients to
+    communicate with the jobtracker for job submission, querying job status etc.
+    The ACL is a comma-separated list of user and group names. The user and
+    group list is separated by a blank. E.g. "alice,bob users,wheel".
+    A special value of "*" means all users are allowed.</description>
+  </property>
+
+  <property>
+    <name>security.task.umbilical.protocol.acl</name>
+    <value>*</value>
+    <description>ACL for TaskUmbilicalProtocol, used by the map and reduce
+    tasks to communicate with the parent tasktracker.
+    The ACL is a comma-separated list of user and group names. The user and
+    group list is separated by a blank. E.g. "alice,bob users,wheel".
+    A special value of "*" means all users are allowed.</description>
+  </property>
+
+  <property>
+    <name>security.refresh.policy.protocol.acl</name>
+    <value>*</value>
+    <description>ACL for RefreshAuthorizationPolicyProtocol, used by the
+    dfsadmin and mradmin commands to refresh the security policy in effect.
+    The ACL is a comma-separated list of user and group names. The user and
+    group list is separated by a blank. E.g. "alice,bob users,wheel".
+    A special value of "*" means all users are allowed.</description>
+  </property>
+
+</configuration>
diff --git a/core/lib/hadoop-0.20.0/conf/hdfs-site.xml b/core/lib/hadoop-0.20.0/conf/hdfs-site.xml
new file mode 100644
index 0000000000..970c8fe0e8
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/hdfs-site.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+</configuration>
diff --git a/core/lib/hadoop-0.20.0/conf/log4j.properties b/core/lib/hadoop-0.20.0/conf/log4j.properties
new file mode 100644
index 0000000000..d797df6dab
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/log4j.properties
@@ -0,0 +1,94 @@
+# Define some default values that can be overridden by system properties
+hadoop.root.logger=INFO,console
+hadoop.log.dir=.
+hadoop.log.file=hadoop.log
+
+# Define the root logger to the system property "hadoop.root.logger".
+log4j.rootLogger=${hadoop.root.logger}, EventCounter
+
+# Logging Threshold
+log4j.threshold=ALL
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Rollover at midnight
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+
+#
+# console
+# Add "console" to rootlogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
+
+#
+# TaskLog Appender
+#
+
+#Default values
+hadoop.tasklog.taskid=null
+hadoop.tasklog.noKeepSplits=4
+hadoop.tasklog.totalLogFileSize=100
+hadoop.tasklog.purgeLogSplits=true
+hadoop.tasklog.logsRetainHours=12
+
+log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
+log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
+log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
+
+log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
+log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+
+#
+# Rolling File Appender
+#
+
+#log4j.appender.RFA=org.apache.log4j.RollingFileAppender
+#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Logfile size and 30-day backups
+#log4j.appender.RFA.MaxFileSize=1MB
+#log4j.appender.RFA.MaxBackupIndex=30
+
+#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+#
+# FSNamesystem Audit logging
+# All audit events are logged at INFO level
+#
+log4j.logger.org.apache.hadoop.fs.FSNamesystem.audit=WARN
+
+# Custom Logging levels
+
+#log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG
+#log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG
+#log4j.logger.org.apache.hadoop.fs.FSNamesystem=DEBUG
+
+# Jets3t library
+log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR
+
+#
+# Event Counter Appender
+# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+#
+log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
diff --git a/core/lib/hadoop-0.20.0/conf/mapred-site.xml b/core/lib/hadoop-0.20.0/conf/mapred-site.xml
new file mode 100644
index 0000000000..970c8fe0e8
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/mapred-site.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+</configuration>
diff --git a/core/lib/hadoop-0.20.0/conf/masters b/core/lib/hadoop-0.20.0/conf/masters
new file mode 100644
index 0000000000..2fbb50c4a8
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/masters
@@ -0,0 +1 @@
+localhost
diff --git a/core/lib/hadoop-0.20.0/conf/slaves b/core/lib/hadoop-0.20.0/conf/slaves
new file mode 100644
index 0000000000..2fbb50c4a8
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/slaves
@@ -0,0 +1 @@
+localhost
diff --git a/core/lib/hadoop-0.20.0/conf/ssl-client.xml.example b/core/lib/hadoop-0.20.0/conf/ssl-client.xml.example
new file mode 100644
index 0000000000..ec3fd41fa8
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/ssl-client.xml.example
@@ -0,0 +1,57 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+<property>
+  <name>ssl.client.truststore.location</name>
+  <value></value>
+  <description>Truststore to be used by clients like distcp. Must be
+  specified.
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.truststore.password</name>
+  <value></value>
+  <description>Optional. Default value is "".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.truststore.type</name>
+  <value>jks</value>
+  <description>Optional. Default value is "jks".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.location</name>
+  <value></value>
+  <description>Keystore to be used by clients like distcp. Must be
+  specified.
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.password</name>
+  <value></value>
+  <description>Optional. Default value is "".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.keypassword</name>
+  <value></value>
+  <description>Optional. Default value is "".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.type</name>
+  <value>jks</value>
+  <description>Optional. Default value is "jks".
+  </description>
+</property>
+
+</configuration>
diff --git a/core/lib/hadoop-0.20.0/conf/ssl-server.xml.example b/core/lib/hadoop-0.20.0/conf/ssl-server.xml.example
new file mode 100644
index 0000000000..22e9cb0ebb
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/conf/ssl-server.xml.example
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+<property>
+  <name>ssl.server.truststore.location</name>
+  <value></value>
+  <description>Truststore to be used by NN and DN. Must be specified.
+  </description>
+</property>
+
+<property>
+  <name>ssl.server.truststore.password</name>
+  <value></value>
+  <description>Optional. Default value is "".
+  </description>
+</property>
+
+<property>
+  <name>ssl.server.truststore.type</name>
+  <value>jks</value>
+  <description>Optional. Default value is "jks".
+  </description>
+</property>
+
+<property>
+  <name>ssl.server.keystore.location</name>
+  <value></value>
+  <description>Keystore to be used by NN and DN. Must be specified.
+  </description>
+</property>
+
+<property>
+  <name>ssl.server.keystore.password</name>
+  <value></value>
+  <description>Must be specified.
+  </description>
+</property>
+
+<property>
+  <name>ssl.server.keystore.keypassword</name>
+  <value></value>
+  <description>Must be specified.
+  </description>
+</property>
+
+<property>
+  <name>ssl.server.keystore.type</name>
+  <value>jks</value>
+  <description>Optional. Default value is "jks".
+  </description>
+</property>
+
+</configuration>
diff --git a/core/lib/hadoop-0.20.0/contrib/capacity-scheduler/hadoop-0.20.0-capacity-scheduler.jar b/core/lib/hadoop-0.20.0/contrib/capacity-scheduler/hadoop-0.20.0-capacity-scheduler.jar
new file mode 100644
index 0000000000..b4900e565e
Binary files /dev/null and b/core/lib/hadoop-0.20.0/contrib/capacity-scheduler/hadoop-0.20.0-capacity-scheduler.jar differ
diff --git a/core/lib/hadoop-0.20.0/contrib/datajoin/hadoop-0.20.0-datajoin.jar b/core/lib/hadoop-0.20.0/contrib/datajoin/hadoop-0.20.0-datajoin.jar
new file mode 100644
index 0000000000..21294d4d1d
Binary files /dev/null and b/core/lib/hadoop-0.20.0/contrib/datajoin/hadoop-0.20.0-datajoin.jar differ
diff --git a/core/lib/hadoop-0.20.0/contrib/eclipse-plugin/hadoop-0.20.0-eclipse-plugin.jar b/core/lib/hadoop-0.20.0/contrib/eclipse-plugin/hadoop-0.20.0-eclipse-plugin.jar
new file mode 100644
index 0000000000..7b316393f6
Binary files /dev/null and b/core/lib/hadoop-0.20.0/contrib/eclipse-plugin/hadoop-0.20.0-eclipse-plugin.jar differ
diff --git a/core/lib/hadoop-0.20.0/contrib/fairscheduler/hadoop-0.20.0-fairscheduler.jar b/core/lib/hadoop-0.20.0/contrib/fairscheduler/hadoop-0.20.0-fairscheduler.jar
new file mode 100644
index 0000000000..758b98367c
Binary files /dev/null and b/core/lib/hadoop-0.20.0/contrib/fairscheduler/hadoop-0.20.0-fairscheduler.jar differ
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/README b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/README
new file mode 100644
index 0000000000..2c33988926
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/README
@@ -0,0 +1,30 @@
+HDFSPROXY is an HTTPS proxy server that exposes the same HSFTP interface as a
+real cluster. It authenticates users via user certificates and enforces access
+control based on configuration files.
+
+Starting up an HDFSPROXY server is similar to starting up an HDFS cluster.
+Simply run the "hdfsproxy" shell command. The main configuration file is
+hdfsproxy-default.xml, which should be on the classpath. hdfsproxy-env.sh
+can be used to set up environment variables. In particular, JAVA_HOME should
+be set. Additional configuration files include user-certs.xml,
+user-permissions.xml and ssl-server.xml, which are used to specify allowed user
+certs, allowed directories/files, and ssl keystore information for the proxy,
+respectively. The location of these files can be specified in
+hdfsproxy-default.xml. The environment variable HDFSPROXY_CONF_DIR can be used
+to point to the directory where these configuration files are located. The
+configuration files of the proxied HDFS cluster should also be available on the
+classpath (hdfs-default.xml and hdfs-site.xml).
+
+Mirroring those used in HDFS, a few shell scripts are provided to start and
+stop a group of proxy servers. The hosts to run hdfsproxy on are specified in
+hdfsproxy-hosts file, one host per line. All hdfsproxy servers are stateless
+and run independently from each other. Simple load balancing can be set up by
+mapping all hdfsproxy server IP addresses to a single hostname. Users should
+use that hostname to access the proxy. If an IP address lookup for that
+hostname returns more than one IP address, an HFTP/HSFTP client will randomly
+pick one to use.
+
+Command "hdfsproxy -reloadPermFiles" can be used to trigger reloading of
+user-certs.xml and user-permissions.xml files on all proxy servers listed in
+the hdfsproxy-hosts file. Similarly, "hdfsproxy -clearUgiCache" command can be
+used to clear the UGI caches on all proxy servers.
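+
+For example, a typical admin session might look like this (illustrative
+commands, run from the hdfsproxy installation directory):
+
+  bin/start-hdfsproxy.sh           # start proxies on all hosts in hdfsproxy-hosts
+  bin/hdfsproxy -reloadPermFiles   # reload user-certs.xml and user-permissions.xml
+  bin/hdfsproxy -clearUgiCache     # clear the UGI caches on all proxies
+  bin/stop-hdfsproxy.sh            # stop all proxies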
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy
new file mode 100755
index 0000000000..1b1e597891
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# The HdfsProxy command script
+#
+# Environment Variables
+#
+#   JAVA_HOME        The java implementation to use.  Required.
+#
+# HDFSPROXY_CLASSPATH Extra Java CLASSPATH entries.
+#
+# HDFSPROXY_HEAPSIZE The maximum amount of heap to use, in MB.
+# Default is 1000.
+#
+# HDFSPROXY_OPTS Extra Java runtime options.
+#
+#   HDFSPROXY_NAMENODE_OPTS  These options are added to HDFSPROXY_OPTS
+#   HDFSPROXY_CLIENT_OPTS    when the respective command is run.
+#   HDFSPROXY_{COMMAND}_OPTS Per-command options; for example,
+#                            HDFSPROXY_CLIENT_OPTS applies to more than
+#                            one command (fs, dfs, fsck, dfsadmin etc)
+#
+# HDFSPROXY_CONF_DIR Alternate conf dir. Default is ${HDFSPROXY_HOME}/conf.
+#
+# HDFSPROXY_ROOT_LOGGER The root appender. Default is INFO,console
+#
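+# For example (illustrative values, not defaults):
+#
+#   export JAVA_HOME=/usr/lib/jvm/java-6
+#   export HDFSPROXY_HEAPSIZE=2000           # becomes -Xmx2000m below
+#   export HDFSPROXY_ROOT_LOGGER=DEBUG,console
+#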
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hdfsproxy-config.sh
+
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+if [ -f "${HDFSPROXY_CONF_DIR}/hdfsproxy-env.sh" ]; then
+ . "${HDFSPROXY_CONF_DIR}/hdfsproxy-env.sh"
+fi
+
+# some Java parameters
+if [ "$JAVA_HOME" = "" ]; then
+  echo "Error: JAVA_HOME is not set."
+  exit 1
+fi
+
+JAVA=$JAVA_HOME/bin/java
+JAVA_HEAP_MAX=-Xmx1000m
+
+# check envvars which might override default args
+if [ "$HDFSPROXY_HEAPSIZE" != "" ]; then
+ #echo "run with heapsize $HDFSPROXY_HEAPSIZE"
+ JAVA_HEAP_MAX="-Xmx""$HDFSPROXY_HEAPSIZE""m"
+ #echo $JAVA_HEAP_MAX
+fi
+
+# CLASSPATH initially contains $HDFSPROXY_CONF_DIR
+CLASSPATH="${HDFSPROXY_CONF_DIR}"
+CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
+
+# for developers, add HdfsProxy classes to CLASSPATH
+if [ -d "$HDFSPROXY_HOME/build/classes" ]; then
+ CLASSPATH=${CLASSPATH}:$HDFSPROXY_HOME/build/classes
+fi
+if [ -d "$HDFSPROXY_HOME/build/webapps" ]; then
+ CLASSPATH=${CLASSPATH}:$HDFSPROXY_HOME/build
+fi
+if [ -d "$HDFSPROXY_HOME/build/test/classes" ]; then
+ CLASSPATH=${CLASSPATH}:$HDFSPROXY_HOME/build/test/classes
+fi
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# for releases, add hdfsproxy jar & webapps to CLASSPATH
+if [ -d "$HDFSPROXY_HOME/webapps" ]; then
+ CLASSPATH=${CLASSPATH}:$HDFSPROXY_HOME
+fi
+for f in $HDFSPROXY_HOME/hdfsproxy-*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+# add libs to CLASSPATH
+if [ -d "$HDFSPROXY_HOME/lib" ]; then
+ for f in $HDFSPROXY_HOME/lib/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+fi
+
+if [ -d "$HDFSPROXY_HOME/../../" ]; then
+ for f in $HDFSPROXY_HOME/../../*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+fi
+if [ -d "$HDFSPROXY_HOME/../../lib" ]; then
+ for f in $HDFSPROXY_HOME/../../lib/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+fi
+if [ -d "$HDFSPROXY_HOME/../../lib/jsp-2.1" ]; then
+ for f in $HDFSPROXY_HOME/../../lib/jsp-2.1/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+fi
+
+
+# add user-specified CLASSPATH last
+if [ "$HDFSPROXY_CLASSPATH" != "" ]; then
+ CLASSPATH=${CLASSPATH}:${HDFSPROXY_CLASSPATH}
+fi
+
+# default log directory & file
+if [ "$HDFSPROXY_LOG_DIR" = "" ]; then
+ HDFSPROXY_LOG_DIR="$HDFSPROXY_HOME/logs"
+fi
+if [ "$HDFSPROXY_LOGFILE" = "" ]; then
+ HDFSPROXY_LOGFILE='hdfsproxy.log'
+fi
+
+# restore ordinary behaviour
+unset IFS
+
+# figure out which class to run
+CLASS='org.apache.hadoop.hdfsproxy.HdfsProxy'
+
+# cygwin path translation
+if $cygwin; then
+ CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+ HDFSPROXY_HOME=`cygpath -d "$HDFSPROXY_HOME"`
+ HDFSPROXY_LOG_DIR=`cygpath -d "$HDFSPROXY_LOG_DIR"`
+fi
+
+# cygwin path translation
+if $cygwin; then
+ JAVA_LIBRARY_PATH=`cygpath -p "$JAVA_LIBRARY_PATH"`
+fi
+
+HDFSPROXY_OPTS="$HDFSPROXY_OPTS -Dhdfsproxy.log.dir=$HDFSPROXY_LOG_DIR"
+HDFSPROXY_OPTS="$HDFSPROXY_OPTS -Dhdfsproxy.log.file=$HDFSPROXY_LOGFILE"
+HDFSPROXY_OPTS="$HDFSPROXY_OPTS -Dhdfsproxy.home.dir=$HDFSPROXY_HOME"
+HDFSPROXY_OPTS="$HDFSPROXY_OPTS -Dhdfsproxy.id.str=$HDFSPROXY_IDENT_STRING"
+HDFSPROXY_OPTS="$HDFSPROXY_OPTS -Dhdfsproxy.root.logger=${HDFSPROXY_ROOT_LOGGER:-INFO,console}"
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ HDFSPROXY_OPTS="$HDFSPROXY_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+fi
+
+# run it
+exec "$JAVA" $JAVA_HEAP_MAX $HDFSPROXY_OPTS -classpath "$CLASSPATH" $CLASS "$@"
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-config.sh b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-config.sh
new file mode 100755
index 0000000000..8fe6aac68b
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-config.sh
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# included in all the hadoop scripts with source command
+# should not be executable directly
+# also should not be passed any arguments, since we need original $*
+
+# resolve links - $0 may be a softlink
+
+this="$0"
+while [ -h "$this" ]; do
+ ls=`ls -ld "$this"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ this="$link"
+ else
+ this=`dirname "$this"`/"$link"
+ fi
+done
+
+# convert relative path to absolute path
+bin=`dirname "$this"`
+script=`basename "$this"`
+bin=`cd "$bin"; pwd`
+this="$bin/$script"
+
+# the root of the HdfsProxy installation
+export HDFSPROXY_HOME=`dirname "$this"`/..
+
+#check to see if the conf dir is given as an optional argument
+if [ $# -gt 1 ]
+then
+ if [ "--config" = "$1" ]
+ then
+ shift
+ confdir=$1
+ shift
+ HDFSPROXY_CONF_DIR=$confdir
+ fi
+fi
+
+# Allow alternate conf dir location.
+HDFSPROXY_CONF_DIR="${HDFSPROXY_CONF_DIR:-$HDFSPROXY_HOME/conf}"
+
+#check to see it is specified whether to use the slaves file
+if [ $# -gt 1 ]
+then
+ if [ "--hosts" = "$1" ]
+ then
+ shift
+ slavesfile=$1
+ shift
+ export HDFSPROXY_SLAVES="${HDFSPROXY_CONF_DIR}/$slavesfile"
+ fi
+fi
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-daemon.sh b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-daemon.sh
new file mode 100755
index 0000000000..6d5a75247f
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-daemon.sh
@@ -0,0 +1,141 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Runs a HdfsProxy as a daemon.
+#
+# Environment Variables
+#
+# HDFSPROXY_CONF_DIR Alternate conf dir. Default is ${HDFSPROXY_HOME}/conf.
+# HDFSPROXY_LOG_DIR Where log files are stored. PWD by default.
+# HDFSPROXY_MASTER host:path where hdfsproxy code should be rsync'd from
+#   HDFSPROXY_PID_DIR   The directory where pid files are stored. /tmp by default.
+# HDFSPROXY_IDENT_STRING A string representing this instance of hdfsproxy. $USER by default
+# HDFSPROXY_NICENESS The scheduling priority for daemons. Defaults to 0.
+##
+
+usage="Usage: hdfsproxy-daemon.sh [--config ] [--hosts hostlistfile] (start|stop) "
+
+# if no args specified, show usage
+if [ $# -le 1 ]; then
+ echo $usage
+ exit 1
+fi
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hdfsproxy-config.sh
+
+# get arguments
+startStop=$1
+shift
+
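+# Rotate $1 -> $1.1 -> ... keeping up to $2 (default 5) old copies and
+# discarding the oldest.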
+hdfsproxy_rotate_log ()
+{
+ log=$1;
+ num=5;
+ if [ -n "$2" ]; then
+ num=$2
+ fi
+ if [ -f "$log" ]; then # rotate logs
+ while [ $num -gt 1 ]; do
+ prev=`expr $num - 1`
+ [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num"
+ num=$prev
+ done
+ mv "$log" "$log.$num";
+ fi
+}
+
+if [ -f "${HDFSPROXY_CONF_DIR}/hdfsproxy-env.sh" ]; then
+ . "${HDFSPROXY_CONF_DIR}/hdfsproxy-env.sh"
+fi
+
+# get log directory
+if [ "$HDFSPROXY_LOG_DIR" = "" ]; then
+ export HDFSPROXY_LOG_DIR="$HDFSPROXY_HOME/logs"
+fi
+mkdir -p "$HDFSPROXY_LOG_DIR"
+
+if [ "$HDFSPROXY_PID_DIR" = "" ]; then
+ HDFSPROXY_PID_DIR=/tmp
+fi
+
+if [ "$HDFSPROXY_IDENT_STRING" = "" ]; then
+ export HDFSPROXY_IDENT_STRING="$USER"
+fi
+
+# some variables
+export HDFSPROXY_LOGFILE=hdfsproxy-$HDFSPROXY_IDENT_STRING-$HOSTNAME.log
+export HDFSPROXY_ROOT_LOGGER="INFO,DRFA"
+log=$HDFSPROXY_LOG_DIR/hdfsproxy-$HDFSPROXY_IDENT_STRING-$HOSTNAME.out
+pid=$HDFSPROXY_PID_DIR/hdfsproxy-$HDFSPROXY_IDENT_STRING.pid
+
+# Set default scheduling priority
+if [ "$HDFSPROXY_NICENESS" = "" ]; then
+ export HDFSPROXY_NICENESS=0
+fi
+
+case $startStop in
+
+ (start)
+
+ mkdir -p "$HDFSPROXY_PID_DIR"
+
+ if [ -f $pid ]; then
+ if kill -0 `cat $pid` > /dev/null 2>&1; then
+ echo hdfsproxy running as process `cat $pid`. Stop it first.
+ exit 1
+ fi
+ fi
+
+ if [ "$HDFSPROXY_MASTER" != "" ]; then
+ echo rsync from $HDFSPROXY_MASTER
+ rsync -a -e ssh --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' $HDFSPROXY_MASTER/ "$HDFSPROXY_HOME"
+ fi
+
+ hdfsproxy_rotate_log $log
+ echo starting hdfsproxy, logging to $log
+ cd "$HDFSPROXY_HOME"
+ nohup nice -n $HDFSPROXY_NICENESS "$HDFSPROXY_HOME"/bin/hdfsproxy --config $HDFSPROXY_CONF_DIR "$@" > "$log" 2>&1 < /dev/null &
+ echo $! > $pid
+ sleep 1; head "$log"
+ ;;
+
+ (stop)
+
+ if [ -f $pid ]; then
+ if kill -0 `cat $pid` > /dev/null 2>&1; then
+ echo stopping hdfsproxy
+ kill `cat $pid`
+ else
+ echo no hdfsproxy to stop
+ fi
+ else
+ echo no hdfsproxy to stop
+ fi
+ ;;
+
+ (*)
+ echo $usage
+ exit 1
+ ;;
+
+esac
+
+
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-daemons.sh b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-daemons.sh
new file mode 100755
index 0000000000..7dd8568a3b
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-daemons.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Run a HdfsProxy command on all slave hosts.
+
+usage="Usage: hdfsproxy-daemons.sh [--config confdir] [--hosts hostlistfile] [start|stop] "
+
+# if no args specified, show usage
+if [ $# -le 1 ]; then
+ echo $usage
+ exit 1
+fi
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. $bin/hdfsproxy-config.sh
+
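+# On each slave, cd into HDFSPROXY_HOME and run hdfsproxy-daemon.sh with
+# the same --config dir and the remaining (start|stop) arguments.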
+exec "$bin/hdfsproxy-slaves.sh" --config $HDFSPROXY_CONF_DIR cd "$HDFSPROXY_HOME" \; "$bin/hdfsproxy-daemon.sh" --config $HDFSPROXY_CONF_DIR "$@"
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-slaves.sh b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-slaves.sh
new file mode 100755
index 0000000000..db54bd5b38
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/hdfsproxy-slaves.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Run a shell command on all slave hosts.
+#
+# Environment Variables
+#
+# HDFSPROXY_SLAVES File naming remote hosts.
+# Default is ${HDFSPROXY_CONF_DIR}/hdfsproxy-hosts.
+# HDFSPROXY_CONF_DIR Alternate conf dir. Default is ${HDFSPROXY_HOME}/conf.
+# HDFSPROXY_SLAVE_SLEEP Seconds to sleep between spawning remote commands.
+# HDFSPROXY_SSH_OPTS Options passed to ssh when running remote commands.
+##
+
+usage="Usage: hdfsproxy-slaves.sh [--config confdir] command..."
+
+# if no args specified, show usage
+if [ $# -le 0 ]; then
+ echo $usage
+ exit 1
+fi
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hdfsproxy-config.sh
+
+# If the slaves file is specified in the command line,
+# then it takes precedence over the definition in
+# hdfsproxy-env.sh. Save it here.
+HOSTLIST=$HDFSPROXY_SLAVES
+
+if [ -f "${HDFSPROXY_CONF_DIR}/hdfsproxy-env.sh" ]; then
+ . "${HDFSPROXY_CONF_DIR}/hdfsproxy-env.sh"
+fi
+
+if [ "$HOSTLIST" = "" ]; then
+ if [ "$HDFSPROXY_SLAVES" = "" ]; then
+ export HOSTLIST="${HDFSPROXY_CONF_DIR}/hdfsproxy-hosts"
+ else
+ export HOSTLIST="${HDFSPROXY_SLAVES}"
+ fi
+fi
+
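+# ssh to each host in $HOSTLIST in parallel, prefixing every line of
+# output with the slave's name; spaces in arguments are escaped so they
+# survive the remote shell.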
+for slave in `cat "$HOSTLIST"`; do
+ ssh $HDFSPROXY_SSH_OPTS $slave $"${@// /\\ }" \
+ 2>&1 | sed "s/^/$slave: /" &
+ if [ "$HDFSPROXY_SLAVE_SLEEP" != "" ]; then
+ sleep $HDFSPROXY_SLAVE_SLEEP
+ fi
+done
+
+wait
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/start-hdfsproxy.sh b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/start-hdfsproxy.sh
new file mode 100755
index 0000000000..2592d9c8cc
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/start-hdfsproxy.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Start hdfsproxy daemons.
+# Run this on master node.
+
+usage="Usage: start-hdfsproxy.sh"
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hdfsproxy-config.sh
+
+# get arguments
+if [ $# -ge 1 ]; then
+ echo $usage
+ exit 1
+fi
+
+# start hdfsproxy daemons
+# "$bin"/hdfsproxy-daemon.sh --config $HDFSPROXY_CONF_DIR start
+"$bin"/hdfsproxy-daemons.sh --config $HDFSPROXY_CONF_DIR --hosts hdfsproxy-hosts start
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/stop-hdfsproxy.sh b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/stop-hdfsproxy.sh
new file mode 100755
index 0000000000..78089e31cf
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/bin/stop-hdfsproxy.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Stop hdfsproxy daemons. Run this on master node.
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hdfsproxy-config.sh
+
+# "$bin"/hdfsproxy-daemon.sh --config $HDFSPROXY_CONF_DIR stop
+"$bin"/hdfsproxy-daemons.sh --config $HDFSPROXY_CONF_DIR --hosts hdfsproxy-hosts stop
+
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/build.xml b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/build.xml
new file mode 100644
index 0000000000..e62b2f279a
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/build.xml
@@ -0,0 +1,183 @@
+<?xml version="1.0"?>
+<!-- Ant build file for the hdfsproxy contrib module ("Building the .jar files."). -->
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/configuration.xsl b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/configuration.xsl
new file mode 100644
index 0000000000..377cdbeb93
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/configuration.xsl
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+<xsl:output method="html"/>
+<xsl:template match="configuration">
+<html>
+<body>
+<table border="1">
+<tr>
+ <td>name</td>
+ <td>value</td>
+ <td>description</td>
+</tr>
+<xsl:for-each select="property">
+<tr>
+  <td><a name="{name}"><xsl:value-of select="name"/></a></td>
+  <td><xsl:value-of select="value"/></td>
+  <td><xsl:value-of select="description"/></td>
+</tr>
+</xsl:for-each>
+</table>
+</body>
+</html>
+</xsl:template>
+</xsl:stylesheet>
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-default.xml b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-default.xml
new file mode 100644
index 0000000000..0d2a006c8e
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-default.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+<property>
+  <name>hdfsproxy.https.address</name>
+  <value>0.0.0.0:50479</value>
+  <description>the SSL port that hdfsproxy listens on
+  </description>
+</property>
+
+<property>
+  <name>hdfsproxy.hosts</name>
+  <value>hdfsproxy-hosts</value>
+  <description>location of hdfsproxy-hosts file
+  </description>
+</property>
+
+<property>
+  <name>hdfsproxy.dfs.namenode.address</name>
+  <value></value>
+  <description>namenode address of the HDFS cluster being proxied
+  </description>
+</property>
+
+<property>
+  <name>hdfsproxy.https.server.keystore.resource</name>
+  <value>ssl-server.xml</value>
+  <description>location of the resource from which ssl server keystore
+  information will be extracted
+  </description>
+</property>
+
+<property>
+  <name>hdfsproxy.user.permissions.file.location</name>
+  <value>user-permissions.xml</value>
+  <description>location of the user permissions file
+  </description>
+</property>
+
+<property>
+  <name>hdfsproxy.user.certs.file.location</name>
+  <value>user-certs.xml</value>
+  <description>location of the user certs file
+  </description>
+</property>
+
+<property>
+  <name>hdfsproxy.ugi.cache.ugi.lifetime</name>
+  <value>15</value>
+  <description>The lifetime (in minutes) of a cached ugi
+  </description>
+</property>
+
+</configuration>
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-env.sh b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-env.sh
new file mode 100644
index 0000000000..a0ff7a5d27
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-env.sh
@@ -0,0 +1,44 @@
+# Set HdfsProxy-specific environment variables here.
+
+# The only required environment variable is JAVA_HOME. All others are
+# optional. When running a distributed configuration it is best to
+# set JAVA_HOME in this file, so that it is correctly defined on
+# remote nodes.
+
+# The java implementation to use. Required.
+# export JAVA_HOME=/usr/lib/j2sdk1.5-sun
+
+# Extra Java CLASSPATH elements. Optional.
+# export HDFSPROXY_CLASSPATH=
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+# export HDFSPROXY_HEAPSIZE=2000
+
+# Extra Java runtime options. Empty by default.
+# export HDFSPROXY_OPTS=
+
+# Extra ssh options. Empty by default.
+# export HDFSPROXY_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HDFSPROXY_CONF_DIR"
+
+# Where log files are stored. $HDFSPROXY_HOME/logs by default.
+# export HDFSPROXY_LOG_DIR=${HDFSPROXY_HOME}/logs
+
+# File naming remote slave hosts. $HDFSPROXY_HOME/conf/slaves by default.
+# export HDFSPROXY_SLAVES=${HDFSPROXY_HOME}/conf/slaves
+
+# host:path where hdfsproxy code should be rsync'd from. Unset by default.
+# export HDFSPROXY_MASTER=master:/home/$USER/src/hdfsproxy
+
+# Seconds to sleep between slave commands. Unset by default. This
+# can be useful in large clusters, where, e.g., slave rsyncs can
+# otherwise arrive faster than the master can service them.
+# export HDFSPROXY_SLAVE_SLEEP=0.1
+
+# The directory where pid files are stored. /tmp by default.
+# export HDFSPROXY_PID_DIR=/var/hdfsproxy/pids
+
+# A string representing this instance of hdfsproxy. $USER by default.
+# export HDFSPROXY_IDENT_STRING=$USER
+
+# The scheduling priority for daemon processes. See 'man nice'.
+# export HDFSPROXY_NICENESS=10
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-env.sh.template b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-env.sh.template
new file mode 100644
index 0000000000..a0ff7a5d27
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-env.sh.template
@@ -0,0 +1,44 @@
+# Set HdfsProxy-specific environment variables here.
+
+# The only required environment variable is JAVA_HOME. All others are
+# optional. When running a distributed configuration it is best to
+# set JAVA_HOME in this file, so that it is correctly defined on
+# remote nodes.
+
+# The java implementation to use. Required.
+# export JAVA_HOME=/usr/lib/j2sdk1.5-sun
+
+# Extra Java CLASSPATH elements. Optional.
+# export HDFSPROXY_CLASSPATH=
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+# export HDFSPROXY_HEAPSIZE=2000
+
+# Extra Java runtime options. Empty by default.
+# export HDFSPROXY_OPTS=
+
+# Extra ssh options. Empty by default.
+# export HDFSPROXY_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HDFSPROXY_CONF_DIR"
+
+# Where log files are stored. $HDFSPROXY_HOME/logs by default.
+# export HDFSPROXY_LOG_DIR=${HDFSPROXY_HOME}/logs
+
+# File naming remote slave hosts. $HDFSPROXY_HOME/conf/slaves by default.
+# export HDFSPROXY_SLAVES=${HDFSPROXY_HOME}/conf/slaves
+
+# host:path where hdfsproxy code should be rsync'd from. Unset by default.
+# export HDFSPROXY_MASTER=master:/home/$USER/src/hdfsproxy
+
+# Seconds to sleep between slave commands. Unset by default. This
+# can be useful in large clusters, where, e.g., slave rsyncs can
+# otherwise arrive faster than the master can service them.
+# export HDFSPROXY_SLAVE_SLEEP=0.1
+
+# The directory where pid files are stored. /tmp by default.
+# export HDFSPROXY_PID_DIR=/var/hdfsproxy/pids
+
+# A string representing this instance of hdfsproxy. $USER by default.
+# export HDFSPROXY_IDENT_STRING=$USER
+
+# The scheduling priority for daemon processes. See 'man nice'.
+# export HDFSPROXY_NICENESS=10
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-hosts b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-hosts
new file mode 100644
index 0000000000..2fbb50c4a8
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/hdfsproxy-hosts
@@ -0,0 +1 @@
+localhost
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/log4j.properties b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/log4j.properties
new file mode 100644
index 0000000000..2520ab3795
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/log4j.properties
@@ -0,0 +1,61 @@
+# Define some default values that can be overridden by system properties
+hdfsproxy.root.logger=INFO,console
+hdfsproxy.log.dir=.
+hdfsproxy.log.file=hdfsproxy.log
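+# (overridden at runtime, e.g. the hdfsproxy script passes
+# -Dhdfsproxy.root.logger=${HDFSPROXY_ROOT_LOGGER:-INFO,console} to java)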
+
+# Define the root logger to the system property "hdfsproxy.root.logger".
+log4j.rootLogger=${hdfsproxy.root.logger}
+
+# Logging Threshold
+log4j.threshold=ALL
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hdfsproxy.log.dir}/${hdfsproxy.log.file}
+
+# Rollover at midnight
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+
+#
+# console
+# Add "console" to rootlogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
+
+#
+# Rolling File Appender
+#
+
+#log4j.appender.RFA=org.apache.log4j.RollingFileAppender
+#log4j.appender.RFA.File=${hdfsproxy.log.dir}/${hdfsproxy.log.file}
+
+# Logfile size and 30-day backups
+#log4j.appender.RFA.MaxFileSize=1MB
+#log4j.appender.RFA.MaxBackupIndex=30
+
+#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+# Custom Logging levels
+
+#log4j.logger.org.apache.hadoop.hdfsproxy.HttpsProxy=DEBUG
+#log4j.logger.org.apache.hadoop.hdfsproxy.ProxyFilter=DEBUG
+
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/user-certs.xml b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/user-certs.xml
new file mode 100644
index 0000000000..f572a55294
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/user-certs.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Specifies the allowed user certs, one property per user;
+     see the hdfsproxy README. -->
+
+<configuration>
+
+<property>
+  <name>Admin</name>
+  <value></value>
+  <description>Special hdfsproxy admin user
+  </description>
+</property>
+
+</configuration>
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/user-permissions.xml b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/user-permissions.xml
new file mode 100644
index 0000000000..b7373751bd
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/conf/user-permissions.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Specifies the directories/files each user is allowed to access;
+     see the hdfsproxy README. -->
+
+<configuration>
+
+</configuration>
diff --git a/core/lib/hadoop-0.20.0/contrib/hdfsproxy/hdfsproxy-1.0.jar b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/hdfsproxy-1.0.jar
new file mode 100644
index 0000000000..a313391dfb
Binary files /dev/null and b/core/lib/hadoop-0.20.0/contrib/hdfsproxy/hdfsproxy-1.0.jar differ
diff --git a/core/lib/hadoop-0.20.0/contrib/index/hadoop-0.20.0-index.jar b/core/lib/hadoop-0.20.0/contrib/index/hadoop-0.20.0-index.jar
new file mode 100644
index 0000000000..f1f850fcd3
Binary files /dev/null and b/core/lib/hadoop-0.20.0/contrib/index/hadoop-0.20.0-index.jar differ
diff --git a/core/lib/hadoop-0.20.0/contrib/streaming/hadoop-0.20.0-streaming.jar b/core/lib/hadoop-0.20.0/contrib/streaming/hadoop-0.20.0-streaming.jar
new file mode 100644
index 0000000000..84251e3a3c
Binary files /dev/null and b/core/lib/hadoop-0.20.0/contrib/streaming/hadoop-0.20.0-streaming.jar differ
diff --git a/core/lib/hadoop-0.20.0/contrib/thriftfs/hadoop-0.20.0-thriftfs.jar b/core/lib/hadoop-0.20.0/contrib/thriftfs/hadoop-0.20.0-thriftfs.jar
new file mode 100644
index 0000000000..bf10c05e1d
Binary files /dev/null and b/core/lib/hadoop-0.20.0/contrib/thriftfs/hadoop-0.20.0-thriftfs.jar differ
diff --git a/core/lib/hadoop-0.20.0/contrib/vaidya/bin/vaidya.sh b/core/lib/hadoop-0.20.0/contrib/vaidya/bin/vaidya.sh
new file mode 100755
index 0000000000..ada6715342
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/vaidya/bin/vaidya.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+this="$0"
+while [ -h "$this" ]; do
+ ls=`ls -ld "$this"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ this="$link"
+ else
+ this=`dirname "$this"`/"$link"
+ fi
+done
+
+# convert relative path to absolute path
+bin=`dirname "$this"`
+script=`basename "$this"`
+bin=`cd "$bin"; pwd`
+this="$bin/$script"
+
+# Check if HADOOP_HOME and JAVA_HOME are set.
+if [ -z "$HADOOP_HOME" ] ; then
+  echo "HADOOP_HOME environment variable not defined"
+  exit 1;
+fi
+
+if [ -z "$JAVA_HOME" ] ; then
+  echo "JAVA_HOME environment variable not defined"
+  exit 1;
+fi
+
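+# Extract the version number (e.g. "0.20.0") from the first line of
+# "hadoop version" output, which reads "Hadoop <version>".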
+hadoopVersion=`$HADOOP_HOME/bin/hadoop version | awk 'BEGIN { RS = "" ; FS = "\n" } ; { print $1 }' | awk '{print $2}'`
+
+$JAVA_HOME/bin/java -classpath $HADOOP_HOME/hadoop-${hadoopVersion}-core.jar:$HADOOP_HOME/contrib/vaidya/hadoop-${hadoopVersion}-vaidya.jar:$HADOOP_HOME/lib/commons-logging-1.0.4.jar:${CLASSPATH} org.apache.hadoop.vaidya.postexdiagnosis.PostExPerformanceDiagnoser "$@"
diff --git a/core/lib/hadoop-0.20.0/contrib/vaidya/conf/postex_diagnosis_tests.xml b/core/lib/hadoop-0.20.0/contrib/vaidya/conf/postex_diagnosis_tests.xml
new file mode 100644
index 0000000000..f30d5d9cc8
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/contrib/vaidya/conf/postex_diagnosis_tests.xml
@@ -0,0 +1,104 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Post-execution diagnosis tests for vaidya; each test entry names a
+     diagnostic rule class and its threshold (e.g. 3.0). -->
diff --git a/core/lib/hadoop-0.20.0/contrib/vaidya/hadoop-0.20.0-vaidya.jar b/core/lib/hadoop-0.20.0/contrib/vaidya/hadoop-0.20.0-vaidya.jar
new file mode 100644
index 0000000000..534b18d974
Binary files /dev/null and b/core/lib/hadoop-0.20.0/contrib/vaidya/hadoop-0.20.0-vaidya.jar differ
diff --git a/core/lib/hadoop-0.20.0/hadoop-0.20.0-ant.jar b/core/lib/hadoop-0.20.0/hadoop-0.20.0-ant.jar
new file mode 100644
index 0000000000..0c2b74e74f
Binary files /dev/null and b/core/lib/hadoop-0.20.0/hadoop-0.20.0-ant.jar differ
diff --git a/core/lib/hadoop-0.20.0/hadoop-0.20.0-core.jar b/core/lib/hadoop-0.20.0/hadoop-0.20.0-core.jar
new file mode 100644
index 0000000000..c99ce6d44d
Binary files /dev/null and b/core/lib/hadoop-0.20.0/hadoop-0.20.0-core.jar differ
diff --git a/core/lib/hadoop-0.20.0/hadoop-0.20.0-examples.jar b/core/lib/hadoop-0.20.0/hadoop-0.20.0-examples.jar
new file mode 100644
index 0000000000..23b88f885d
Binary files /dev/null and b/core/lib/hadoop-0.20.0/hadoop-0.20.0-examples.jar differ
diff --git a/core/lib/hadoop-0.20.0/hadoop-0.20.0-test.jar b/core/lib/hadoop-0.20.0/hadoop-0.20.0-test.jar
new file mode 100644
index 0000000000..02b17d4160
Binary files /dev/null and b/core/lib/hadoop-0.20.0/hadoop-0.20.0-test.jar differ
diff --git a/core/lib/hadoop-0.20.0/hadoop-0.20.0-tools.jar b/core/lib/hadoop-0.20.0/hadoop-0.20.0-tools.jar
new file mode 100644
index 0000000000..60f5e600a6
Binary files /dev/null and b/core/lib/hadoop-0.20.0/hadoop-0.20.0-tools.jar differ
diff --git a/core/lib/hadoop-0.20.0/ivy.xml b/core/lib/hadoop-0.20.0/ivy.xml
new file mode 100644
index 0000000000..051ac6efb0
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/ivy.xml
@@ -0,0 +1,261 @@
+<?xml version="1.0" ?>
+<!-- Ivy module descriptor for Hadoop Core: declares the module's
+     dependency configurations and the artifacts ivy resolves. -->
diff --git a/core/lib/hadoop-0.20.0/ivy/hadoop-core.pom b/core/lib/hadoop-0.20.0/ivy/hadoop-core.pom
new file mode 100644
index 0000000000..ffdd18951e
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/ivy/hadoop-core.pom
@@ -0,0 +1,257 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>org.apache.hadoop</groupId>
+  <artifactId>hadoop-core</artifactId>
+  <packaging>jar</packaging>
+  <version>${hadoop.version}</version>
+  <description>
+    Hadoop is the distributed computing framework of Apache; hadoop-core contains
+    the filesystem, job tracker and map/reduce modules
+  </description>
+  <licenses>
+    <license>
+      <name>Apache License, Version 2.0</name>
+      <url>http://apache.org/licenses/LICENSE-2.0</url>
+    </license>
+  </licenses>
+  <dependencies>
+    <dependency>
+      <groupId>commons-logging</groupId>
+      <artifactId>commons-logging</artifactId>
+      <version>${commons-logging.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>avalon-framework</groupId>
+          <artifactId>avalon-framework</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet</groupId>
+          <artifactId>servlet-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>logkit</groupId>
+          <artifactId>logkit</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>log4j</groupId>
+      <artifactId>log4j</artifactId>
+      <version>${log4j.version}</version>
+      <scope>optional</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>javax.mail</groupId>
+          <artifactId>mail</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.jms</groupId>
+          <artifactId>jms</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jdmk</groupId>
+          <artifactId>jmxtools</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jmx</groupId>
+          <artifactId>jmxri</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>${slf4j-api.version}</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <version>${slf4j-log4j12.version}</version>
+      <scope>optional</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>commons-httpclient</groupId>
+      <artifactId>commons-httpclient</artifactId>
+      <version>3.1</version>
+      <scope>optional</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>1.3</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-cli</groupId>
+      <artifactId>commons-cli</artifactId>
+      <version>2.0-20070823</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-net</groupId>
+      <artifactId>commons-net</artifactId>
+      <version>1.4.1</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>javax.servlet</groupId>
+      <artifactId>servlet-api</artifactId>
+      <version>${servlet-api.version}</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>jetty</groupId>
+      <artifactId>org.mortbay.jetty</artifactId>
+      <version>${jetty.version}</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.mortbay.jetty</groupId>
+      <artifactId>jsp-2.1</artifactId>
+      <version>${jetty.version}</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.mortbay.jetty</groupId>
+      <artifactId>jsp-api-2.1</artifactId>
+      <version>${jetty.version}</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-el</groupId>
+      <artifactId>commons-el</artifactId>
+      <version>${commons-el.version}</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jdt</groupId>
+      <artifactId>core</artifactId>
+      <version>${core.version}</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ant</groupId>
+      <artifactId>ant</artifactId>
+      <version>${apacheant.version}</version>
+      <scope>optional</scope>
+    </dependency>
+    <dependency>
+      <groupId>net.java.dev.jets3t</groupId>
+      <artifactId>jets3t</artifactId>
+      <version>${jets3t.version}</version>
+      <scope>optional</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>xmlenc</groupId>
+      <artifactId>xmlenc</artifactId>
+      <version>0.52</version>
+      <scope>optional</scope>
+    </dependency>
+  </dependencies>
+</project>
diff --git a/core/lib/hadoop-0.20.0/ivy/ivy-2.0.0-rc2.jar b/core/lib/hadoop-0.20.0/ivy/ivy-2.0.0-rc2.jar
new file mode 100644
index 0000000000..fa9ef21c7b
Binary files /dev/null and b/core/lib/hadoop-0.20.0/ivy/ivy-2.0.0-rc2.jar differ
diff --git a/core/lib/hadoop-0.20.0/ivy/ivysettings.xml b/core/lib/hadoop-0.20.0/ivy/ivysettings.xml
new file mode 100644
index 0000000000..a7fcd22031
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/ivy/ivysettings.xml
@@ -0,0 +1,81 @@
+<?xml version="1.0" ?>
+<!-- Ivy settings: defines the resolvers/repositories ivy uses to
+     retrieve the dependencies declared in ivy.xml. -->
diff --git a/core/lib/hadoop-0.20.0/ivy/libraries.properties b/core/lib/hadoop-0.20.0/ivy/libraries.properties
new file mode 100644
index 0000000000..17cf390d92
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/ivy/libraries.properties
@@ -0,0 +1,71 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#This properties file lists the versions of the various artifacts used by hadoop and components.
+#It drives ivy and the generation of a maven POM
+
+# This is the version of hadoop we are generating
+hadoop.version=0.20.0
+
+#These are the versions of our dependencies (in alphabetical order)
+apacheant.version=1.7.0
+
+checkstyle.version=4.2
+
+commons-cli.version=2.0-SNAPSHOT
+commons-codec.version=1.3
+commons-collections.version=3.1
+commons-httpclient.version=3.0.1
+commons-lang.version=2.4
+commons-logging.version=1.0.4
+commons-logging-api.version=1.0.4
+commons-el.version=1.0
+commons-fileupload.version=1.2
+commons-io.version=1.4
+commons-net.version=1.4.1
+core.version=3.1.1
+coreplugin.version=1.3.2
+
+hsqldb.version=1.8.0.10
+
+#ivy.version=2.0.0-beta2
+ivy.version=2.0.0-rc2
+
+jasper.version=5.5.12
+# not able to figure out the jsp & jsp-api versions to get them resolved through ivy,
+# but still declared here as we are going to have a local copy from the lib folder
+jsp.version=2.1
+jsp-api.version=5.5.12
+jets3t.version=0.6.1
+jetty.version=6.1.14
+jetty-util.version=6.1.14
+junit.version=3.8.1
+jdiff.version=1.0.9
+json.version=1.0
+
+kfs.version=0.1
+
+log4j.version=1.2.15
+lucene-core.version=2.3.1
+
+oro.version=2.0.8
+
+rats-lib.version=0.5.1
+
+servlet.version=4.0.6
+servlet-api-2.5.version=6.1.14
+servlet-api.version=2.5
+slf4j-api.version=1.4.3
+slf4j-log4j12.version=1.4.3
+
+xmlenc.version=0.52
+xerces.version=1.4.4
diff --git a/core/lib/hadoop-0.20.0/lib/.DS_Store b/core/lib/hadoop-0.20.0/lib/.DS_Store
new file mode 100644
index 0000000000..e0d363a012
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/.DS_Store differ
diff --git a/core/lib/hadoop-0.20.0/lib/commons-cli-2.0-SNAPSHOT.jar b/core/lib/hadoop-0.20.0/lib/commons-cli-2.0-SNAPSHOT.jar
new file mode 100644
index 0000000000..0b1d51072a
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/commons-cli-2.0-SNAPSHOT.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/commons-codec-1.3.jar b/core/lib/hadoop-0.20.0/lib/commons-codec-1.3.jar
new file mode 100644
index 0000000000..957b6752af
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/commons-codec-1.3.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/commons-el-1.0.jar b/core/lib/hadoop-0.20.0/lib/commons-el-1.0.jar
new file mode 100644
index 0000000000..608ed796ca
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/commons-el-1.0.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/commons-httpclient-3.0.1.jar b/core/lib/hadoop-0.20.0/lib/commons-httpclient-3.0.1.jar
new file mode 100644
index 0000000000..cfc777c71d
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/commons-httpclient-3.0.1.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/commons-logging-1.0.4.jar b/core/lib/hadoop-0.20.0/lib/commons-logging-1.0.4.jar
new file mode 100644
index 0000000000..b73a80fab6
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/commons-logging-1.0.4.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/commons-logging-api-1.0.4.jar b/core/lib/hadoop-0.20.0/lib/commons-logging-api-1.0.4.jar
new file mode 100644
index 0000000000..ade9a13c78
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/commons-logging-api-1.0.4.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/commons-net-1.4.1.jar b/core/lib/hadoop-0.20.0/lib/commons-net-1.4.1.jar
new file mode 100644
index 0000000000..9666a92c80
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/commons-net-1.4.1.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/core-3.1.1.jar b/core/lib/hadoop-0.20.0/lib/core-3.1.1.jar
new file mode 100644
index 0000000000..ae0b635867
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/core-3.1.1.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.LICENSE.txt b/core/lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.LICENSE.txt
new file mode 100644
index 0000000000..d45b9f8cc0
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.LICENSE.txt
@@ -0,0 +1,66 @@
+/* Copyright (c) 1995-2000, The Hypersonic SQL Group.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * Neither the name of the Hypersonic SQL Group nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE HYPERSONIC SQL GROUP,
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This software consists of voluntary contributions made by many individuals
+ * on behalf of the Hypersonic SQL Group.
+ *
+ *
+ * For work added by the HSQL Development Group:
+ *
+ * Copyright (c) 2001-2004, The HSQL Development Group
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * Neither the name of the HSQL Development Group nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HSQL DEVELOPMENT GROUP, HSQLDB.ORG,
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
diff --git a/core/lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.jar b/core/lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.jar
new file mode 100644
index 0000000000..e010269ddf
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/hsqldb-1.8.0.10.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/jasper-compiler-5.5.12.jar b/core/lib/hadoop-0.20.0/lib/jasper-compiler-5.5.12.jar
new file mode 100644
index 0000000000..2a410b4b58
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/jasper-compiler-5.5.12.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/jasper-runtime-5.5.12.jar b/core/lib/hadoop-0.20.0/lib/jasper-runtime-5.5.12.jar
new file mode 100644
index 0000000000..743d906c1f
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/jasper-runtime-5.5.12.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.17.0.xml b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.17.0.xml
new file mode 100644
index 0000000000..69dded3140
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.17.0.xml
@@ -0,0 +1,43272 @@
+<!-- JDiff description of the Hadoop 0.17.0 API: package, class, and
+     method signatures with their javadoc. -->
+<!-- Method javadoc for org.apache.hadoop.conf.Configuration: addResource
+     variants (by classpath name, URL, or file path), typed getters and
+     setters (String, int, long, float, boolean, String arrays and
+     comma-delimited values, Class objects checked against an interface),
+     helpers that pick a local file under one of the directories in a
+     comma-separated dirsProp based on the path's hash code, resource
+     streams/readers, iteration over all key-value pairs, and a
+     quiet-mode toggle. -->
+
+ Resources
+
+ Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+ Hadoop by default specifies two resources, loaded in-order from the
+ classpath:
+   hadoop-default.xml: Read-only defaults for hadoop.
+   hadoop-site.xml: Site-specific configuration for a given hadoop
+   installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+ Final Parameters
+
+ Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+   <property>
+     <name>dfs.client.buffer.dir</name>
+     <value>/tmp/hadoop/dfs/client</value>
+     <final>true</final>
+   </property>
+
+ Value strings are first processed for variable expansion. For example,
+ given the properties
+
+   <property>
+     <name>basedir</name>
+     <value>/user/${user.name}</value>
+   </property>
+   <property>
+     <name>tempdir</name>
+     <value>${basedir}/tmp</value>
+   </property>
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
+ The balancer is a tool that balances disk space usage on an HDFS cluster
+ when some datanodes become full or when new empty nodes join the cluster.
+ The tool is deployed as an application program that can be run by the
+ cluster administrator on a live HDFS cluster while applications are
+ adding and deleting files.
+
+ SYNOPSIS
+
+ To start:
+      bin/start-balancer.sh [-threshold <threshold>]
+      Example: bin/start-balancer.sh
+                     start the balancer with a default threshold of 10%
+               bin/start-balancer.sh -threshold 5
+                     start the balancer with a threshold of 5%
+ To stop:
+      bin/stop-balancer.sh
+
+
+
DESCRIPTION
+
+ The threshold parameter is a fraction in the range of (0%, 100%) with a
+ default value of 10%. The threshold sets a target for whether the cluster
+ is balanced. A cluster is balanced if, for each datanode, the utilization
+ of the node (ratio of used space at the node to total capacity of the node)
+ differs from the utilization of the cluster (ratio of used space in the
+ cluster to total capacity of the cluster) by no more than the threshold
+ value. The smaller the threshold, the more balanced a cluster will become.
+ It takes more time to run the balancer for small threshold values.
+ Also for a very small threshold the cluster may not be able to reach the
+ balanced state when applications write and delete files concurrently.
+
+
The tool moves blocks from highly utilized datanodes to poorly
+ utilized datanodes iteratively. In each iteration a datanode moves or
+ receives no more than the lesser of 10G bytes or the threshold fraction
+ of its capacity. Each iteration runs no more than 20 minutes.
+ At the end of each iteration, the balancer obtains updated datanodes
+ information from the namenode.
+
+
A system property that limits the balancer's use of bandwidth is
+ defined in the default configuration file:
+
+       <property>
+         <name>dfs.balance.bandwidthPerSec</name>
+         <value>1048576</value>
+         <description>  Specifies the maximum bandwidth that each datanode
+           can utilize for the balancing purpose in terms of the number of
+           bytes per second.
+         </description>
+       </property>
+
+
This property determines the maximum speed at which a block will be
+ moved from one datanode to another. The default value is 1MB/s. The higher
+ the bandwidth, the faster a cluster can reach the balanced state,
+ but with greater competition with application processes. If an
+ administrator changes the value of this property in the configuration
+ file, the change is observed when HDFS is next restarted.
+
+
+ MONITORING BALANCER PROGRESS
+
After the balancer is started, an output file name where the balancer
+ progress will be recorded is printed on the screen. The administrator
+ can monitor the running of the balancer by reading the output file.
+ The output shows the balancer's status iteration by iteration. In each
+ iteration it prints the starting time, the iteration number, the total
+ number of bytes that have been moved in the previous iterations,
+ the total number of bytes that are left to move in order for the cluster
+ to be balanced, and the number of bytes that are being moved in this
+ iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left
+ To Move" is decreasing.
+
+
Running multiple instances of the balancer in an HDFS cluster is
+ prohibited by the tool.
+
+
The balancer automatically exits when any of the following five
+ conditions is satisfied:
+
+
The cluster is balanced;
+
No block can be moved;
+
No block has been moved for five consecutive iterations;
+
An IOException occurs while communicating with the namenode;
+
Another balancer is running.
+
+
+
Upon exit, a balancer returns an exit code and prints one of the
+ following messages to the output file, corresponding to the above exit
+ reasons:
+
+
The cluster is balanced. Exiting
+
No block can be moved. Exiting...
+
No block has been moved for five iterations. Exiting...
+
Received an IO exception: failure reason. Exiting...
+
Another balancer is running. Exiting...
+
+
+
The administrator can interrupt the execution of the balancer at any
+ time by running the command "stop-balancer.sh" on the machine where the
+ balancer is running.]]>
+ in]]>
+
+
+
+
+
+
+ out.]]>
+
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+ stream of bytes (of BLOCK_SIZE or less)
+
+ This info is stored on a local disk. The DataNode
+ reports the table's contents to the NameNode upon startup
+ and every so often afterwards.
+
+ DataNodes spend their lives in an endless loop of asking
+ the NameNode for something to do. A NameNode cannot connect
+ to a DataNode directly; a NameNode simply returns values from
+ functions invoked by a DataNode.
+
+ DataNodes maintain an open server socket so that client code
+ or other DataNodes can read/write data. The host/port for
+ this server is reported to the NameNode, which then sends that
+ information to clients or other DataNodes that might be interested.]]>
+ The tool scans all files and directories, starting from an indicated
+ root path. The following abnormal conditions are detected and handled:
+
+
files with blocks that are completely missing from all datanodes.
+ In this case the tool can perform one of the following actions:
+
+
none ({@link NamenodeFsck#FIXING_NONE})
+
move corrupted files to /lost+found directory on DFS
+ ({@link NamenodeFsck#FIXING_MOVE}). Remaining data blocks are saved as
+ block chains, representing the longest consecutive series of valid blocks.
{@link FSConstants.StartupOption#REGULAR REGULAR} - normal startup
+
{@link FSConstants.StartupOption#FORMAT FORMAT} - format name node
+
{@link FSConstants.StartupOption#UPGRADE UPGRADE} - start the cluster
+ upgrade and create a snapshot of the current file system state
+
{@link FSConstants.StartupOption#ROLLBACK ROLLBACK} - roll the
+ cluster back to the previous state
+
+ The option is passed via configuration field:
+ dfs.namenode.startup
+
+ The conf will be modified to reflect the actual ports on which
+ the NameNode is up and running if the user passes the port as
+ zero in the conf.
+
+ @param conf configuration
+ @throws IOException]]>
+
+
+
+
+
+ zero.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ datanode whose
+ total size is size
+
+ @param datanode on which blocks are located
+ @param size total size of blocks]]>
+ 1) filename->blocksequence (namespace)
+ 2) block->machinelist ("inodes")
+
+ The first table is stored on disk and is very precious.
+ The second table is rebuilt every time the NameNode comes
+ up.
+
+ 'NameNode' refers to both this class as well as the 'NameNode server'.
+ The 'FSNamesystem' class actually performs most of the filesystem
+ management. The majority of the 'NameNode' class itself is concerned
+ with exposing the IPC interface to the outside world, plus some
+ configuration management.
+
+ NameNode implements the ClientProtocol interface, which allows
+ clients to ask for DFS services. ClientProtocol is not
+ designed for direct use by authors of DFS client code. End-users
+ should instead use the org.apache.hadoop.fs.FileSystem class.
+
+ NameNode also implements the DatanodeProtocol interface, used by
+ DataNode programs that actually store DFS data blocks. These
+ methods are invoked repeatedly and automatically by all the
+ DataNodes in a DFS deployment.
+
+ NameNode also implements the NamenodeProtocol interface, used by
+ secondary namenodes or rebalancing processes to get partial namenode's
+ state, for example partial blocksMap etc.]]>
+ The tool scans all files and directories, starting from an indicated
+ root path. The following abnormal conditions are detected and handled:
+
+
files with blocks that are completely missing from all datanodes.
+ In this case the tool can perform one of the following actions:
+
+
none ({@link #FIXING_NONE})
+
move corrupted files to /lost+found directory on DFS
+ ({@link #FIXING_MOVE}). Remaining data blocks are saved as
+ block chains, representing the longest consecutive series of valid blocks.
+
delete corrupted files ({@link #FIXING_DELETE})
+
+
+
detect files with under-replicated or over-replicated blocks
+
+ Additionally, the tool collects detailed overall DFS statistics, and
+ optionally can print detailed statistics on block locations and replication
+ factors of each file.]]>
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
The most important difference is that unlike GFS, Hadoop DFS files
+have strictly one writer at any one time. Bytes are always appended
+to the end of the writer's stream. There is no notion of "record appends"
+or "mutations" that are then checked or reordered. Writers simply emit
+a byte stream. That byte stream is guaranteed to be stored in the
+order written.
]]>
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link JobConf}. The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip files) are un-archived at the slave nodes. Jars may be
+ optionally added to the classpath of the tasks, a rudimentary software
+ distribution mechanism. Files have execution permissions. Optionally users
+ can also direct it to symlink the distributed cache file(s) into
+ the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip"), job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+
+ 3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a path pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is {@link DistributedFileSystem}.]]>
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf destination buffer
+ @param offset offset in buf at which to store data
+ @param len maximum number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the position to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destination buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksummed. The buffer gets checksummed
+ and flushed to the underlying output stream when all the data
+ in a checksum chunk is in the buffer. If the buffer is empty and the
+ requested length is at least as large as the size of the next checksum
+ chunk, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids unnecessary data copies.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+This page describes how to use the Kosmos Filesystem
+(KFS) as a backing
+store with Hadoop. This page assumes that you have downloaded the
+KFS software and installed the necessary binaries as outlined in the KFS
+documentation.
+
+
Steps
+
+
+
In the Hadoop conf directory edit hadoop-default.xml,
+ add the following:
+
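+
+<property>
+  <name>fs.kfs.impl</name>
+  <value>org.apache.hadoop.fs.kfs.KosmosFileSystem</value>
+  <description>The FileSystem for kfs: uris.</description>
+</property>
+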
In the Hadoop conf directory edit hadoop-site.xml,
+ adding the following (with appropriate values for
+ <server> and <port>):
+
+<property>
+ <name>fs.default.name</name>
+ <value>kfs://<server:port></value>
+</property>
+
+<property>
+ <name>fs.kfs.metaServerHost</name>
+ <value><server></value>
+ <description>The location of the KFS meta server.</description>
+</property>
+
+<property>
+ <name>fs.kfs.metaServerPort</name>
+ <value><port></value>
+ <description>The location of the meta server's port.</description>
+</property>
+
+
+
+
+
Copy KFS's kfs-0.1.jar to Hadoop's lib directory. This step
+ enables Hadoop to load the KFS-specific modules. Note
+ that kfs-0.1.jar was built when you compiled the KFS source
+ code. This jar file contains code that calls KFS's client
+ library code via JNI; the native code is in KFS's
+ libkfsClient.so library.
+
+
+
When the Hadoop map/reduce trackers start up, those
+processes (on local as well as remote nodes) will now need to load
+KFS's libkfsClient.so library. To simplify this process, it is advisable to
+store libkfsClient.so in an NFS accessible directory (similar to where
+Hadoop binaries/scripts are stored); then, modify Hadoop's
+conf/hadoop-env.sh adding the following line and providing suitable
+value for <path>:
+
+export LD_LIBRARY_PATH=<path>
+
+
+
+
Start only the map/reduce trackers
+
+ example: execute Hadoop's bin/start-mapred.sh
+Files are stored in S3 as blocks (represented by
+{@link org.apache.hadoop.fs.s3.Block}), which have an ID and a length.
+Block metadata is stored in S3 as a small record (represented by
+{@link org.apache.hadoop.fs.s3.INode}) using the URL-encoded
+path string as a key. Inodes record the file type (regular file or directory) and the list of blocks.
+This design makes it easy to seek to any given position in a file by reading the inode data to compute
+which block to access, then using S3's support for
+HTTP Range headers
+to start streaming from the correct position.
+Renames are also efficient since only the inode is moved (by a DELETE followed by a PUT since
+S3 does not support renames).
+
+
+For a single file /dir1/file1 which takes two blocks of storage, the file structure in S3
+would be something like this:
+
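+(the block IDs below are illustrative; real block IDs are arbitrary numbers)
+
+/
+/dir1
+/dir1/file1
+block-1234567890
+block-9876543210
+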
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
+
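+ A short usage sketch (the key name "example.key" and the Text payload
+ are illustrative):
+
+ Configuration conf = new Configuration();
+ // store a Writable under a configuration key, then load it back
+ DefaultStringifier.store(conf, new Text("payload"), "example.key");
+ Text restored = DefaultStringifier.load(conf, "example.key", Text.class);
+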
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+ When two sequence files, which have the same Key type but different Value
+ types, are mapped out to reduce, multiple Value types are not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more effective,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ How to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), which defines
+ the classes that will be wrapped in GenericObject in the application.
+ Attention: the classes defined in the getTypes() method must
+ implement the Writable interface.
+
+
+ @since Nov 8, 2006]]>
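+
+ A minimal sketch of steps 1 and 2 (the wrapped classes are illustrative):
+
+ public class GenericObject extends GenericWritable {
+   private static Class[] CLASSES = {
+     Text.class,
+     IntWritable.class,
+   };
+   // every class listed here must implement Writable
+   protected Class[] getTypes() {
+     return CLASSES;
+   }
+ }
+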
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+ closes the input and output streams
+ at the end.
+ @param in InputStrem to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+ o is a LongWritable with the same value.]]>
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to choose the preferred format.
+
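+ For instance, a record-compressed writer might be created along these
+ lines (the path and key/value classes here are illustrative):
+
+ SequenceFile.Writer writer = SequenceFile.createWriter(
+     fs, conf, new Path("/data/example.seq"),
+     Text.class, IntWritable.class,
+     SequenceFile.CompressionType.RECORD);
+ writer.append(new Text("key"), new IntWritable(1));
+ writer.close();
+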
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting at position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOuput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deseriablize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.counter;
+ int thatValue = w.counter;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input stream in
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException if it could not skip the requested number of bytes]]>
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
+
+
+
+
+
+A mechanism for selectively retrying methods that throw exceptions under certain circumstances.
+
+
+
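+For example (UnreliableInterface and UnreliableImplementation stand in for
+the caller's own interface and implementation):
+
+UnreliableImplementation unreliableImpl = new UnreliableImplementation();
+UnreliableInterface unreliable = (UnreliableInterface)
+  RetryProxy.create(UnreliableInterface.class, unreliableImpl,
+    RetryPolicies.retryUpToMaximumCountWithFixedSleep(4, 10, TimeUnit.SECONDS));
+unreliable.call();
+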
+This will retry any method called on unreliable four times - in this case the call()
+method - sleeping 10 seconds between
+each retry. There are a number of {@link org.apache.hadoop.io.retry.RetryPolicies retry policies}
+available, or you can implement a custom one by implementing {@link org.apache.hadoop.io.retry.RetryPolicy}.
+It is also possible to specify retry policies on a
+{@link org.apache.hadoop.io.retry.RetryProxy#create(Class, Object, Map) per-method basis}.
+
]]>
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This package provides a mechanism for using different serialization frameworks
+in Hadoop. The property "io.serializations" defines a list of
+{@link org.apache.hadoop.io.serializer.Serialization}s that know how to create
+{@link org.apache.hadoop.io.serializer.Serializer}s and
+{@link org.apache.hadoop.io.serializer.Deserializer}s.
+
+
+
+To add a new serialization framework write an implementation of
+{@link org.apache.hadoop.io.serializer.Serialization} and add its name to the
+"io.serializations" property.
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
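+
+ A conforming protocol might look like the following sketch (the interface
+ and method names are illustrative; in this version of Hadoop an RPC
+ protocol is assumed to extend VersionedProtocol):
+
+ public interface ExampleProtocol extends VersionedProtocol {
+   long versionID = 1L;
+   // primitives, Strings, Writables and arrays of these are legal here
+   LongWritable getCount(String name) throws IOException;
+ }
+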
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcDiscardedOps}.inc(time)]]>
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
Group handles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides generic implementations of
+ {@link #validateInput(JobConf)} and {@link #getSplits(JobConf, int)}.
+ Implementations of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the taskid, say
+ task_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
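+
+ A short sketch of creating a side-file this way (the file name is
+ illustrative):
+
+ // from within a map/reduce task, with the task's JobConf in hand
+ Path workDir = FileOutputFormat.getWorkOutputPath(job);
+ Path sideFile = new Path(workDir, "side-data");
+ FSDataOutputStream out = sideFile.getFileSystem(job).create(sideFile);
+ out.writeUTF("side output");
+ out.close();
+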
+ This method is used to validate the input directories when a job is
+ submitted so that the {@link JobClient} can fail early, with a useful
+ error message, in case of errors, e.g. if an input directory does not exist.
+
+
+ @param job job configuration.
+ @throws InvalidInputException if the job does not have valid input]]>
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For e.g. a split could
+ be an <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size are insufficient for many
+ applications since record boundaries are to be respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+ responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ its status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
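+
+ A configuration sketch (the comparator classes are hypothetical
+ placeholders for application-supplied RawComparator implementations):
+
+ // Sort reduce inputs by the full composite key...
+ job.setOutputKeyComparatorClass(FullKeyComparator.class);
+ // ...but group values using only the primary part of the key
+ job.setOutputValueGroupingComparator(PrimaryKeyComparator.class);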
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is the same as
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is the same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
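+
+ For example, assuming the job's reducer performs an associative and
+ commutative aggregation and can therefore double as the combiner:
+
+ job.setCombinerClass(MyJob.MyReducer.class);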
+
+
+
+
+ true.
+
+ @return true if speculative execution should be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution should be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution should be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
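+
+ A sketch toggling speculative execution independently for the two task
+ types, e.g. to keep map-side speculation while protecting reduces that
+ have external side-effects:
+
+ job.setMapSpeculativeExecution(true);
+ job.setReduceSpeculativeExecution(false);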
+
+
+
+
+ 1.
+
+ @return the number of map tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very CPU-light map
+ tasks. Task setup takes a while, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
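+
+ A sketch of the arithmetic from the example above (all values
+ illustrative; the result is only a hint to the framework):
+
+ long inputBytes = 10L * 1024 * 1024 * 1024 * 1024;  // 10TB of input
+ long blockSize  = 128L * 1024 * 1024;               // 128MB blocks
+ int expectedMaps = (int) (inputBytes / blockSize);  // 81,920 maps
+ job.setNumMapTasks(expectedMaps);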
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ improves load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks goes directly to the distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing them out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
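+
+ A sketch using the 0.95 factor (assumes a JobClient is available to
+ query the cluster's reduce capacity; exception handling omitted):
+
+ ClusterStatus cluster = new JobClient(job).getClusterStatus();
+ job.setNumReduceTasks((int) (cluster.getMaxReduceTasks() * 0.95));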
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given the task's stdout, stderr, syslog, and jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
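+
+ A wiring sketch (the script path and symlink name are illustrative;
+ exception handling omitted):
+
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile(
+     new URI("/user/me/scripts/debug.sh#debugscript"), job);
+ job.setMapDebugScript("./debugscript");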
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given the task's stdout, stderr, syslog, and jobconf files as arguments.
+
+
The debug command, run on the node where the reduce failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked.
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
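+
+ For example (the host and endpoint are illustrative):
+
+ job.setJobEndNotificationURI(
+     "http://myhost:8080/jobdone?id=$jobId&status=$jobStatus");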
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among themselves.
+ This value is also available as a system property.
+
+ @return The localized job specific shared directory]]>
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straightforward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly with
+ the rest of the framework and/or job-configuration and are relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debuggability via user-provided scripts
+ ({@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)})
+ for doing post-processing on task logs, the task's stdout, stderr, syslog,
+ etc.
+
+
Here is an example of how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes a significant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
+ Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the output records.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the ith Path]]>
+
+
+
+
+
+
+
+
+
+
+ the ith Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}s that
+ read one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This validates the output specification for the job when the job is
+ submitted. Typically it checks that the output directory does not already
+ exist, throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
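+
+ A minimal sketch of a hash-based partitioner (a hypothetical class,
+ shown for illustration):
+
+ public class MyPartitioner implements Partitioner<Text, Text> {
+   public void configure(JobConf job) {}
+
+   public int getPartition(Text key, Text value, int numPartitions) {
+     // Mask off the sign bit so the result is always non-negative
+     return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
+   }
+ }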
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. Typically all values are combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes a significant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes a significant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
A Map-Reduce job usually splits the input data-set into independent
+chunks which are processed by the map tasks in a completely parallel manner,
+followed by reduce tasks which aggregate their output. Typically both
+the input and the output of the job are stored in a
+{@link org.apache.hadoop.fs.FileSystem}. The framework takes care of monitoring
+tasks and re-executing failed ones. Since, usually, the compute nodes and the
+storage nodes are the same i.e. Hadoop's Map-Reduce framework and Distributed
+FileSystem are running on the same set of nodes, tasks are effectively scheduled
+on the nodes where data is already present, resulting in very high aggregate
+bandwidth across the cluster.
+
+
The Map-Reduce framework operates exclusively on <key, value>
+pairs i.e. the input to the job is viewed as a set of <key, value>
+pairs and the output as another, possibly different, set of
+<key, value> pairs. The keys and values have to
+be serializable as {@link org.apache.hadoop.io.Writable}s and additionally the
+keys have to be {@link org.apache.hadoop.io.WritableComparable}s in
+order to facilitate grouping by the framework.
+
+
Data flow:
+
+ (input)
+ <k1, v1>
+
+ |
+ V
+
+ map
+
+ |
+ V
+
+ <k2, v2>
+
+ |
+ V
+
+ combine
+
+ |
+ V
+
+ <k2, v2>
+
+ |
+ V
+
+ reduce
+
+ |
+ V
+
+ <k3, v3>
+ (output)
+
+
+
Applications typically implement
+{@link org.apache.hadoop.mapred.Mapper#map(Object, Object, OutputCollector, Reporter)}
+and
+{@link org.apache.hadoop.mapred.Reducer#reduce(Object, Iterator, OutputCollector, Reporter)}
+methods. The application-writer also specifies various facets of the job such
+as input and output locations, the Partitioner, InputFormat
+& OutputFormat implementations to be used etc. as
+a {@link org.apache.hadoop.mapred.JobConf}. The client program,
+{@link org.apache.hadoop.mapred.JobClient}, then submits the job to the framework
+and optionally monitors it.
+
+
The framework spawns one map task per
+{@link org.apache.hadoop.mapred.InputSplit} generated by the
+{@link org.apache.hadoop.mapred.InputFormat} of the job and calls
+{@link org.apache.hadoop.mapred.Mapper#map(Object, Object, OutputCollector, Reporter)}
+with each <key, value> pair read by the
+{@link org.apache.hadoop.mapred.RecordReader} from the InputSplit for
+the task. The intermediate outputs of the maps are then grouped by keys
+and optionally aggregated by the combiner. The key space of the intermediate
+outputs is partitioned by the {@link org.apache.hadoop.mapred.Partitioner}, where
+the number of partitions is exactly the number of reduce tasks for the job.
+
+
The reduce tasks fetch the sorted intermediate outputs of the maps, via http,
+merge the <key, value> pairs and call
+{@link org.apache.hadoop.mapred.Reducer#reduce(Object, Iterator, OutputCollector, Reporter)}
+for each <key, list of values> pair. The output of the reduce tasks is
+stored on the FileSystem by the
+{@link org.apache.hadoop.mapred.RecordWriter} provided by the
+{@link org.apache.hadoop.mapred.OutputFormat} of the job.
+
+
Map-Reduce application to perform a distributed grep:
+
+public class Grep extends Configured implements Tool {
+
+ // map: Search for the pattern specified by 'grep.mapper.regex' &
+ // 'grep.mapper.regex.group'
+
+ class GrepMapper<K>
+ extends MapReduceBase implements Mapper<K, Text, Text, LongWritable> {
+
+ private Pattern pattern;
+ private int group;
+
+ public void configure(JobConf job) {
+ pattern = Pattern.compile(job.get("grep.mapper.regex"));
+ group = job.getInt("grep.mapper.regex.group", 0);
+ }
+
+ public void map(K key, Text value,
+ OutputCollector<Text, LongWritable> output,
+ Reporter reporter)
+ throws IOException {
+ String text = value.toString();
+ Matcher matcher = pattern.matcher(text);
+ while (matcher.find()) {
+ output.collect(new Text(matcher.group(group)), new LongWritable(1));
+ }
+ }
+ }
+
+ // reduce: Count the number of occurrences of the pattern
+
+ class GrepReducer<K> extends MapReduceBase
+ implements Reducer<K, LongWritable, K, LongWritable> {
+
+ public void reduce(K key, Iterator<LongWritable> values,
+ OutputCollector<K, LongWritable> output,
+ Reporter reporter)
+ throws IOException {
+
+ // sum all values for this key
+ long sum = 0;
+ while (values.hasNext()) {
+ sum += values.next().get();
+ }
+
+ // output sum
+ output.collect(key, new LongWritable(sum));
+ }
+ }
+
+ public int run(String[] args) throws Exception {
+ if (args.length < 3) {
+ System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
+ ToolRunner.printGenericCommandUsage(System.out);
+ return -1;
+ }
+
+ JobConf grepJob = new JobConf(getConf(), Grep.class);
+
+ grepJob.setJobName("grep");
+
+ grepJob.setInputPath(new Path(args[0]));
+ grepJob.setOutputPath(new Path(args[1]));
+
+ grepJob.setMapperClass(GrepMapper.class);
+ grepJob.setCombinerClass(GrepReducer.class);
+ grepJob.setReducerClass(GrepReducer.class);
+
+ grepJob.set("mapred.mapper.regex", args[2]);
+ if (args.length == 4)
+ grepJob.set("mapred.mapper.regex.group", args[3]);
+
+ grepJob.setOutputFormat(SequenceFileOutputFormat.class);
+ grepJob.setOutputKeyClass(Text.class);
+ grepJob.setOutputValueClass(LongWritable.class);
+
+ JobClient.runJob(grepJob);
+
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ int res = ToolRunner.run(new Configuration(), new Grep(), args);
+ System.exit(res);
+ }
+
+}
+
+
+
Notice how the data-flow of the above grep job is very similar to doing the
+same via the unix pipeline:
+
+
+cat input/* | grep | sort | uniq -c > out
+
+
+
+ input | map | shuffle | reduce > out
+
+
+
Hadoop Map-Reduce applications need not be written in
+JavaTM only.
+Hadoop Streaming is a utility
+which allows users to create and run jobs with any executables (e.g. shell
+utilities) as the mapper and/or the reducer.
+Hadoop Pipes is a
+SWIG-compatible C++ API to implement
+Map-Reduce applications (non JNITM based).
Operations included in this patch are partitioned into one of two types:
+join operations emitting tuples and "multi-filter" operations emitting a
+single value from (but not necessarily included in) a set of input values.
+For a given key, each operation will consider the cross product of all
+values for all sources at that node.
+
+
Identifiers supported by default:
+
+
+
identifier   type         description
+
inner        Join         Full inner join
+
outer        Join         Full outer join
+
override     MultiFilter  For a given key, prefer values from the rightmost source
+
+
+
A user of this class must set the InputFormat for the job to
+CompositeInputFormat and define a join expression accepted by the
+preceding grammar. For example, both of the following are acceptable:
CompositeInputFormat includes a handful of convenience methods to
+aid construction of these verbose statements.
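+
+A sketch using those helpers (paths illustrative; assumes the
+compose(String, Class, Path...) overload):
+
+ job.setInputFormat(CompositeInputFormat.class);
+ job.set("mapred.join.expr", CompositeInputFormat.compose(
+     "inner", SequenceFileInputFormat.class,
+     new Path("/data/left"), new Path("/data/right")));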
+
+
As in the second example, joins may be nested. Users may provide a
+comparator class in the mapred.join.keycomparator property to specify
+the ordering of their keys, or accept the default comparator as returned by
+WritableComparator.get(keyclass).
+
+
Users can specify their own join operations, typically by overriding
+JoinRecordReader or MultiFilterRecordReader and mapping that
+class to an identifier in the join expression using the
+mapred.join.define.ident property, where ident is
+the identifier appearing in the join expression. Users may elect to emit- or
+modify- values passing through their join operation. Consulting the existing
+operations for guidance is recommended. Adding arguments is considerably more
+complex (and only partially supported), as one must also add a Node
+type to the parse tree. One is probably better off extending
+RecordReader in most cases.
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of threads the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property; the default
+ value is 10 threads.
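+
+ A configuration sketch:
+
+ job.setMapRunnerClass(MultithreadedMapRunner.class);
+ // Use 20 threads instead of the default 10
+ job.setInt("mapred.map.multithreadedrunner.threads", 20);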
+
+
+Generally speaking, in order to implement an application using the Map/Reduce
+model, the developer needs to implement Map and Reduce functions (and possibly
+a Combine function). However, for a lot of applications related to counting and
+statistics computing, these functions have very similar
+characteristics. This package implements
+those patterns. In particular, the package provides a generic mapper class,
+a reducer class and a combiner class, and a set of built-in value aggregators.
+It also provides a generic utility class, ValueAggregatorJob, that offers a static function that
+creates map/reduce jobs:
+
+To call this function, the user needs to pass in arguments specifying the input directories, the output directory,
+the number of reducers, the input data format (textinputformat or sequencefileinputformat), and a file specifying user plugin class(es) to load by the mapper.
+A user plugin class is responsible for specifying what
+aggregators to use and what values are for which aggregators.
+A plugin class must implement the following interface:
+
+
+ public interface ValueAggregatorDescriptor {
+ public ArrayList<Entry> generateKeyValPairs(Object key, Object value);
+ public void configure(JobConf job);
+}
+
+
+Function generateKeyValPairs will generate aggregation key/value pairs for the
+input key/value pair. Each aggregation key encodes two pieces of information: the aggregation type and aggregation ID.
+The value is the value to be aggregated onto the aggregation ID according to the aggregation type. Here
+is a simple example user plugin class for counting the words in the input texts:
+
+
+public class WordCountAggregatorDescriptor extends ValueAggregatorBaseDescriptor {
+ public ArrayList<Entry> generateKeyValPairs(Object key, Object val) {
+ String words [] = val.toString().split(" |\t");
+ ArrayList<Entry> retv = new ArrayList<Entry>();
+ for (int i = 0; i < words.length; i++) {
+ retv.add(generateEntry(LONG_VALUE_SUM, words[i], ONE));
+ }
+ return retv;
+ }
+ public void configure(JobConf job) {}
+}
+
+
+In the above code, LONG_VALUE_SUM is a string denoting the aggregation type LongValueSum, which sums over long values.
+ONE denotes the string "1". The call generateEntry(LONG_VALUE_SUM, words[i], ONE) will interpret the first argument as an
+aggregation type, the second as an aggregation ID, and the third argument as the value to be aggregated. The output key will
+look like "LongValueSum:xxxx", where xxxx is the string value of words[i], and the value will be "1". The mapper calls
+generateKeyValPairs(Object key, Object val) for each input key/value pair to generate the desired aggregation id/value pairs.
+The downstream combiner/reducer will interpret these pairs as adding one to the aggregator xxxx.
+
+Class ValueAggregatorBaseDescriptor is a base class that user plugin classes can extend. Here is the XML fragment specifying the user plugin class:
+
+Thus, if no user plugin class is specified, the default behavior of the map/reduce job is to count the number of records (lines) in the input files.
+
+During runtime, the mapper will invoke the generateKeyValPairs function for each input key/value pair, and emit the generated
+key/value pairs:
+
+The reducer will create an aggregator object for each key/value list pair, and perform the appropriate aggregation.
+At the end, it will emit the aggregator's results:
+
+In order to be able to use a combiner, all the aggregators must be associative and commutative.
+The following types are supported:
+
LongValueSum: sum over long values
+
DoubleValueSum: sum over float/double values
+
uniqValueCount: count the number of distinct values
+
ValueHistogram: compute the histogram of values and the minimum, maximum, median, average, and standard deviation of numeric values
+
+
+
Create and run an application
+
+To create an application, the user needs to do the following things:
+
+1. Implement a user plugin:
+
+
+The application programs link against a thin C++ wrapper library that
+handles the communication with the rest of the Hadoop system. The C++
+interface is "swigable" so that interfaces can be generated for python
+and other scripting languages. All of the C++ functions and classes
+are in the HadoopPipes namespace. The job may consist of any
+combination of Java and C++ RecordReaders, Mappers, Partitioners,
+Combiners, Reducers, and RecordWriters.
+
+
+
+Hadoop Pipes has a generic Java class for handling the mapper and
+reducer (PipesMapRunner and PipesReducer). They fork off the
+application program and communicate with it over a socket. The
+communication is handled by the C++ wrapper library and the
+PipesMapRunner and PipesReducer.
+
+
+
+The application program passes in a factory object that can create
+the various objects needed by the framework to the runTask
+function. The framework creates the Mapper or Reducer as
+appropriate and calls the map or reduce method to invoke the
+application's code. The JobConf is available to the application.
+
+
+
+The Mapper and Reducer objects get all of their inputs, outputs, and
+context via context objects. The advantage of using the context
+objects is that their interface can be extended with additional
+methods without breaking clients. Although this interface is different
+from the current Java interface, the plan is to migrate the Java
+interface in this direction.
+
+
+
+Although the Java implementation is typed, the C++ interfaces of keys
+and values is just a byte buffer. Since STL strings provide precisely
+the right functionality and are standard, they will be used. The
+decision to not use stronger types was to simplify the interface.
+
+
+
+The application can also define combiner functions. The combiner will
+be run locally by the framework in the application process to avoid
+the round trip to the Java process and back. Because the compare
+function is not available in C++, the combiner will use memcmp to
+sort the inputs to the combiner. This is not as general as the Java
+equivalent, which uses the user's comparator, but should cover the
+majority of the use cases. As the map function outputs key/value
+pairs, they will be buffered. When the buffer is full, it will be
+sorted and passed to the combiner. The output of the combiner will be
+sent to the Java process.
+
+
+
+The application can also set a partition function to control which key
+is given to a particular reduce. If a partition function is not
+defined, the Java one will be used. The partition function will be
+called by the C++ framework before the key/value pair is sent back to
+Java.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+The API is abstract so that it can be implemented on top of
+a variety of metrics client libraries. The choice of
+client library is a configuration option, and different
+modules within the same application can use
+different metrics implementation libraries.
+
+Sub-packages:
+
+
org.apache.hadoop.metrics.spi
+
The abstract Server Provider Interface package. Those wishing to
+ integrate the metrics API with a particular metrics client library should
+ extend this package.
+
+
org.apache.hadoop.metrics.file
+
An implementation package which writes the metric data to
+ a file, or sends it to the standard output stream.
+
+
org.apache.hadoop.metrics.ganglia
+
An implementation package which sends metric data to
+ Ganglia.
+
+
+
Introduction to the Metrics API
+
+Here is a simple example of how to use this package to report a single
+metric value:
+
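+
+(A minimal sketch; the names below match the definitions that follow.)
+
+ MetricsContext context = MetricsUtil.getContext("myContext");
+ MetricsRecord record = MetricsUtil.createRecord(context, "myRecord");
+ record.setMetric("myMetric", 42);
+ record.update();
+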
myContext
+
The context name will typically identify either the application, or else a
+ module within an application or library.
+
+
myRecord
+
The record name generally identifies some entity for which a set of
+ metrics are to be reported. For example, you could have a record named
+ "cacheStats" for reporting a number of statistics relating to the usage of
+ some cache in your application.
+
+
myMetric
+
This identifies a particular metric. For example, you might have metrics
+ named "cache_hits" and "cache_misses".
+
+
+
+
Tags
+
+In some cases it is useful to have multiple records with the same name. For
+example, suppose that you want to report statistics about each disk on a computer.
+In this case, the record name would be something like "diskStats", but you also
+need to identify the disk, which is done by adding a tag to the record.
+The code could look something like this:
+
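+(A sketch; the disk name and metric values are illustrative.)
+
+ MetricsRecord diskStats = MetricsUtil.createRecord(context, "diskStats");
+ diskStats.setTag("diskName", "hda");
+ diskStats.setMetric("diskPercentFull", 72);
+ diskStats.update();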
+
+Data is not sent immediately to the metrics system when
+MetricsRecord.update() is called. Instead it is stored in an
+internal table, and the contents of the table are sent periodically.
+This can be important for two reasons:
+
+
It means that a programmer is free to put calls to this API in an
+ inner loop, since updates can be very frequent without slowing down
+ the application significantly.
+
Some implementations can gain efficiency by combining many metrics
+ into a single UDP message.
+
+
+The API provides a timer-based callback via the
+registerUpdater() method. The benefit of this
+versus using java.util.Timer is that the callbacks will be done
+immediately before sending the data, making the data as current as possible.
+
+
Configuration
+
+It is possible to programmatically examine and modify configuration data
+before creating a context, like this:
+
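+(A sketch; the attribute names and values are illustrative. Note that
+ContextFactory.getFactory() may throw checked exceptions.)
+
+ ContextFactory factory = ContextFactory.getFactory();
+ factory.setAttribute("myContext.class",
+     "org.apache.hadoop.metrics.file.FileContext");
+ factory.removeAttribute("myContext.period");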
+The factory attributes can be examined and modified using the following
+ContextFactory methods:
+
+
Object getAttribute(String attributeName)
+
String[] getAttributeNames()
+
void setAttribute(String name, Object value)
+
void removeAttribute(String attributeName)
+
+
+
+ContextFactory.getFactory() initializes the factory attributes by
+reading the properties file hadoop-metrics.properties if it exists
+on the class path.
+
+
+A factory attribute named:
+
+contextName.class
+
+should have as its value the fully qualified name of the class to be
+instantiated by a call of the ContextFactory method
+getContext(contextName). If this factory attribute is not
+specified, the default is to instantiate
+org.apache.hadoop.metrics.file.FileContext.
+
+
+Other factory attributes are specific to a particular implementation of this
+API and are documented elsewhere. For example, configuration attributes for
+the file and Ganglia implementations can be found in the javadoc for
+their respective packages.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
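+ (Illustrative values, for a context named myContextName:)
+
+ myContextName.fileName=/tmp/metrics.log
+ myContextName.period=5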
]]>
+
+
+
+
+
+These are the implementation specific factory attributes
+(See ContextFactory.getFactory()):
+
+
+
contextName.fileName
+
The path of the file to which metrics in context contextName
+ are to be appended. If this attribute is not specified, the metrics
+ are written to standard output by default.
+
+
contextName.period
+
The period in seconds on which the metric data is written to the
+ file.
+
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Implementation of the metrics package that sends metric data to
+Ganglia.
+Programmers should not normally need to use this package directly. Instead
+they should use org.apache.hadoop.metrics.
+
+
+These are the implementation specific factory attributes
+(See ContextFactory.getFactory()):
+
+
+
contextName.servers
+
Space and/or comma separated sequence of servers to which UDP
+ messages should be sent.
+
+
contextName.period
+
The period in seconds on which the metric data is sent to the
+ server(s).
+
+
contextName.units.recordName.metricName
+
The units for the specified metric in the specified record.
+
+
contextName.slope.recordName.metricName
+
The slope for the specified metric in the specified record.
+
+
contextName.tmax.recordName.metricName
+
The tmax for the specified metric in the specified record.
+
+
contextName.dmax.recordName.metricName
+
The dmax for the specified metric in the specified record.
+
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+ org.apache.hadoop.metrics.file and
+org.apache.hadoop.metrics.ganglia.
+
+Plugging in an implementation involves writing a concrete subclass of
+AbstractMetricsContext. The subclass should get its
+ configuration information using the getAttribute(attributeName)
+ method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name="
+ Where the and are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
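+
+ A lookup sketch (the protocol class is a hypothetical placeholder;
+ exception handling omitted):
+
+ SocketFactory factory = NetUtils.getSocketFactory(conf, MyProtocol.class);
+ Socket socket = factory.createSocket();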
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link NetUtils}
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link NetUtils}
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
+
+ Any socket created using socket factories returned by {@link NetUtils}
+ must use this interface instead of {@link Socket#getOutputStream()}.
+
+ @see #getOutputStream(Socket, long)
+
+ @param socket
+ @return OutputStream for writing to the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link NetUtils}
+ must use this interface instead of {@link Socket#getOutputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return OutputStream for writing to the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ node
+
+ @param node
+ a node
+ @return true if node is already in the tree; false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ scope
+ if scope starts with ~, choose one from all the nodes except for the
+ ones in scope; otherwise, choose one from scope
+ @param scope range of nodes from which a node will be chosen
+ @return the chosen node]]>
+
+
+
+
+
+
+ scope but not in excludedNodes
+ if scope starts with ~, return the number of nodes that are not
+ in scope and excludedNodes;
+ @param scope a path string that may start with ~
+ @param excludedNodes a list of nodes
+ @return number of available nodes]]>
+
+
+
+
+
+
+
+
+
+
+
+ reader
+ It linearly scans the array; if a local node is found, it is swapped with
+ the first element of the array.
+ If a local-rack node is found, it is swapped with the first element following
+ the local node.
+ If neither a local node nor a local-rack node is found, a random replica
+ location is put at position 0.
+ The rest of the nodes are left untouched.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a new input stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+
+ @see SocketInputStream#SocketInputStream(ReadableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @param timeout timeout in milliseconds. Must not be negative.
+ @throws IOException]]>
+
+
+
+
+
+
+
+ Create a new input stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+ @see SocketInputStream#SocketInputStream(ReadableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a new output stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+
+ @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @param timeout timeout in milliseconds. Must not be negative.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The data is preserved if newCapacity >= getCount().
+ @param newCapacity The new capacity in bytes.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Index idx = startVector(...);
+ while (!idx.done()) {
+ .... // read element of a vector
+ idx.incr();
+ }
+ ]]>
+
+ Introduction
+
+ Software systems of any significant complexity require mechanisms for data
+interchange with the outside world. These interchanges typically involve the
+marshaling and unmarshaling of logical units of data to and from data streams
+(files, network connections, memory buffers etc.). Applications usually have
+some code for serializing and deserializing the data types that they manipulate
+embedded in them. The work of serialization has several features that make
+automatic code generation for it worthwhile. Given a particular output encoding
+(binary, XML, etc.), serialization of primitive types and simple compositions
+of primitives (structs, vectors etc.) is a very mechanical task. Manually
+written serialization code can be susceptible to bugs especially when records
+have a large number of fields or a record definition changes between software
+versions. Lastly, it can be very useful for applications written in different
+programming languages to be able to share and interchange data. This can be
+made a lot easier by describing the data records manipulated by these
+applications in a language agnostic manner and using the descriptions to derive
+implementations of serialization in multiple target languages.
+
+This document describes Hadoop Record I/O, a mechanism that is aimed
+at
+
+
enabling the specification of simple serializable data types (records)
+
enabling the generation of code in multiple target languages for
+marshaling and unmarshaling such types
+
providing target language specific support that will enable application
+programmers to incorporate generated code into their applications
+
+
+The goals of Hadoop Record I/O are similar to those of mechanisms such as XDR,
+ASN.1, PADS and ICE. While these systems all include a DDL that enables
+the specification of most record types, they differ widely in what else they
+focus on. The focus in Hadoop Record I/O is on data marshaling and
+multi-lingual support. We take a translator-based approach to serialization.
+Hadoop users have to describe their data in a simple data description
+language. The Hadoop DDL translator rcc generates code that users
+can invoke in order to read/write their data from/to simple stream
+abstractions. Next we list explicitly some of the goals and non-goals of
+Hadoop Record I/O.
+
+
+
Goals
+
+
+
Support for commonly used primitive types. Hadoop should include as
+primitives commonly used builtin types from programming languages we intend to
+support.
+
+
Support for common data compositions (including recursive compositions).
+Hadoop should support widely used composite types such as structs and
+vectors.
+
+
Code generation in multiple target languages. Hadoop should be capable of
+generating serialization code in multiple target languages and should be
+easily extensible to new target languages. The initial target languages are
+C++ and Java.
+
+
Support for generated target languages. Hadoop should include support
+in the form of headers, libraries, packages for supported target languages
+that enable easy inclusion and use of generated code in applications.
+
+
Support for multiple output encodings. Candidates include
+packed binary, comma-separated text, XML etc.
+
+
Support for specifying record types in a backwards/forwards compatible
+manner. This will probably be in the form of support for optional fields in
+records. This version of the document does not include a description of the
+planned mechanism; we intend to include it in the next iteration.
+
+
+
+
Non-Goals
+
+
+
Serializing existing arbitrary C++ classes.
+
Serializing complex data structures such as trees, linked lists etc.
+
Built-in indexing schemes, compression, or check-sums.
+
Dynamic construction of objects from an XML schema.
+
+
+The remainder of this document describes the features of Hadoop record I/O
+in more detail. Section 2 describes the data types supported by the system.
+Section 3 lays out the DDL syntax with some examples of simple records.
+Section 4 describes the process of code generation with rcc. Section 5
+describes target language mappings and support for Hadoop types. We include a
+fairly complete description of C++ mappings with intent to include Java and
+others in upcoming iterations of this document. The last section talks about
+supported output encodings.
+
+
+
Data Types and Streams
+
+This section describes the primitive and composite types supported by Hadoop.
+We aim to support a set of types that can be used to simply and efficiently
+express a wide range of record types in different programming languages.
+
+
Primitive Types
+
+For the most part, the primitive types of Hadoop map directly to primitive
+types in high level programming languages. Special cases are the
+ustring (a Unicode string) and buffer types, which we believe
+find wide use and which are usually implemented in library code and not
+available as language built-ins. Hadoop also supplies these via library code
+when a target language built-in is not present and there is no widely
+adopted "standard" implementation. The complete list of primitive types is:
+
+
+
byte: An 8-bit unsigned integer.
+
boolean: A boolean value.
+
int: A 32-bit signed integer.
+
long: A 64-bit signed integer.
+
float: A single precision floating point number as described by
+ IEEE-754.
+
double: A double precision floating point number as described by
+ IEEE-754.
+
ustring: A string consisting of Unicode characters.
+
buffer: An arbitrary sequence of bytes.
+
+
+
+
Composite Types
+Hadoop supports a small set of composite types that enable the description
+of simple aggregate types and containers. A composite type is serialized
+by sequentially serializing its constituent elements. The supported
+composite types are:
+
+
+
+
record: An aggregate type like a C-struct. This is a list of
+typed fields that are together considered a single unit of data. A record
+is serialized by sequentially serializing its constituent fields. In addition
+to serialization a record has comparison operations (equality and less-than)
+implemented for it; these are defined as memberwise comparisons.
+
+
vector: A sequence of entries of the same data type, primitive
+or composite.
+
+
map: An associative container mapping instances of a key type to
+instances of a value type. The key and value types may themselves be primitive
+or composite types.
+
+
+
+
Streams
+
+Hadoop generates code for serializing and deserializing record types to
+abstract streams. For each target language Hadoop defines very simple input
+and output stream interfaces. Application writers can usually develop
+concrete implementations of these by putting a one-method wrapper around
+an existing stream implementation.
+
+
+
DDL Syntax and Examples
+
+We now describe the syntax of the Hadoop data description language. This is
+followed by a few examples of DDL usage.
+
+
+
+A DDL file describes one or more record types. It begins with zero or
+more include declarations, a single mandatory module declaration
+followed by zero or more class declarations. The semantics of each of
+these declarations are described below:
+
+
+
+
include: An include declaration specifies a DDL file to be
+referenced when generating code for types in the current DDL file. Record types
+in the current compilation unit may refer to types in all included files.
+File inclusion is recursive. An include does not trigger code
+generation for the referenced file.
+
+
module: Every Hadoop DDL file must have a single module
+declaration that follows the list of includes and precedes all record
+declarations. A module declaration identifies a scope within which
+the names of all types in the current file are visible. Module names are
+mapped to C++ namespaces, Java packages etc. in generated code.
+
+
class: Record types are specified through class
+declarations. A class declaration is like a Java class declaration.
+It specifies a named record type and a list of fields that constitute records
+of the type. Usage is illustrated in the following examples.
+
+
+
+
Examples
+
+
+
A simple DDL file links.jr with just one record declaration.
+
+module links {
+ class Link {
+ ustring URL;
+ boolean isRelative;
+ ustring anchorText;
+ };
+}
+
+
+The Hadoop translator is written in Java. Invocation is done by executing a
+wrapper shell script named rcc. It takes a list of
+record description files as a mandatory argument and an
+optional language argument (--language or -l; the default
+is Java). Thus a typical invocation would look like:
+
+$ rcc -l C++ ...
+
+
+
+
Target Language Mappings and Support
+
+For all target languages, the unit of code generation is a record type.
+For each record type, Hadoop generates code for serialization and
+deserialization, record comparison and access to record members.
+
+
C++
+
+Support for including Hadoop generated C++ code in applications comes in the
+form of a header file recordio.hh which needs to be included in source
+that uses Hadoop types and a library librecordio.a which applications need
+to be linked with. The header declares the Hadoop C++ namespace which defines
+appropriate types for the various primitives, the basic interfaces for
+records and streams and enumerates the supported serialization encodings.
+Declarations of these interfaces and a description of their semantics follow:
+
+
RecFormat: An enumeration of the serialization encodings supported
+by this implementation of Hadoop.
+
+
InStream: A simple abstraction for an input stream. This has a
+single public read method that reads n bytes from the stream into
+the buffer buf. Has the same semantics as a blocking read system
+call. Returns the number of bytes read or -1 if an error occurs.
+
+
OutStream: A simple abstraction for an output stream. This has a
+single write method that writes n bytes to the stream from the
+buffer buf. Has the same semantics as a blocking write system
+call. Returns the number of bytes written or -1 if an error occurs.
+
+
RecordReader: A RecordReader reads records one at a time from
+an underlying stream in a specified record format. The reader is instantiated
+with a stream and a serialization format. It has a read method that
+takes an instance of a record and deserializes the record from the stream.
+
+
RecordWriter: A RecordWriter writes records one at a
+time to an underlying stream in a specified record format. The writer is
+instantiated with a stream and a serialization format. It has a
+write method that takes an instance of a record and serializes the
+record to the stream.
+
+
Record: The base class for all generated record types. This has two
+public methods type and signature that return the typename and the
+type signature of the record.
+
+
+
+Two files are generated for each record file (note: not for each record). If a
+record file is named "name.jr", the generated files are
+"name.jr.cc" and "name.jr.hh" containing serialization
+implementations and record type declarations respectively.
+
+For each record in the DDL file, the generated header file will contain a
+class definition corresponding to the record type, method definitions for the
+generated type will be present in the '.cc' file. The generated class will
+inherit from the abstract class hadoop::Record. The DDL file's
+module declaration determines the namespace the record belongs to.
+Each '.' delimited token in the module declaration results in the
+creation of a namespace. For instance, the declaration module docs.links
+results in the creation of a docs namespace and a nested
+docs::links namespace. In the preceding examples, the Link class
+is placed in the links namespace. The header file corresponding to
+the links.jr file will contain:
+
+
+namespace links {
+ class Link : public hadoop::Record {
+ // ....
+ };
+};
+
+
+Each field within the record will cause the generation of a private member
+declaration of the appropriate type in the class declaration, and one or more
+accessor methods. The generated class will implement the serialize and
+deserialize methods defined in hadoop::Record. It will also
+implement the inspection methods type and signature from
+hadoop::Record. A default constructor and virtual destructor will also
+be generated. Serialization code will read/write records into streams that
+implement the hadoop::InStream and the hadoop::OutStream interfaces.
+
+For each member of a record an accessor method is generated that returns
+either the member or a reference to the member. For members that are returned
+by value, a setter method is also generated. This is true for primitive
+data members of the types byte, int, long, boolean, float and
+double. For example, for an int field called MyField, a getter
+returning the value and a setter taking an int are generated.
+
+
+
+For a ustring, buffer, or composite field, the generated code
+contains only accessors that return a reference to the field; both a
+const and a non-const accessor are generated.
+
+
+
+Code generation for Java is similar to that for C++. A Java class is generated
+for each record type with private members corresponding to the fields. Getters
+and setters for fields are also generated. Some differences arise in the
+way comparison is expressed and in the mapping of modules to packages and
+classes to files. For equality testing, an equals method is generated
+for each record type. As per Java requirements a hashCode method is also
+generated. For comparison a compareTo method is generated for each
+record type. This has the semantics defined by the Java Comparable
+interface, that is, the method returns a negative integer, zero, or a positive
+integer as the invoked object is less than, equal to, or greater than the
+comparison parameter.
+
+A .java file is generated per record type as opposed to per DDL
+file as in C++. The module declaration translates to a Java
+package declaration. The module name maps to an identical Java package
+name. In addition to this mapping, the DDL compiler creates the appropriate
+directory hierarchy for the package and places the generated .java
+files in the correct directories.
+
+
Mapping Summary
+
+
+DDL Type      C++ Type        Java Type
+
+boolean       bool            boolean
+byte          int8_t          byte
+int           int32_t         int
+long          int64_t         long
+float         float           float
+double        double          double
+ustring       std::string     java.lang.String
+buffer        std::string     org.apache.hadoop.record.Buffer
+class type    class type      class type
+vector        std::vector     java.util.ArrayList
+map           std::map        java.util.TreeMap
+
+
+
Data encodings
+
+This section describes the format of the data encodings supported by Hadoop.
+Currently, three data encodings are supported, namely binary, CSV and XML.
+
+
Binary Serialization Format
+
+The binary data encoding format is fairly dense. Serialization of composite
+types is simply defined as a concatenation of serializations of the constituent
+elements (lengths are included in vectors and maps).
+
+Composite types are serialized as follows:
+
+
class: Sequence of serialized members.
+
vector: The number of elements serialized as an int. Followed by a
+sequence of serialized elements.
+
map: The number of key value pairs serialized as an int. Followed
+by a sequence of serialized (key,value) pairs.
+
+
+Serialization of primitives is more interesting, with a zero compression
+optimization for integral types and normalization to UTF-8 for strings.
+Primitive types are serialized as follows:
+
+
+
byte: Represented by 1 byte, as is.
+
boolean: Represented by 1-byte (0 or 1)
+
int/long: Integers and longs are serialized zero compressed.
+Represented as 1-byte if -120 <= value < 128. Otherwise, serialized as a
+sequence of 2-5 bytes for ints, 2-9 bytes for longs. The first byte represents
+the number of trailing bytes, N, as the negative number (-120-N). For example,
+the number 1024 (0x400) is represented by the byte sequence 'x86 x04 x00'.
+This doesn't help much for 4-byte integers but does a reasonably good job
+with longs without bit twiddling; a sketch of the encoding follows this list.
+
float/double: Serialized in IEEE 754 single and double precision
+format in network byte order. This is the format used by Java.
+
ustring: Serialized as 4-byte zero compressed length followed by
+data encoded as UTF-8. Strings are normalized to UTF-8 regardless of native
+language representation.
+
buffer: Serialized as a 4-byte zero compressed length followed by the
+raw bytes in the buffer.
+
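+
+The sketch below illustrates the zero-compressed integer encoding for
+non-negative values (how the scheme distinguishes large negative values
+is not spelled out above, so that case is left out). It illustrates the
+described format and is not Hadoop's own implementation.
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+public final class ZeroCompressed {
+  // Writes a non-negative int using the scheme described above.
+  static void writeInt(DataOutputStream out, int value) throws IOException {
+    if (value < 0) {
+      throw new IllegalArgumentException("sketch covers non-negative only");
+    }
+    if (value < 128) {
+      out.writeByte(value);            // small values fit in a single byte
+      return;
+    }
+    int n = 4;
+    while (n > 1 && (value >>> ((n - 1) * 8)) == 0) {
+      n--;                             // drop leading zero bytes
+    }
+    out.writeByte(-120 - n);           // first byte encodes N trailing bytes
+    for (int i = n - 1; i >= 0; i--) {
+      out.writeByte((value >>> (i * 8)) & 0xFF);
+    }
+  }
+
+  public static void main(String[] args) throws IOException {
+    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+    writeInt(new DataOutputStream(bytes), 1024);
+    for (byte b : bytes.toByteArray()) {
+      System.out.printf("x%02x ", b & 0xFF);  // prints: x86 x04 x00
+    }
+  }
+}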
+
+
+
CSV Serialization Format
+
+The CSV serialization format has a lot more structure than the "standard"
+Excel CSV format, but we believe the additional structure is useful because
+
+
+
it makes parsing a lot easier without detracting too much from legibility
+
the delimiters around composites make it obvious when one is reading a
+sequence of Hadoop records
+
+
+Serialization formats for the various types are detailed in the grammar that
+follows. The notable feature of the formats is the use of delimiters for
+indicating certain field types.
+
+
+
A string field begins with a single quote (').
+
A buffer field begins with a sharp (#).
+
A class, vector or map begins with 's{', 'v{' or 'm{' respectively and
+ends with '}'.
+
+
+The CSV format can be described by the following grammar:
+
+
+XML Serialization Format
+
+The XML serialization format is the same as that used by Apache XML-RPC
+(http://ws.apache.org/xmlrpc/types.html). This is an extension of the original
+XML-RPC format and adds some additional data types. Not all record I/O types
+are directly expressible in this format, and access to a DDL is required in
+order to convert these to valid types. All types, primitive or composite, are
+represented by <value> elements. The particular XML-RPC type is
+indicated by a nested element in the <value> element. The encoding for
+records is always UTF-8. Primitive types are serialized as follows:
+
+
+
byte: XML tag <ex:i1>. Values: 1-byte unsigned
+integers represented in US-ASCII
+
boolean: XML tag <boolean>. Values: "0" or "1"
+
int: XML tags <i4> or <int>. Values: 4-byte
+signed integers represented in US-ASCII.
+
long: XML tag <ex:i8>. Values: 8-byte signed integers
+represented in US-ASCII.
+
float: XML tag <ex:float>. Values: Single precision
+floating point numbers represented in US-ASCII.
+
double: XML tag <double>. Values: Double precision
+floating point numbers represented in US-ASCII.
+
ustring: XML tag <string>. Values: String values
+represented as UTF-8. XML does not permit all Unicode characters in literal
+data. In particular, NULLs and control chars are not allowed. Additionally,
+XML processors are required to replace carriage returns with line feeds and to
+replace CRLF sequences with line feeds. Programming languages that we work
+with do not impose these restrictions on string types. To work around these
+restrictions, disallowed characters and CRs are percent escaped in strings.
+The '%' character is also percent escaped.
+
buffer: XML tag <string>. Values: Arbitrary binary
+data. Represented as hexBinary, each byte is replaced by its 2-byte
+hexadecimal representation.
+
+
+Composite types are serialized as follows:
+
+
+
class: XML tag <struct>. A struct is a sequence of
+<member> elements. Each <member> element has a <name>
+element and a <value> element. The <name> is a string that must
+match /[a-zA-Z][a-zA-Z0-9_]*/. The value of the member is represented
+by a <value> element.
+
+
vector: XML tag <array>. An <array> contains a
+single <data> element. The <data> element is a sequence of
+<value> elements each of which represents an element of the vector.
+
+
map: XML tag <array>. Same as vector.
+
+
+
+For example:
+
+
+class {
+ int MY_INT; // value 5
+ vector MY_VEC; // values 0.1, -0.89, 2.45e4
+ buffer MY_BUF; // value '\00\n\tabc%'
+}
+
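+As a rough illustration of the encoding rules above, the MY_INT member of
+this record would serialize along the following lines (the record's full
+serialized form is not recoverable here, so this is a sketch rather than
+the original example):
+
+<struct>
+  <member>
+    <name>MY_INT</name>
+    <value><i4>5</i4></value>
+  </member>
+  ...
+</struct>
+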
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (sets the output
+ language; default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code; default is ".") and failonerror (specifies error handling
+ behavior; default is true).
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+
Generic command line arguments may modify the
+ Configuration objects passed to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+
+ @see Tool
+ @see ToolRunner]]>
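+
+ A short usage sketch based on the constructor and methods described
+ above; what the application does with the remaining arguments is up
+ to it:
+
+ Configuration conf = new Configuration();
+ GenericOptionsParser parser = new GenericOptionsParser(conf, args);
+ String[] remaining = parser.getRemainingArgs(); // application arguments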
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param <T> The type of the argument
+ @param t the object to get it class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take a significant amount of time since,
+ in lieu of the reported progress, the framework has to assume that an error
+ has occurred and time-out the operation.]]>
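+
+ A hedged sketch of reporting progress from a long-running copy loop; it
+ assumes Progressable exposes a single progress() method, as its use
+ throughout Hadoop suggests:
+
+ void copyWithProgress(InputStream in, OutputStream out, Progressable p)
+     throws IOException {
+   byte[] buf = new byte[4096];
+   int n;
+   while ((n = in.read(buf)) > 0) {
+     out.write(buf, 0, n);
+     p.progress(); // tell the framework the operation is still alive
+   }
+ }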
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform;
+ otherwise (e.g. on Cygwin/Windows) it returns null.
+ @param job job configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+ ShellCommandExecutor should be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remain unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
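+
+ A hedged usage sketch; the command-array constructor and the
+ execute()/getOutput() methods are assumptions about this class's API:
+
+ ShellCommandExecutor exec =
+     new ShellCommandExecutor(new String[] {"df", "-k"});
+ exec.execute();                       // runs the command, buffering stdout
+ System.out.println(exec.getOutput());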
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
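+
+ A minimal sketch of the escaping rule described above (not Hadoop's own
+ StringUtils code). It also escapes escapeChar itself so that unescaping
+ is unambiguous, an assumption beyond the one-line description:
+
+ static String escapeString(String str, char escapeChar, char charToEscape) {
+   StringBuilder sb = new StringBuilder();
+   for (char c : str.toCharArray()) {
+     if (c == charToEscape || c == escapeChar) {
+       sb.append(escapeChar);          // prefix the special character
+     }
+     sb.append(c);
+   }
+   return sb.toString();
+ }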
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
diff --git a/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.1.xml b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.1.xml
new file mode 100644
index 0000000000..fd844cbed0
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.1.xml
@@ -0,0 +1,44778 @@
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
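+
+ A short usage sketch of the typed getters and setters described above;
+ the my.app.* property names are made up for illustration:
+
+ Configuration conf = new Configuration();
+ conf.setInt("my.app.retries", 3);
+ int retries = conf.getInt("my.app.retries", 1);        // returns 3
+ long timeout = conf.getLong("my.app.timeout", 60000L); // unset: default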
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then an empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
Hadoop by default specifies two resources, loaded in-order from the
+ classpath:
hadoop-default.xml: Read-only defaults for hadoop.
hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+   <property>
+     <name>dfs.client.buffer.dir</name>
+     <value>/tmp/hadoop/dfs/client</value>
+     <final>true</final>
+   </property>
+
+ Variable Expansion
+
+ Value strings are first processed for variable expansion. For example,
+ one might define properties such as:
+
+   <property>
+     <name>basedir</name>
+     <value>/user/${user.name}</value>
+   </property>
+
+   <property>
+     <name>tempdir</name>
+     <value>${basedir}/tmp</value>
+   </property>
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
+ The balancer is a tool that balances disk space usage on an HDFS cluster
+ when some datanodes become full or when new empty nodes join the cluster.
+ The tool is deployed as an application program that can be run by the
+ cluster administrator on a live HDFS cluster while applications are
+ adding and deleting files.
+
+
SYNOPSIS
+
+ To start:
+ bin/start-balancer.sh [-threshold <threshold>]
+ Example: bin/start-balancer.sh
+ start the balancer with a default threshold of 10%
+ bin/start-balancer.sh -threshold 5
+ start the balancer with a threshold of 5%
+ To stop:
+ bin/stop-balancer.sh
+
+
+
DESCRIPTION
+
The threshold parameter is a fraction in the range of (0%, 100%) with a
+ default value of 10%. The threshold sets a target for whether the cluster
+ is balanced. A cluster is balanced if for each datanode, the utilization
+ of the node (ratio of used space at the node to total capacity of the node)
+ differs from the utilization of the cluster (ratio of used space in the cluster
+ to total capacity of the cluster) by no more than the threshold value.
+ The smaller the threshold, the more balanced a cluster will become.
+ It takes more time to run the balancer for small threshold values.
+ Also for a very small threshold the cluster may not be able to reach the
+ balanced state when applications write and delete files concurrently.
+
+
The tool moves blocks from highly utilized datanodes to poorly
+ utilized datanodes iteratively. In each iteration a datanode moves or
+ receives no more than the lesser of 10G bytes or the threshold fraction
+ of its capacity. Each iteration runs no more than 20 minutes.
+ At the end of each iteration, the balancer obtains updated datanodes
+ information from the namenode.
+
+
A system property that limits the balancer's use of bandwidth is
+ defined in the default configuration file:
+
+
+ <property>
+   <name>dfs.balance.bandwidthPerSec</name>
+   <value>1048576</value>
+   <description>Specifies the maximum bandwidth that each datanode
+   can utilize for the balancing purpose in terms of the number of bytes
+   per second.</description>
+ </property>
+
+
+
This property determines the maximum speed at which a block will be
+ moved from one datanode to another. The default value is 1MB/s. The higher
+ the bandwidth, the faster a cluster can reach the balanced state,
+ but with greater competition with application processes. If an
+ administrator changes the value of this property in the configuration
+ file, the change is observed when HDFS is next restarted.
+
+
+ MONITORING BALANCER PROGRESS
+
After the balancer is started, an output file name where the balancer
+ progress will be recorded is printed on the screen. The administrator
+ can monitor the running of the balancer by reading the output file.
+ The output shows the balancer's status iteration by iteration. In each
+ iteration it prints the starting time, the iteration number, the total
+ number of bytes that have been moved in the previous iterations,
+ the total number of bytes that are left to move in order for the cluster
+ to be balanced, and the number of bytes that are being moved in this
+ iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left
+ To Move" is decreasing.
+
+
Running multiple instances of the balancer in an HDFS cluster is
+ prohibited by the tool.
+
+
The balancer automatically exits when any of the following five
+ conditions is satisfied:
+
+
The cluster is balanced;
+
No block can be moved;
+
No block has been moved for five consecutive iterations;
+
An IOException occurs while communicating with the namenode;
+
Another balancer is running.
+
+
+
Upon exit, a balancer returns an exit code and prints one of the
+ following messages to the output file, corresponding to the above exit
+ reasons:
+
+
The cluster is balanced. Exiting
+
No block can be moved. Exiting...
+
No block has been moved for 3 iterations. Exiting...
+
Received an IO exception: failure reason. Exiting...
+
Another balancer is running. Exiting...
+
+
+
The administrator can interrupt the execution of the balancer at any
+ time by running the command "stop-balancer.sh" on the machine where the
+ balancer is running.]]>
+
files with blocks that are completely missing from all datanodes.
+ In this case the tool can perform one of the following actions:
+
+
none ({@link NamenodeFsck#FIXING_NONE})
+
move corrupted files to /lost+found directory on DFS
+ ({@link NamenodeFsck#FIXING_MOVE}). Remaining data blocks are saved as
+ block chains, representing the longest consecutive series of valid blocks.
{@link FSConstants.StartupOption#REGULAR REGULAR} - normal startup
+
{@link FSConstants.StartupOption#FORMAT FORMAT} - format name node
+
{@link FSConstants.StartupOption#UPGRADE UPGRADE} - start the cluster
+ upgrade and create a snapshot of the current file system state
+
{@link FSConstants.StartupOption#ROLLBACK ROLLBACK} - roll the
+ cluster back to the previous state
+
+ The option is passed via configuration field:
+ dfs.namenode.startup
+
+ The conf will be modified to reflect the actual ports on which
+ the NameNode is up and running if the user passes the port as
+ zero in the conf.
+
+ @param conf configuration
+ @throws IOException]]>
+
+
+
+
+
+ zero.]]>
+
+ datanode whose
+ total size is size
+
+ @param datanode on which blocks are located
+ @param size total size of blocks]]>
+
+ 1) filename->blocksequence (namespace)
+ 2) block->machinelist ("inodes")
+
+ The first table is stored on disk and is very precious.
+ The second table is rebuilt every time the NameNode comes
+ up.
+
+ 'NameNode' refers to both this class as well as the 'NameNode server'.
+ The 'FSNamesystem' class actually performs most of the filesystem
+ management. The majority of the 'NameNode' class itself is concerned
+ with exposing the IPC interface to the outside world, plus some
+ configuration management.
+
+ NameNode implements the ClientProtocol interface, which allows
+ clients to ask for DFS services. ClientProtocol is not
+ designed for direct use by authors of DFS client code. End-users
+ should instead use the org.apache.nutch.hadoop.fs.FileSystem class.
+
+ NameNode also implements the DatanodeProtocol interface, used by
+ DataNode programs that actually store DFS data blocks. These
+ methods are invoked repeatedly and automatically by all the
+ DataNodes in a DFS deployment.
+
+ NameNode also implements the NamenodeProtocol interface, used by
+ secondary namenodes or rebalancing processes to get parts of the
+ namenode's state, for example a partial blocksMap.]]>
+
+ The tool scans all files and directories, starting from an indicated
+ root path. The following abnormal conditions are detected and handled:
+
+
files with blocks that are completely missing from all datanodes.
+ In this case the tool can perform one of the following actions:
+
+
none ({@link #FIXING_NONE})
+
move corrupted files to /lost+found directory on DFS
+ ({@link #FIXING_MOVE}). Remaining data blocks are saved as
+ block chains, representing the longest consecutive series of valid blocks.
+
delete corrupted files ({@link #FIXING_DELETE})
+
+
+
detect files with under-replicated or over-replicated blocks
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link JobConf}. The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+
+ 3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a path pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
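+
+ A hedged usage sketch; the method name is an assumption (0.18-era
+ FileSystem exposed globbing as globPaths):
+
+ FileSystem fs = FileSystem.get(conf);
+ Path[] matches = fs.globPaths(new Path("/logs/2008-*/part-[0-9]*"));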
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is {@link DistributedFileSystem}.]]>
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf destination buffer
+ @param offset offset in buf at which to store data
+ @param len maximum number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the position to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destination buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
+ This method stores bytes from the given array into this
+ stream's buffer before it gets checksummed. The buffer gets checksummed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and the
+ requested length is at least as large as the next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream, thus avoiding an unnecessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
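+
+ For instance, a minimal sketch of round-tripping a Text value through the
+ configuration with the store/load methods above (the key name "example.key"
+ is hypothetical; Text is Writable, so the default io.serializations list
+ covers it):
+
+ Configuration conf = new Configuration();
+ DefaultStringifier.store(conf, new Text("hello"), "example.key");
+ Text restored = DefaultStringifier.load(conf, "example.key", Text.class);
+ // restored now holds "hello"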
+
+ o is a DoubleWritable with the same value.]]>
+
+ o is a FloatWritable with the same value.]]>
+
+ When two sequence files, which have the same Key type but different Value
+ types, are mapped out to reduce, multiple Value types are not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more efficient,
+ because ObjectWritable appends the class declaration as a String
+ to the output file with every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ How to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), which defines
+ the classes which will be wrapped in GenericObject in the application.
+ Attention: the classes defined in the getTypes() method must
+ implement the Writable interface.
+
+
+ @since Nov 8, 2006]]>
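+
+ A minimal sketch of the two steps above (the wrapped types Text and
+ IntWritable are illustrative only):
+
+ public class GenericObject extends GenericWritable {
+
+ private static Class<? extends Writable>[] CLASSES =
+ (Class<? extends Writable>[]) new Class[] {
+ Text.class, IntWritable.class
+ };
+
+ protected Class<? extends Writable>[] getTypes() {
+ return CLASSES;
+ }
+ }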
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+ o is a IntWritable with the same value.]]>
+
+ closes the input and output streams
+ at the end.
+ @param in InputStream to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
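+
+ A one-line sketch of the copy described above, assuming it refers to
+ IOUtils.copyBytes (the file names are illustrative); both streams are
+ closed when the copy completes:
+
+ IOUtils.copyBytes(new FileInputStream("in.bin"),
+ new FileOutputStream("out.bin"), new Configuration());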
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+ o is a LongWritable with the same value.]]>
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
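+
+ A brief sketch of creating and probing a map under these rules (the
+ directory name "example.map" is illustrative; entries must be appended
+ in sorted key order):
+
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ MapFile.Writer writer =
+ new MapFile.Writer(conf, fs, "example.map", Text.class, IntWritable.class);
+ writer.append(new Text("a"), new IntWritable(1));
+ writer.append(new Text("b"), new IntWritable(2));
+ writer.close();
+
+ MapFile.Reader reader = new MapFile.Reader(fs, "example.map", conf);
+ IntWritable value = new IntWritable();
+ reader.get(new Text("b"), value); // value is now 2
+ reader.close();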
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to choose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName - key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
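+
+ A short sketch using the recommended createWriter factory (the path
+ "example.seq" is illustrative):
+
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ Path path = new Path("example.seq");
+
+ SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path,
+ Text.class, IntWritable.class, SequenceFile.CompressionType.BLOCK);
+ writer.append(new Text("key"), new IntWritable(42));
+ writer.close();
+
+ // The Reader handles any of the three formats transparently.
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+ Text key = new Text();
+ IntWritable val = new IntWritable();
+ while (reader.next(key, val)) {
+ System.out.println(key + "\t" + val);
+ }
+ reader.close();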
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than the key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+ the class of the objects to stringify]]>
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation.
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+ what in the backing
+ buffer, starting at position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
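+
+ A short sketch of these byte-oriented accessors (the values shown assume
+ ASCII content, where byte and character positions coincide):
+
+ Text t = new Text("hadoop");
+ int c = t.charAt(1); // Unicode scalar value of 'a'
+ int pos = t.find("doo"); // byte position 2; -1 if absent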
+
+ o is a Text with the same contents.]]>
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stored at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, and calculating the length of an encoded
+ string.]]>
+
+ o is a UTF8 with the same contents.]]>
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+ o is a VIntWritable with the same value.]]>
+
+ o is a VLongWritable with the same value.]]>
+
+ out.
+
+ @param out DataOutput to serialize this object into.
+ @throws IOException]]>
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deserialize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.counter;
+ int thatValue = w.counter;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+ len number of bytes in input stream in
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when fewer than len bytes are skipped]]>
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+ Compressor to be returned to the pool]]>
+
+ Decompressor to be returned to the
+ pool]]>
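+
+ A minimal sketch of borrowing from and returning to the pool (DefaultCodec
+ stands in for any configured CompressionCodec):
+
+ DefaultCodec codec = new DefaultCodec();
+ codec.setConf(new Configuration());
+ Compressor compressor = CodecPool.getCompressor(codec);
+ try {
+ // ... use it, e.g. via codec.createOutputStream(out, compressor) ...
+ } finally {
+ CodecPool.returnCompressor(compressor); // make it available for reuse
+ }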
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+ Create a proxy for an interface of an implementation class
+ using a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
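+
+ A minimal sketch combining the pieces above; the Echo interface and the
+ unreliableEcho instance are hypothetical:
+
+ interface Echo { String echo(String msg) throws IOException; }
+
+ RetryPolicy policy =
+ RetryPolicies.retryUpToMaximumCountWithFixedSleep(4, 1, TimeUnit.SECONDS);
+ Echo reliable = (Echo) RetryProxy.create(Echo.class, unreliableEcho, policy);
+ reliable.echo("hi"); // each call is retried under the policy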
+
+ Prepare the deserializer for reading.]]>
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
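+
+ A small sketch of obtaining a Serializer via this factory; Text is
+ Writable, so the default io.serializations list covers it:
+
+ SerializationFactory factory = new SerializationFactory(new Configuration());
+ Serializer<Text> serializer = factory.getSerializer(Text.class);
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ serializer.open(out);
+ serializer.serialize(new Text("hello"));
+ serializer.close(); // out now holds the serialized bytes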
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+ Prepare the serializer for writing.]]>
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
Group handles localization of the class name and the
+ counter names.
]]>
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
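+
+ For example, a sketch of a subclass that keeps each file whole (built on
+ TextInputFormat purely for illustration):
+
+ public class WholeFileTextInputFormat extends TextInputFormat {
+ protected boolean isSplitable(FileSystem fs, Path file) {
+ return false; // every file is handed to a single Mapper
+ }
+ }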
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+ Tasks' Side-Effect Files
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
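+
+ A short sketch of creating such a side-file from within a task (the file
+ name "extra.txt" is illustrative):
+
+ Path workDir = FileOutputFormat.getWorkOutputPath(job);
+ Path sideFile = new Path(workDir, "extra.txt");
+ FSDataOutputStream out = sideFile.getFileSystem(job).create(sideFile);
+ out.writeBytes("side output\n");
+ out.close(); // promoted to ${mapred.output.dir} on task success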
+
+ This method is used to validate the input directories when a job is
+ submitted so that the {@link JobClient} can fail early, with a useful
+ error message, in case of errors. For e.g. input directory does not exist.
+
+
+ @param job job configuration.
+ @throws InvalidInputException if the job does not have valid input
+ @deprecated getSplits is called in the client and can perform any
+ necessary validation of the input]]>
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For e.g. a split could
+ be <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size are insufficient for many
+ applications since record boundaries are to be respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+ responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ its status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is same as the
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of map tasks for this job.]]>
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes a while, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks goes directly to the distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing them out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
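+
+ As a worked example of the 0.95 factor above: a hypothetical cluster of
+ 10 nodes with 2 reduce slots each has 20 slots, so 0.95 * 20 = 19
+ reduces, leaving a slot free for speculative tasks and failures:
+
+ int slots = numNodes * reduceSlotsPerNode; // e.g. 10 * 2 = 20
+ job.setNumReduceTasks((int) (0.95 * slots)); // 19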
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
+ The debug command, run on the node where the reduce failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked.
+
+ @param rDbgScript the script name]]>
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
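+
+ For example (the endpoint is hypothetical):
+
+ job.setJobEndNotificationURI(
+ "http://example.com/notify?jobId=$jobId&status=$jobStatus");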
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is available as System property also.
+
+ @return The localized job specific shared directory]]>
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly
+ rest of the framework and/or job-configuration and is relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debugability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}),
+ for doing post-processing on task logs, task's stdout, stderr, syslog,
+ etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see JobTracker#getNewJobId()
+ @see JobTracker#getStartTime()]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the output records.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+ th Path]]>
+
+ th Path]]>
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s, with
+ reading one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
]]>
+
+ This is to validate the output specification for the job when
+ a job is submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
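+
+ A minimal sketch of partitioning on a subset of the key (the first byte
+ of a Text key; the class is illustrative):
+
+ public class FirstCharPartitioner implements Partitioner<Text, Text> {
+
+ public void configure(JobConf job) {}
+
+ public int getPartition(Text key, Text value, int numPartitions) {
+ int hash = key.getLength() > 0 ? key.charAt(0) : 0;
+ return (hash & Integer.MAX_VALUE) % numPartitions;
+ }
+ }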
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore the application should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes a significant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
+ Applications can also update {@link Counters} via the provided
+ Reporter.
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
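+
+ A minimal polling sketch, assuming the JobClient and RunningJob methods
+ documented here (jobConf is the job's configuration):
+
+ JobClient client = new JobClient(new JobConf(conf));
+ RunningJob rj = client.submitJob(jobConf);
+ while (!rj.isComplete()) {
+ System.out.println("map " + rj.mapProgress() + " reduce " + rj.reduceProgress());
+ Thread.sleep(5000);
+ }
+ System.out.println(rj.isSuccessful() ? "job succeeded" : "job failed");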
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing.
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing.
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output value class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f. The filtering criterion is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .
+ @param name The name of the server
+ @param port The port to use on the server
+ @param findPort whether the server should start at the given port and
+ increment by 1 until it finds a free port.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ points to the log directory
+ "/static/" -> points to common static files (src/webapps/static)
+ "/" -> the jsp server code from (src/webapps/)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all task attempt IDs
+ of any jobtracker, in any job, of the first
+ map task, we would use:
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
+
+
+
+
+ An example TaskAttemptID is:
+ attempt_200707121733_0003_m_000005_0, which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings,
+ but rather use the appropriate constructors or the {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
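+
+ For instance, a minimal sketch of round-tripping the example string above
+ with {@link #forName(String)}:
+
+ TaskAttemptID id = TaskAttemptID.forName("attempt_200707121733_0003_m_000005_0");
+ // id.toString() yields the same string back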
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+ An example TaskID is:
+ task_200707121733_0003_m_000005, which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings,
+ but rather use the appropriate constructors or the {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method); the number of threads the
+ thread-pool can use is set with the
+ mapred.map.multithreadedrunner.threads property, whose default
+ value is 10 threads.
+
]]>
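+
+ A minimal configuration sketch, assuming a hypothetical job class MyJob
+ and the property named above:
+
+ JobConf job = new JobConf(conf, MyJob.class);
+ job.setMapRunnerClass(MultithreadedMapRunner.class);
+ job.setInt("mapred.map.multithreadedrunner.threads", 20);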
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ pairs. Uses
+ {@link StringTokenizer} to break text into tokens.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ generateKeyValPairs(Object key, Object value); public void
+ configure(JobConf job); }
+
+ The package also provides a base class, ValueAggregatorBaseDescriptor,
+ implementing the above interface. The user can extend the base class and
+ implement generateKeyValPairs accordingly.
+
+ The primary work of generateKeyValPairs is to emit one or more key/value
+ pairs based on the input key/value pair. The key in an output key/value pair
+ encodes two pieces of information: aggregation type and aggregation id. The
+ value will be aggregated onto the aggregation id according to the aggregation
+ type.
+
+ This class offers a function to generate a map/reduce job using the Aggregate
+ framework. The function takes the following parameters: input directory spec,
+ input format (text or sequence file), output directory, and a file specifying
+ the user plugin class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context that causes all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
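+
+ A minimal sketch of the fill-then-update procedure described above, using
+ the "diskStats" example (the context name, tag and metric values are
+ illustrative):
+
+ MetricsContext context = ContextFactory.getFactory().getContext("diskStats");
+ MetricsRecord record = context.createRecord("diskStats");
+ record.setTag("diskName", "sda");
+ record.setMetric("diskPercentFull", 42);
+ record.update(); // pass the row to the client library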
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ service=<serviceName>,name=<nameName>,
+ where <serviceName> and <nameName> are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the name used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ <host>:<port>
+ <fs>://<host>:<port>/<path>]]>
+
+
+
+
+
+
+
+ <host>:<port>
+ <fs>://<host>:<port>/<path>]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link NetUtils}
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link NetUtils}
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior; default is true).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then the returned
+ object will only contain the parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
+ Generic command line arguments might modify the
+ Configuration objects given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
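+
+ A minimal sketch of the parse-and-fetch-remaining-args flow described above:
+
+ Configuration conf = new Configuration();
+ GenericOptionsParser parser = new GenericOptionsParser(conf, args);
+ String[] remaining = parser.getRemainingArgs(); // application-specific args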
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param <T> The type of the argument
+ @param t the object to get its class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take a significant amount of time since,
+ in lieu of the reported progress, the framework has to assume that an error
+ has occurred and time-out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform; otherwise
+ (e.g. on Cygwin/Windows) it returns null.
+ @param job job configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutor should be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remain unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
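+
+ A minimal usage sketch, assuming the command is supplied to the
+ constructor as a String array:
+
+ ShellCommandExecutor exec = new ShellCommandExecutor(new String[] {"df", "-k"});
+ exec.execute(); // runs the command, throws IOException on failure
+ String output = exec.getOutput(); // the command's stdout, stored as-is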
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.2.xml b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.2.xml
new file mode 100644
index 0000000000..08173ab82d
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.2.xml
@@ -0,0 +1,38788 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
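+
+ A minimal sketch of the typed getters and setters above (the property
+ names are illustrative):
+
+ Configuration conf = new Configuration();
+ conf.setInt("my.int.property", 42);
+ int i = conf.getInt("my.int.property", 0); // 42
+ boolean b = conf.getBoolean("my.missing.flag", false); // default: false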
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then an empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then the default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
+ Hadoop by default specifies two resources, loaded in-order from the
+ classpath:
+ hadoop-default.xml: Read-only defaults for hadoop.
+ hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
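+
+ A minimal sketch of variable expansion as described (the property values
+ are illustrative):
+
+ Configuration conf = new Configuration();
+ conf.set("basedir", "/user/${user.name}");
+ conf.set("tempdir", "${basedir}/tmp");
+ String t = conf.get("tempdir"); // e.g. "/user/alice/tmp" after expansion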
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link JobConf}. The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+
+ 3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a path pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is {@link DistributedFileSystem}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf destination buffer
+ @param offset offset in buf at which to store data
+ @param len maximum number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the position to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destination buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksummed. The buffer gets checksummed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and the
+ requested length is at least as large as the next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids an unnecessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param <T> the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have the same Key type but different Value
+ types, are mapped out to reduce, multiple Value types are not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more efficient,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ GenericWritable implements the {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing the {@link Configurable} interface before deserialization.
+
+
+ How to use it (a minimal sketch follows):
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), defining
+ the classes which will be wrapped in GenericObject in the application.
+ Attention: the classes defined in the getTypes() method must
+ implement the Writable interface.
+
+
+ @since Nov 8, 2006]]>
+
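+
+ A minimal sketch of the two steps above; FooWritable and BarWritable are
+ hypothetical Writable implementations (the exact generic signature of
+ getTypes() varies by release):
+
+ public class GenericObject extends GenericWritable {
+ private static Class[] CLASSES = { FooWritable.class, BarWritable.class };
+ protected Class[] getTypes() { return CLASSES; }
+ }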
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStream to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
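+
+ A minimal write/read sketch, assuming the usual MapFile.Writer and
+ MapFile.Reader constructors (the directory name is illustrative):
+
+ MapFile.Writer writer = new MapFile.Writer(conf, fs, "/tmp/index", Text.class, Text.class);
+ writer.append(new Text("apple"), new Text("fruit")); // keys must be added in sorted order
+ writer.close();
+
+ MapFile.Reader reader = new MapFile.Reader(fs, "/tmp/index", conf);
+ Text value = new Text();
+ reader.get(new Text("apple"), value); // looks the key up via the in-memory index
+ reader.close();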
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to choose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
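+
+ A minimal sketch of writing and then reading a block-compressed
+ SequenceFile with the classes above (the path is illustrative):
+
+ SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
+ new Path("/tmp/data.seq"), IntWritable.class, Text.class,
+ SequenceFile.CompressionType.BLOCK);
+ writer.append(new IntWritable(1), new Text("one"));
+ writer.close();
+
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("/tmp/data.seq"), conf);
+ IntWritable key = new IntWritable();
+ Text val = new Text();
+ while (reader.next(key, val)) { /* process each record */ }
+ reader.close();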
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting at position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
+ Also includes utilities for
+ serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, and calculating the length of an encoded
+ string.]]>
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOutput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deserialize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.counter;
+ int thatValue = w.counter;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
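+
+ As a hedged sketch of that optimization, a raw comparator for the
+ MyWritableComparable above could read the leading int field straight out of
+ the serialized bytes (this assumes the write() layout shown: the int
+ counter is serialized first):
+
+   public static class Comparator extends WritableComparator {
+     public Comparator() {
+       super(MyWritableComparable.class);
+     }
+     public int compare(byte[] b1, int s1, int l1,
+                        byte[] b2, int s2, int l2) {
+       int thisCounter = readInt(b1, s1);  // counter is the first field written
+       int thatCounter = readInt(b2, s2);
+       return (thisCounter < thatCounter ? -1 :
+               (thisCounter == thatCounter ? 0 : 1));
+     }
+   }
+
+   static {  // register the optimized comparator for the key class
+     WritableComparator.define(MyWritableComparable.class, new Comparator());
+   }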
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input stream in
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when fewer than len bytes were skipped]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type T from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type T to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
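+
+ A hypothetical protocol obeying these restrictions might look like the
+ sketch below; the interface name and method are illustrative, and extending
+ VersionedProtocol is an assumption about how such protocols are declared:
+
+   public interface EchoProtocol extends VersionedProtocol {
+     long versionID = 1L;
+     // Writable parameter and return type; throws only IOException.
+     Text echo(Text message) throws IOException;
+   }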
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically.]]>
+
+ Group handles localization of the class name and the
+ counter names.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
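+
+ A minimal hedged sketch of such an override; the class name is hypothetical:
+
+   public class WholeFileInputFormat extends TextInputFormat {
+     protected boolean isSplitable(FileSystem fs, Path file) {
+       return false;  // one Mapper consumes each file in its entirety
+     }
+   }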
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_${taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
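+
+ A hedged sketch of creating such a side-file from within a task; the file
+ name and the sideOut field are hypothetical:
+
+   public void configure(JobConf job) {
+     try {
+       Path workDir = FileOutputFormat.getWorkOutputPath(job);
+       Path sideFile = new Path(workDir, "side-data");  // unique per attempt
+       sideOut = workDir.getFileSystem(job).create(sideFile);
+     } catch (IOException e) {
+       throw new RuntimeException(e);
+     }
+   }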
+
+ This method is used to validate the input directories when a job is
+ submitted so that the {@link JobClient} can fail early, with a useful
+ error message, in case of errors, e.g. if the input directory does not exist.
+
+
+ @param job job configuration.
+ @throws InvalidInputException if the job does not have valid input
+ @deprecated getSplits is called in the client and can perform any
+ necessary validation of the input]]>
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. E.g. a split could
+ be a <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
+ Clearly, logical splits based on input size are insufficient for many
+ applications since record boundaries must be respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+ responsibility to respect record boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ its status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is same as the
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes a while, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks directly go to distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing them out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
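+
+ A hedged sketch of applying the 0.95 factor; reading the reduce-slot
+ property per tracker is an assumption about the cluster configuration:
+
+   JobClient jc = new JobClient(job);
+   int trackers = jc.getClusterStatus().getTaskTrackers();
+   int slotsPerTracker =
+       job.getInt("mapred.tasktracker.reduce.tasks.maximum", 2);
+   job.setNumReduceTasks((int) (0.95 * trackers * slotsPerTracker));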
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
+ The debug command, run on the node where the reduce failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked.
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is available as System property also.
+
+ @return The localized job specific shared directory]]>
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly with the
+ rest of the framework and/or job-configuration and are relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debugability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}),
+ for doing post-processing on task logs, task's stdout, stderr, syslog,
+ etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see JobTracker#getNewJobId()
+ @see JobTracker#getStartTime()]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
+ Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the outputrecords.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s, with
+ reading one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when it is
+ submitted. Typically it checks that the output does not already exist,
+ throwing an exception when it does, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
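+
+ A hedged sketch of a Partitioner that derives the partition from a subset
+ of the key; the key format (a ':'-separated prefix) is hypothetical:
+
+   public class PrefixPartitioner implements Partitioner<Text, Text> {
+     public void configure(JobConf job) { }
+     public int getPartition(Text key, Text value, int numPartitions) {
+       // partition on the part of the key before the first ':'
+       String prefix = key.toString().split(":", 2)[0];
+       return (prefix.hashCode() & Integer.MAX_VALUE) % numPartitions;
+     }
+   }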
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore the application should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output value class.]]>
+
+
+ f. The filtering criteria is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+ .
+ @param name The name of the server
+ @param port The port to use on the server
+ @param findPort whether the server should start at the given port and
+ increment by 1 until it finds a free port.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ points to the log directory
+ "/static/" -> points to common static files (src/webapps/static)
+ "/" -> the jsp server code from (src/webapps/)]]>
+
+ all task attempt IDs
+ of any jobtracker, in any job, of the first
+ map task, we would use:
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
+
+
+
+
+ An example TaskAttemptID is :
+ attempt_200707121733_0003_m_000005_0 , which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+ An example TaskID is :
+ task_200707121733_0003_m_000005 , which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of threads the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property; its default
+ value is 10 threads.
+
]]>
+
+ pairs. Uses
+ {@link StringTokenizer} to break text into tokens.]]>
+
+ generateKeyValPairs(Object key, Object value); public void
+ configure(JobConf job); }
+
+ The package also provides a base class, ValueAggregatorBaseDescriptor,
+ implementing the above interface. The user can extend the base class and
+ implement generateKeyValPairs accordingly.
+
+ The primary work of generateKeyValPairs is to emit one or more key/value
+ pairs based on the input key/value pair. The key in an output key/value pair
+ encodes two pieces of information: aggregation type and aggregation id. The
+ value will be aggregated onto the aggregation id according to the aggregation
+ type.
+
+ This class offers a function to generate a map/reduce job using Aggregate
+ framework. The function takes the following parameters: input directory spec
+ input format (text or sequence file) output directory a file specifying the
+ user plugin class]]>
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
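+
+ A minimal usage sketch of the sequence described above; MetricsUtil is
+ the helper this package provides for obtaining contexts and records,
+ and the context, tag and metric names here are illustrative:
+
+ MetricsContext context = MetricsUtil.getContext("dfs");
+ MetricsRecord diskStats = MetricsUtil.createRecord(context, "diskStats");
+ diskStats.setTag("diskName", "sda");       // one row per unique tag set
+ diskStats.setMetric("diskPercentFull", 42);
+ diskStats.update();                        // buffers into the internal table
+ // later, e.g. when a transient object goes away:
+ diskStats.remove();                        // stop the row from being resent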
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
+ myContextName.fileName=/tmp/metrics.log
+ myContextName.period=5
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name="
+ Where the and are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ <host>:<port>
+ <fs>://<host>:<port>/<path>]]>
+
+
+
+
+
+
+
+ <host>:<port>
+ <fs>://<host>:<port>/<path>]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
+ The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior. default is true).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
+ Generic Options
+
+
+ The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
+ Generic command line arguments might modify
+ Configuration objects, given to constructors.
+
+
+ The functionality is implemented using Commons CLI.
+
+
+ Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
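+
+ A short usage sketch; the application entry point runMyTool is
+ hypothetical, the parser calls are from the entries above:
+
+ public static void main(String[] args) throws Exception {
+   Configuration conf = new Configuration();
+   GenericOptionsParser parser = new GenericOptionsParser(conf, args);
+   // conf now reflects -D/-fs/-jt/etc.; only app-specific args remain
+   String[] toolArgs = parser.getRemainingArgs();
+   runMyTool(conf, toolArgs); // hypothetical application entry point
+ }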
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param <T> The type of the argument
+ @param t the object to get its class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take a significant amount of time since,
+ in lieu of the reported progress, the framework has to assume that an error
+ has occurred and time-out the operation.]]>
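+
+ A hedged sketch of the reporting pattern; the work-loop helpers are
+ hypothetical, only Progressable.progress() comes from this interface:
+
+ void copyBlocks(Progressable progressable) throws IOException {
+   while (hasMoreBlocks()) {    // hypothetical helper
+     copyOneBlock();            // hypothetical long-running step
+     progressable.progress();   // tell the framework we are still alive
+   }
+ }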
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform;
+ otherwise (e.g. in Cygwin/Windows) it returns null.
+ @param job job configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
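+
+ For example, assuming the static execCommand helper described in the
+ entries above:
+
+ String df = Shell.execCommand("df", "-k"); // runs df -k, returns its output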
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutor should be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remain unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
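+
+ A minimal sketch, assuming the constructor and accessors this class is
+ believed to expose (the command is illustrative):
+
+ Shell.ShellCommandExecutor exec =
+   new Shell.ShellCommandExecutor(new String[] {"du", "-s", "/tmp"});
+ exec.execute();                    // runs the command once
+ String output = exec.getOutput();  // stored as-is, expected to be small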
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
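+
+ As a round-trip sketch (the method names escapeString/unEscapeString are
+ assumed from the surrounding entries; verify against the class):
+
+ String escaped = StringUtils.escapeString("a,b,c", '\\', ',');  // a\,b\,c
+ String back = StringUtils.unEscapeString(escaped, '\\', ',');   // a,b,c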
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
+ Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
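+
+ A typical invocation, assuming the MyApp class above is packaged in a
+ hypothetical myapp.jar, passes generic options before application ones:
+
+ $ bin/hadoop jar myapp.jar MyApp -D mapred.reduce.tasks=4 in out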
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.3.xml b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.3.xml
new file mode 100644
index 0000000000..564916fef7
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.18.3.xml
@@ -0,0 +1,38826 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then an empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
+ Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
+ Hadoop by default specifies two resources, loaded in-order from the
+ classpath:
+ hadoop-default.xml : Read-only defaults for hadoop.
+ hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
+ Final Parameters
+
+
+ Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
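+
+ A brief usage sketch; the resource name and property are illustrative:
+
+ Configuration conf = new Configuration();
+ conf.addResource("my-site.xml");                      // found on classpath
+ int handlers = conf.getInt("my.handler.count", 10);   // 10 if unset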
+
+ Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link JobConf}. The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
+ The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
+ DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
+ DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
+ Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+
+ 3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
+ ?             Matches any single character.
+
+ *             Matches zero or more characters.
+
+ [abc]         Matches a single character from character set {a,b,c}.
+
+ [a-b]         Matches a single character from the character range
+               {a...b}. Note that character a must be
+               lexicographically less than or equal to character b.
+
+ [^a]          Matches a single character that is not from character
+               set or range {a}. Note that the ^ character must occur
+               immediately to the right of the opening bracket.
+
+ \c            Removes (escapes) any special meaning of character c.
+
+ {ab,cd}       Matches a string from the string set {ab, cd}
+
+ {ab,c{de,fh}} Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a path pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
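+
+ For instance, a sketch assuming the globStatus method of this era,
+ which returns FileStatus[] (the method documented here may instead be
+ an older variant returning Path[]):
+
+ FileStatus[] parts =
+   fs.globStatus(new Path("/logs/2009-*/part-[0-9]*"));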
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is {@link DistributedFileSystem}.]]>
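+
+ A minimal round trip through the abstraction (the path is illustrative):
+
+ FileSystem fs = FileSystem.get(conf);   // default FS from the config
+ Path p = new Path("/tmp/demo.txt");
+ FSDataOutputStream out = fs.create(p);
+ out.writeUTF("hello");
+ out.close();
+ FSDataInputStream in = fs.open(p);
+ String s = in.readUTF();                // "hello"
+ in.close();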
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read; therefore, it should be optimized
+ for sequential reading.
+ @param pos chunkPos
+ @param buf destination buffer
+ @param offset offset in buf at which to store data
+ @param len maximum number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
+ The specified number of bytes have been read,
+
+
+ The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
+ If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
+ This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
+ If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the position to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destination buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
+ This method stores bytes from the given array into this
+ stream's buffer before it gets checksumed. The buffer gets checksumed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and
+ requested length is at least as large as the size of next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids unnecessary data copies.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
+ Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
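+
+ A round-trip sketch using the store/load pair documented above (the
+ key name is illustrative; Text works via the Writable serialization):
+
+ DefaultStringifier.store(conf, new Text("payload"), "my.key");
+ Text restored = DefaultStringifier.load(conf, "my.key", Text.class);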
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have the same Key type but different Value
+ types, are mapped out to reduce, multiple Value types are not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more efficient,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ How to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), which defines
+ the classes that will be wrapped in GenericObject in the application.
+ Attention: the classes defined in the getTypes() method must
+ implement the Writable interface.
+
+
+ @since Nov 8, 2006]]>
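+
+ A hedged sketch of the two steps above; the unchecked array creation is
+ a common idiom for the generic getTypes signature assumed here:
+
+ public class GenericObject extends GenericWritable {
+   @SuppressWarnings("unchecked")
+   private static Class<? extends Writable>[] TYPES =
+     new Class[] { IntWritable.class, Text.class };
+
+   protected Class<? extends Writable>[] getTypes() {
+     return TYPES;
+   }
+ }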
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
+ Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStrem to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
+ The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
+ Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
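+
+ A small sketch of in-order creation and lookup (the directory name and
+ key/value classes are illustrative):
+
+ MapFile.Writer writer =
+   new MapFile.Writer(conf, fs, "/tmp/map", Text.class, LongWritable.class);
+ writer.append(new Text("apple"), new LongWritable(1));  // keys in order
+ writer.append(new Text("pear"), new LongWritable(2));
+ writer.close();
+
+ MapFile.Reader reader = new MapFile.Reader(fs, "/tmp/map", conf);
+ LongWritable val = new LongWritable();
+ reader.get(new Text("pear"), val);  // binary search via the in-memory index
+ reader.close();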
+
+ SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
+ The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
+ The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to choose the preferred format.
+
+
+ The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
+ SequenceFile Formats
+
+
+ Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
+ SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
+ The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
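+
+ A minimal write/read sketch using the recommended factory method (the
+ path is illustrative; the no-compression overload is used for brevity):
+
+ Path seq = new Path("/tmp/data.seq");
+ SequenceFile.Writer w =
+   SequenceFile.createWriter(fs, conf, seq, Text.class, IntWritable.class);
+ w.append(new Text("k"), new IntWritable(1));
+ w.close();
+
+ SequenceFile.Reader r = new SequenceFile.Reader(fs, seq, conf);
+ Text k = new Text(); IntWritable v = new IntWritable();
+ while (r.next(k, v)) { /* process k, v */ }
+ r.close();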
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than the key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting at position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
+ Also includes utilities for
+ serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
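+
+ For example, both traversal calls documented above operate directly on
+ the UTF-8 backing buffer:
+
+ Text t = new Text("hadoop");
+ int cp = t.charAt(2);     // Unicode scalar value of 'd'
+ int pos = t.find("oop");  // byte offset 3, or -1 if absent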
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOuput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deseriablize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
+ Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
+ Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
+ ]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
+ Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.counter;
+ int thatValue = w.counter;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
+ One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input stream in
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when fewer bytes were skipped]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
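+
+ The pool is typically used with a try/finally so leased codecs are
+ always returned (how the codec instance is obtained is illustrative):
+
+ Compressor compressor = CodecPool.getCompressor(codec);
+ try {
+   // ... use it, e.g. via codec.createOutputStream(out, compressor)
+ } finally {
+   CodecPool.returnCompressor(compressor);  // hand it back to the pool
+ }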
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
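+
+ The following is a minimal sketch (not part of the original javadoc) of
+ how these pieces fit together; the Echo interface and EchoImpl class are
+ hypothetical stand-ins for a real service interface:
+
+   import java.util.concurrent.TimeUnit;
+   import org.apache.hadoop.io.retry.RetryPolicies;
+   import org.apache.hadoop.io.retry.RetryPolicy;
+   import org.apache.hadoop.io.retry.RetryProxy;
+
+   interface Echo { String echo(String msg) throws Exception; }
+   class EchoImpl implements Echo {
+     public String echo(String msg) { return msg; }
+   }
+
+   // Retry each failed call up to 4 times, sleeping 10 seconds between attempts.
+   RetryPolicy policy =
+       RetryPolicies.retryUpToMaximumCountWithFixedSleep(4, 10, TimeUnit.SECONDS);
+   Echo echo = (Echo) RetryProxy.create(Echo.class, new EchoImpl(), policy);
+   echo.echo("hello");   // transparently retried under the policy
+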
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type T from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
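+
+ As a hedged illustration (not from the original javadoc), a job could make
+ Java serialization available alongside the default Writable serialization
+ by listing both implementations in the property:
+
+   import org.apache.hadoop.conf.Configuration;
+
+   Configuration conf = new Configuration();
+   // Comma-delimited list of Serialization classes, consulted in order.
+   conf.set("io.serializations",
+       "org.apache.hadoop.io.serializer.WritableSerialization,"
+       + "org.apache.hadoop.io.serializer.JavaSerialization");
+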
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type T to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
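+
+ A minimal sketch of such a protocol (PingProtocol is a hypothetical
+ example, not part of Hadoop):
+
+   import java.io.IOException;
+   import org.apache.hadoop.ipc.VersionedProtocol;
+
+   // Only primitives, Strings, Writables and arrays of those may appear
+   // in the signatures; every method should throw only IOException.
+   public interface PingProtocol extends VersionedProtocol {
+     long versionID = 1L;
+     String ping(String message) throws IOException;
+   }
+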
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically.]]>
+
Group handles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return whether this file can be split]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
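+
+ For instance, a subclass might pin each file to a single Mapper like this
+ (WholeFileTextInputFormat is a hypothetical name used for illustration):
+
+   import org.apache.hadoop.fs.FileSystem;
+   import org.apache.hadoop.fs.Path;
+   import org.apache.hadoop.mapred.TextInputFormat;
+
+   public class WholeFileTextInputFormat extends TextInputFormat {
+     // Never split an input file, whatever its size.
+     protected boolean isSplitable(FileSystem fs, Path file) {
+       return false;
+     }
+   }
+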
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of the reduce-task, i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus the writer doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_${taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
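+
+ A sketch of what a task might do (the file name side-data.txt is
+ illustrative only):
+
+   import java.io.IOException;
+   import org.apache.hadoop.fs.FileSystem;
+   import org.apache.hadoop.fs.Path;
+   import org.apache.hadoop.mapred.FileOutputFormat;
+   import org.apache.hadoop.mapred.JobConf;
+
+   void writeSideFile(JobConf job) throws IOException {
+     // Resolves to ${mapred.output.dir}/_temporary/_${taskid} at runtime.
+     Path workDir = FileOutputFormat.getWorkOutputPath(job);
+     Path sideFile = new Path(workDir, "side-data.txt");
+     FileSystem fs = sideFile.getFileSystem(job);
+     fs.create(sideFile).close();   // promoted on successful completion
+   }
+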
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This method is used to validate the input directories when a job is
+ submitted so that the {@link JobClient} can fail early, with a useful
+ error message, in case of errors (e.g. the input directory does not exist).
+
+
+ @param job job configuration.
+ @throws InvalidInputException if the job does not have valid input
+ @deprecated getSplits is called in the client and can perform any
+ necessary validation of the input]]>
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For example, a split could
+ be an <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size are insufficient for many
+ applications since record boundaries are to be respected. In such cases, the
+ application has to also implement a {@link RecordReader}, on which lies the
+ responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ its status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is the same as
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is the same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of map tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes a while, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ improves load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks goes directly to the distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing them out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
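+
+ Applying the 0.95 factor above to a hypothetical 10-node cluster with
+ 2 reduce slots per node (a sketch, not prescribed by the javadoc):
+
+   int nodes = 10;
+   int reduceSlotsPerNode = 2;   // mapred.tasktracker.reduce.tasks.maximum
+   job.setNumReduceTasks((int) (0.95 * nodes * reduceSlotsPerNode));  // 19
+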
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given the task's stdout, stderr, syslog and jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
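+
+ A hedged sketch of wiring this up (the script path and link name are
+ hypothetical):
+
+   import java.net.URI;
+   import org.apache.hadoop.filecache.DistributedCache;
+   import org.apache.hadoop.mapred.JobConf;
+
+   void setupMapDebugScript(JobConf conf) throws Exception {
+     // Ship the script through the DistributedCache and symlink it into
+     // the task's working directory as "debug-script".
+     DistributedCache.createSymlink(conf);
+     DistributedCache.addCacheFile(
+         new URI("hdfs://namenode/scripts/my-debug.sh#debug-script"), conf);
+     conf.setMapDebugScript("./debug-script");
+   }
+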
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given the task's stdout, stderr, syslog and jobconf files as arguments.
+
+
The debug command, run on the node where the reduce failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked.
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
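+
+ For example (the endpoint is hypothetical; $jobId and $jobStatus are
+ substituted by the framework):
+
+   job.setJobEndNotificationURI(
+       "http://myhost:8080/jobdone?id=$jobId&status=$jobStatus");
+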
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/.
+ This directory is exposed to the users through
+ job.local.dir.
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is also available as a system property.
+
+ @return The localized job specific shared directory]]>
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly
+ with the rest of the framework and/or job-configuration and are relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debuggability via user-provided scripts
+ ({@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)})
+ for doing post-processing on task logs, task's stdout, stderr, syslog,
+ etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is
+ job_200707121733_0003, which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or the {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see JobTracker#getNewJobId()
+ @see JobTracker#getStartTime()]]>
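+
+ A short sketch of parsing the example ID above with forName rather than
+ splitting the string by hand:
+
+   import org.apache.hadoop.mapred.JobID;
+
+   JobID id = JobID.forName("job_200707121733_0003");
+   String jt = id.getJtIdentifier();   // "200707121733"
+   int jobNumber = id.getId();         // 3
+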
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes a significant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the output records.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReaders for MultiFileSplits.
+ @see MultiFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the i-th Path]]>
+
+
+
+
+
+
+
+
+
+
+ the i-th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}s that
+ read one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when the
+ job is submitted. Typically this checks that the output does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
Validate the output-specification of the job, e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent to for reduction.
+
+ @see Reducer]]>
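+
+ A minimal custom Partitioner (a sketch mirroring what the stock
+ HashPartitioner does; TextHashPartitioner is a hypothetical name):
+
+   import org.apache.hadoop.io.Text;
+   import org.apache.hadoop.mapred.JobConf;
+   import org.apache.hadoop.mapred.Partitioner;
+
+   public class TextHashPartitioner implements Partitioner<Text, Text> {
+     public void configure(JobConf job) { }
+     public int getPartition(Text key, Text value, int numPartitions) {
+       // Mask the sign bit so the result is a valid partition index.
+       return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
+     }
+   }
+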
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore applications should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes a significant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
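+
+ The duplicate-page job sketched above would be wired together roughly as
+ follows; the three partitioner/comparator classes are hypothetical and
+ would have to be written by the application:
+
+   job.setPartitionerClass(ChecksumPartitioner.class);
+   // Sort by checksum, then by decreasing pagerank.
+   job.setOutputKeyComparatorClass(ChecksumPagerankComparator.class);
+   // Group values for the reduce by checksum alone.
+   job.setOutputValueGroupingComparator(ChecksumGroupingComparator.class);
+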
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output value class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f. The filtering criterion is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criterion record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .
+ @param name The name of the server
+ @param port The port to use on the server
+ @param findPort whether the server should start at the given port and
+ increment by 1 until it finds a free port.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ "/logs/" -> points to the log directory
+ "/static/" -> points to common static files (src/webapps/static)
+ "/" -> the jsp server code from (src/webapps/)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all task attempt IDs
+ of any jobtracker, in any job, of the first
+ map task, we would use :
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
+
+
+
+
+ An example TaskAttemptID is
+ attempt_200707121733_0003_m_000005_0, which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings,
+ but rather use appropriate constructors or the {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+ An example TaskID is
+ task_200707121733_0003_m_000005, which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings,
+ but rather use appropriate constructors or the {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of threads the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property; its default
+ value is 10 threads.
+
]]>
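+
+ A configuration sketch (the thread count of 20 is an arbitrary example):
+
+   import org.apache.hadoop.mapred.JobConf;
+   import org.apache.hadoop.mapred.lib.MultithreadedMapRunner;
+
+   JobConf job = new JobConf();
+   job.setMapRunnerClass(MultithreadedMapRunner.class);
+   job.setInt("mapred.map.multithreadedrunner.threads", 20);
+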
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ pairs. Uses
+ {@link StringTokenizer} to break text into tokens.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ generateKeyValPairs(Object key, Object value); public void
+ configure(JobConf job); }
+
+ The package also provides a base class, ValueAggregatorBaseDescriptor,
+ implementing the above interface. The user can extend the base class and
+ implement generateKeyValPairs accordingly.
+
+ The primary work of generateKeyValPairs is to emit one or more key/value
+ pairs based on the input key/value pair. The key in an output key/value pair
+ encodes two pieces of information: aggregation type and aggregation id. The
+ value will be aggregated onto the aggregation id according to the aggregation
+ type.
+
+ This class offers a function to generate a map/reduce job using Aggregate
+ framework. The function takes the following parameters: input directory spec,
+ input format (text or sequence file), output directory, and a file specifying
+ the user plugin class]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
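+
+ A sketch of the procedure using the "diskStats" example (the context and
+ attribute names are illustrative):
+
+   import org.apache.hadoop.metrics.MetricsContext;
+   import org.apache.hadoop.metrics.MetricsRecord;
+   import org.apache.hadoop.metrics.MetricsUtil;
+
+   MetricsContext context = MetricsUtil.getContext("myContext");
+   MetricsRecord record = MetricsUtil.createRecord(context, "diskStats");
+   record.setTag("diskName", "disk0");
+   record.setMetric("diskPercentFull", 42);
+   record.update();   // buffered; emitted on the context's timer period
+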
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ "hadoop:service=<serviceName>,name=<nameName>"
+ where the serviceName and nameName are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ <host>:<port> or
+ <fs>://<host>:<port>/<path>]]>
+
+
+
+
+
+
+
+ <host>:<port> or
+ <fs>://<host>:<port>/<path>]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior; default is true).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
Generic command line arguments might modify
+ Configuration objects, given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
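+
+ A typical pairing with {@link Tool} looks roughly like this (MyTool is a
+ placeholder name):
+
+   import org.apache.hadoop.conf.Configuration;
+   import org.apache.hadoop.conf.Configured;
+   import org.apache.hadoop.util.Tool;
+   import org.apache.hadoop.util.ToolRunner;
+
+   public class MyTool extends Configured implements Tool {
+     public int run(String[] args) throws Exception {
+       // args now holds only the non-generic arguments; -conf, -D, -fs,
+       // -jt, -files, -libjars and -archives were already applied.
+       Configuration conf = getConf();
+       return 0;
+     }
+     public static void main(String[] args) throws Exception {
+       System.exit(ToolRunner.run(new MyTool(), args));
+     }
+   }
+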
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param The type of the argument
+ @param t the object to get it class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take a significant amount of time since,
+ in lieu of the reported progress, the framework has to assume that an error
+ has occurred and time out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks that we are running on a *nix platform;
+ otherwise (e.g. on Cygwin/Windows) it returns null.
+ @param job job configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutor should be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remain unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
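+
+ A minimal sketch of that usage, assuming the constructor that takes the
+ command as a String array (the df invocation is only an example):
+
+ // org.apache.hadoop.util.Shell.ShellCommandExecutor
+ ShellCommandExecutor executor =
+ new ShellCommandExecutor(new String[] {"df", "-k"});
+ executor.execute(); // runs the command once
+ String output = executor.getOutput(); // captured output, stored as-is
+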
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool is the standard interface for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+ generic Hadoop command-line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.0.xml b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.0.xml
new file mode 100644
index 0000000000..557ac3cc59
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.0.xml
@@ -0,0 +1,43972 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param in InputStream to deserialize the object from.]]>
+
+
+
+
+
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ as comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property
+ as an array of Class.
+ The value of the property specifies a list of comma separated class names.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the property name.
+ @param defaultValue default value.
+ @return property value as a Class[],
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
+ Unless explicitly turned off, Hadoop by default specifies two
+ resources, loaded in-order from the classpath:
+ hadoop-default.xml: Read-only defaults for hadoop.
+ hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+   <property>
+     <name>dfs.client.buffer.dir</name>
+     <value>/tmp/hadoop/dfs/client</value>
+     <final>true</final>
+   </property>
+
+ Variable Expansion
+
+ Value strings are first processed for variable expansion. For example,
+ if a configuration resource contains the property definitions:
+
+   <property>
+     <name>basedir</name>
+     <value>/user/${user.name}</value>
+   </property>
+
+   <property>
+     <name>tempdir</name>
+     <value>${basedir}/tmp</value>
+   </property>
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link org.apache.hadoop.mapred.JobConf}.
+ The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip"), job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz"), job);
+
+ 3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
+ or {@link org.apache.hadoop.mapred.Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+   ?              Matches any single character.
+
+   *              Matches zero or more characters.
+
+   [abc]          Matches a single character from character set {a,b,c}.
+
+   [a-b]          Matches a single character from the character range
+                  {a...b}. Note that character a must be
+                  lexicographically less than or equal to character b.
+
+   [^a]           Matches a single character that is not from character set
+                  or range {a}. Note that the ^ character must occur
+                  immediately to the right of the opening bracket.
+
+   \c             Removes (escapes) any special meaning of character c.
+
+   {ab,cd}        Matches a string from the string set {ab, cd}.
+
+   {ab,c{de,fh}}  Matches a string from the string set {ab, cde, cfh}.
+
+
+ @param pathPattern a regular expression specifying a path pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
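+
+ For instance, under the pattern rules above a caller might match a set of
+ log files as follows; this sketch assumes the globStatus variant that
+ returns FileStatus objects, and the paths are purely illustrative:
+
+ FileSystem fs = FileSystem.get(conf);
+ // matches e.g. /logs/2009-01-02.log and /logs/2009-12-31.log
+ FileStatus[] matches = fs.globStatus(new Path("/logs/2009-*.log"));
+ for (FileStatus status : matches) {
+ System.out.println(status.getPath());
+ }
+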
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is DistributedFileSystem.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf destination buffer
+ @param offset offset in buf at which to store data
+ @param len maximum number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the position to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destination buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
+ This method stores bytes from the given array into this
+ stream's buffer before it gets checksummed. The buffer gets checksummed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and the
+ requested length is at least as large as the next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids unnecessary data copies.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param <K> the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ @param <K> the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ @param <K> the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ @param <K> the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param <T> the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have the same Key type but different Value
+ types, are mapped out to reduce, multiple Value types are not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more efficient,
+ because ObjectWritable appends the class declaration as a String
+ to the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ How to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), which defines
+ the classes that will be wrapped in GenericObject in the application.
+ Attention: the classes defined in the getTypes() method must
+ implement the Writable interface.
+
+
+ @since Nov 8, 2006]]>
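+
+ A compact sketch of steps 1 and 2 (the GenericObject name follows the
+ text above; wrapping IntWritable and Text is an arbitrary choice for this
+ example):
+
+ public class GenericObject extends GenericWritable {
+
+ private static Class[] CLASSES = {
+ IntWritable.class,
+ Text.class,
+ };
+
+ // every class named here must implement Writable
+ protected Class[] getTypes() {
+ return CLASSES;
+ }
+ }
+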
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStream to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
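+
+ As a sketch of the in-order write and index-assisted lookup described
+ above (the directory name and records are illustrative; the Writer and
+ Reader constructors taking a FileSystem, a directory name and a
+ Configuration are assumed):
+
+ MapFile.Writer writer =
+ new MapFile.Writer(conf, fs, "/data/mymap", Text.class, IntWritable.class);
+ writer.append(new Text("apple"), new IntWritable(1)); // keys must arrive
+ writer.append(new Text("banana"), new IntWritable(2)); // in sorted order
+ writer.close();
+
+ MapFile.Reader reader = new MapFile.Reader(fs, "/data/mymap", conf);
+ IntWritable value = new IntWritable();
+ reader.get(new Text("banana"), value); // lookup assisted by the index
+ reader.close();
+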
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
+ The recommended way is to use the static createWriter methods
+ provided by SequenceFile to choose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
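+
+ A short sketch of the recommended createWriter/Reader round trip (the
+ path and records are illustrative):
+
+ Path file = new Path("/data/pairs.seq");
+ SequenceFile.Writer writer = SequenceFile.createWriter(
+ fs, conf, file, Text.class, IntWritable.class);
+ writer.append(new Text("apple"), new IntWritable(1));
+ writer.close();
+
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
+ Text key = new Text();
+ IntWritable value = new IntWritable();
+ while (reader.next(key, value)) { // the Reader handles all three formats
+ System.out.println(key + "\t" + value);
+ }
+ reader.close();
+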
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However,
+ the key may be earlier in the file than the key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param <T> the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation.
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting at position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, and calculating the length of an encoded
+ string.]]>
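+
+ A small sketch of the byte-oriented traversal utilities mentioned above
+ (the values in the comments assume plain ASCII input):
+
+ Text t = new Text("hadoop");
+ int pos = t.find("doo"); // byte position in the UTF-8 buffer: 2
+ int scalar = t.charAt(2); // Unicode scalar value at that position: 'd'
+ byte[] raw = t.getBytes(); // only [0, t.getLength()) holds valid data
+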
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOuput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deseriablize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.counter;
+ int thatValue = w.counter;
+ return (thisValue < thatValue ? -1 : (thisValue == thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Skips len number of bytes in input stream in.
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException if it could not skip the requested number of bytes]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair compatible with lzop.
+ http://www.lzop.org/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FIXME: This array should be in a private or package private location,
+ since it could be modified by malicious code.
+ ]]>
+
+
+
+
+ This interface is public for historical purposes. You should have no need to
+ use it.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+ Although BZip2 headers are marked with the magic "BZ" this
+ constructor expects the next byte in the stream to be the first one after
+ the magic. Thus callers have to skip the first two bytes. Otherwise this
+ constructor will throw an exception.
+
+
+ @throws IOException
+ if the stream content is malformed or an I/O error occurs.
+ @throws NullPointerException
+ if in == null]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The decompression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2InputStream to release the allocated memory. See
+ {@link CBZip2OutputStream CBZip2OutputStream} for information about memory
+ usage.
+
+
+
+ CBZip2InputStream reads bytes from the compressed source stream via
+ the single byte {@link java.io.InputStream#read() read()} method exclusively.
+ Thus you should consider using a buffered source stream.
+
+
+
+ Instances of this class are not threadsafe.
+
]]>
+
+
+
+
+
+
+
+
+
+ CBZip2OutputStream with a blocksize of 900k.
+
+
+ Attention: The caller is responsible for writing the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+ @param out
+ the destination stream.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws NullPointerException
+ if out == null.]]>
+
+
+
+
+
+ CBZip2OutputStream with specified blocksize.
+
+
+ Attention: The caller is responsible for writing the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+
+ @param out
+ the destination stream.
+ @param blockSize
+ the blockSize as 100k units.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws IllegalArgumentException
+ if (blockSize < 1) || (blockSize > 9).
+ @throws NullPointerException
+ if out == null.
+
+ @see #MIN_BLOCKSIZE
+ @see #MAX_BLOCKSIZE]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For a negative inputLength this method returns MAX_BLOCKSIZE
+ always.
+
+ @param inputLength
+ The length of the data which will be compressed by
+ CBZip2OutputStream.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The minimum supported blocksize == 1.]]>
+
+
+
+
+ The maximum supported blocksize == 9.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If you are ever unlucky/improbable enough to get a stack overflow whilst
+ sorting, increase the following constant and try again. In practice I
+ have never seen the stack go above 27 elems, so the following limit seems
+ very generous.
+ ]]>
+
+
+
+
+ The compression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2OutputStream to release the allocated memory.
+
+
+
+ You can shrink the amount of allocated memory and maybe raise the compression
+ speed by choosing a lower blocksize, which in turn may cause a lower
+ compression ratio. You can avoid unnecessary memory allocation by avoiding
+ using a blocksize which is bigger than the size of the input.
+
+
+
+ You can compute the memory usage for compressing by the following formula:
+
+
+
+ 400k + (9 * blocksize).
+
+
+
+ To get the memory required for decompression by {@link CBZip2InputStream
+ CBZip2InputStream} use
+
+
+
+ 65k + (5 * blocksize).
+
+
+
+
+
+
+
+ Memory usage by blocksize:
+
+   Blocksize    Compression memory usage    Decompression memory usage
+   100k         1300k                        565k
+   200k         2200k                       1065k
+   300k         3100k                       1565k
+   400k         4000k                       2065k
+   500k         4900k                       2565k
+   600k         5800k                       3065k
+   700k         6700k                       3565k
+   800k         7600k                       4065k
+   900k         8500k                       4565k
+
+
+ For decompression CBZip2InputStream allocates less memory if the
+ bzipped input is smaller than one block.
+
+
+
+ Instances of this class are not threadsafe.
+
+
+
+ TODO: Update to BZip2 1.0.1
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
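+
+ Tying the factory methods above together, a sketch; ProtocolX, its
+ implementation and the doWork method are hypothetical stand-ins:
+
+ RetryPolicy policy =
+ RetryPolicies.retryUpToMaximumCountWithFixedSleep(4, 10, TimeUnit.SECONDS);
+ ProtocolX proxy =
+ (ProtocolX) RetryProxy.create(ProtocolX.class, new ProtocolXImpl(), policy);
+ proxy.doWork(); // failures are retried according to the policy
+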
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type <T> from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param <T>]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param <T>]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param <T>
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param <T>]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type <T> to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param <T>]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
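+
+ For illustration, a protocol interface that stays within these rules (the
+ MyQueryProtocol name and its methods are invented for this sketch):
+
+ public interface MyQueryProtocol {
+ long getCount(String name) throws IOException; // primitive + String
+ Text lookup(Text key) throws IOException; // Writable in and out
+ String[] listNames() throws IOException; // array of an allowed type
+ }
+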
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+   rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+   rpc.period=10
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically.]]>
+
+ Group handles localization of the class name and the
+ counter names.
]]>
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
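+
+ For example, a sketch of an input format that forces whole-file processing
+ (WholeFileTextInputFormat is a hypothetical name):
+
+ public class WholeFileTextInputFormat extends TextInputFormat {
+ protected boolean isSplitable(FileSystem fs, Path file) {
+ return false; // every file goes to a single Mapper, unsplit
+ }
+ }
+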
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+ Tasks' Side-Effect Files
+
+
+ Note: The following is valid only if the {@link OutputCommitter}
+ is {@link FileOutputCommitter}. If OutputCommitter is not
+ a FileOutputCommitter, the task's temporary output
+ directory is the same as {@link #getOutputPath(JobConf)} i.e.
+ ${mapred.output.dir}
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
+ The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of a reduce-task, i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus the writer doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_${taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
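+
+ A minimal sketch of writing a side-file from within a task (the file name
+ side.txt is illustrative):
+
+ Path workDir = FileOutputFormat.getWorkOutputPath(job);
+ FileSystem fs = workDir.getFileSystem(job);
+ // files created here are promoted to ${mapred.output.dir} on success
+ FSDataOutputStream side = fs.create(new Path(workDir, "side.txt"));
+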
+
+ The generated name can be used to create custom files from within the
+ different tasks for the job, the names for different tasks will not collide
+ with each other.
+
+
+ The given name is postfixed with the task type, 'm' for maps, 'r' for
+ reduces and the task partition number. For example, given the name 'test'
+ running on the first map of the job, the generated name will be
+ 'test-m-00000'.
+
+ @param conf the configuration for the job.
+ @param name the name to make unique.
+ @return a unique name across all tasks of the job.]]>
+
+
+
+
+
+
+ The path can be used to create custom files from within the map and
+ reduce tasks. The path name will be unique for each task. The path parent
+ will be the job output directory.
+
+
+ This method uses the {@link #getUniqueName} method to make the file name
+ unique for the task.
+
+ @param conf the configuration for the job.
+ @param name the name for the file.
+ @return a unique path across all tasks of the job.]]>
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
+ Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For example, a split could
+ be a <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
+ Clearly, logical splits based on input-size are insufficient for many
+ applications since record boundaries must be respected. In such cases, the
+ application also has to implement a {@link RecordReader} on which lies the
+ responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ its status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
+ However, this also means that the onus of ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then polls the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
+
+
+ @see JobConf
+ @see ClusterStatus
+ @see Tool
+ @see DistributedCache]]>
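+
+ A sketch of the submit-and-poll variant (assumes a fully configured JobConf
+ named job; interrupt handling is elided):
+
+ JobClient client = new JobClient(job);
+ RunningJob running = client.submitJob(job);
+ while (!running.isComplete()) {
+ Thread.sleep(5000); // poll every five seconds
+ }
+ if (!running.isSuccessful()) {
+ throw new IOException("Job failed: " + running.getJobName());
+ }
+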
+
+ If the parameter {@code loadDefaults} is false, the new instance
+ will not load resources from the default files.
+
+ @param loadDefaults specifies whether to load from the default files]]>
+
+ true if framework should keep the intermediate files
+ for failed tasks, false otherwise.]]>
+
+ true if the outputs of the maps are to be compressed,
+ false otherwise.]]>
+
+ This comparator should be provided if the equivalence rules for keys
+ for sorting the intermediates are different from those for grouping keys
+ before each call to
+ {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.
+
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
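+
+ A sketch of the usual secondary-sort wiring (the three classes named here
+ stand in for user-written implementations):
+
+ job.setPartitionerClass(NaturalKeyPartitioner.class); // route by natural key
+ job.setOutputKeyComparatorClass(FullKeyComparator.class); // sort by full key
+ job.setOutputValueGroupingComparator(NaturalKeyComparator.class); // group
+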
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is the same as
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
+ Typically the combiner is the same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution should be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution should be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution should be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of map tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes a while, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
+ In this case the output of the map-tasks goes directly to the distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing them out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
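+
+ A sketch of the 0.95 rule of thumb (the cluster numbers are illustrative):
+
+ int nodes = 100; // tasktrackers in the cluster
+ int reduceSlotsPerNode = 2; // mapred.tasktracker.reduce.tasks.maximum
+ job.setNumReduceTasks((int) (0.95 * nodes * reduceSlotsPerNode));
+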
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
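+
+ A sketch of wiring this up (the cache URI and script name are illustrative):
+
+ DistributedCache.createSymlink(job); // the script must be symlinked
+ DistributedCache.addCacheFile(new URI("/debug/dump.sh#dump.sh"), job);
+ job.setMapDebugScript("./dump.sh");
+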
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
+ The debug command, run on the node where the reduce failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked.
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
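+
+ For example, a sketch using both substitution parameters (the URL is
+ illustrative):
+
+ job.setJobEndNotificationURI(
+ "http://workflow.example.com/notify?id=$jobId&status=$jobStatus");
+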
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is also available as a System property.
+
+ @return The localized job specific shared directory]]>
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly with
+ the rest of the framework and/or job-configuration and are relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
+ Optionally JobConf is used to specify other advanced facets
+ of the job such as the Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether intermediate and/or job outputs
+ are to be compressed (and how), debuggability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)})
+ for doing post-processing on task logs, the task's stdout, stderr and syslog,
+ etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see JobTracker#getNewJobId()
+ @see JobTracker#getStartTime()]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
+ Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the outputrecords.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+ MultiFileSplit can be used to implement {@link RecordReader}s that
+ read one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
The Map-Reduce framework relies on the OutputCommitter of
+ the job to:
+
+
+ Setup the job during initialization. For example, create the temporary
+ output directory for the job during the initialization of the job.
+
+
+ Cleanup the job after the job completion. For example, remove the
+ temporary output directory after the job completion.
+
+
+ Setup the task temporary output.
+
+
+ Check whether a task needs a commit. This is to avoid the commit
+ procedure if a task does not need commit.
+
+
+ Commit of the task output.
+
+
+ Discard the task commit.
+
+
+
+ @see FileOutputCommitter
+ @see JobContext
+ @see TaskAttemptContext]]>
+
+ This is to validate the output specification for the job when the
+ job is submitted. Typically this checks that the output does not already
+ exist, throwing an exception when it does, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
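+
+ A sketch of a partitioner that routes on part of the key
+ (FirstWordPartitioner is a hypothetical class):
+
+ public class FirstWordPartitioner implements Partitioner<Text, Text> {
+ public void configure(JobConf job) {}
+
+ public int getPartition(Text key, Text value, int numPartitions) {
+ String first = key.toString().split(" ")[0];
+ // mask the sign bit so the index is never negative
+ return (first.hashCode() & Integer.MAX_VALUE) % numPartitions;
+ }
+ }
+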
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore the application should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+ Counter of the given group/name.]]>
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's cleanup-tasks, as a float between 0.0
+ and 1.0. When all cleanup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's cleanup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's setup-tasks, as a float between 0.0
+ and 1.0. When all setup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's setup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output value class.]]>
+
+ f. The filtering criteria is
+ MD5(key) % f == 0.]]>
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_MAP_PROCESSED_RECORDS}.
+ false otherwise.]]>
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_REDUCE_PROCESSED_GROUPS}.
+ false otherwise.]]>
+
+ Hadoop provides an optional mode of execution in which the bad records
+ are detected and skipped in further attempts.
+
+
+ This feature can be used when map/reduce tasks crash deterministically on
+ certain input. This happens due to bugs in the map/reduce function. The usual
+ course would be to fix these bugs. But sometimes this is not possible;
+ perhaps the bug is in third-party libraries for which the source code is
+ not available. In that case, the task never reaches completion even with
+ multiple attempts, and the complete data for that task is lost.
+
+
With this feature, only a small portion of data is lost surrounding
+ the bad record, which may be acceptable for some user applications.
+ see {@link SkipBadRecords#setMapperMaxSkipRecords(Configuration, long)}
+
+
+ The skipping mode gets kicked off after a certain number of failures;
+ see {@link SkipBadRecords#setAttemptsToStartSkipping(Configuration, int)}
+
+
+ In skipping mode, the map/reduce task maintains the record range which
+ is being processed at all times. Before giving the input to the
+ map/reduce function, it sends this record range to the TaskTracker.
+ If the task crashes, the TaskTracker knows which range was last reported.
+ On further attempts that range gets skipped.
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
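+
+ A sketch of turning the mode on (the thresholds are illustrative):
+
+ // begin narrowing down the bad record after two failed attempts
+ SkipBadRecords.setAttemptsToStartSkipping(conf, 2);
+ // accept losing at most one record around each bad record
+ SkipBadRecords.setMapperMaxSkipRecords(conf, 1);
+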
+
+
+
+
+ An example TaskAttemptID is :
+ attempt_200707121733_0003_m_000005_0 , which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings,
+ but rather use appropriate constructors or the {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+ An example TaskID is :
+ task_200707121733_0003_m_000005 , which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings,
+ but rather use appropriate constructors or the {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+ mapred.join.define.<ident> to a classname. In the expression
+ mapred.join.expr, the identifier will be assumed to be a
+ ComposableRecordReader.
+ mapred.join.keycomparator can be a classname used to compare keys
+ in the join.
+ @see JoinRecordReader
+ @see MultiFilterRecordReader]]>
+
+ capacity children to position
+ id in the parent reader.
+ The id of a root CompositeRecordReader is -1 by convention, but relying
+ on this is not recommended.]]>
+
+ override(S1,S2,S3) will prefer values
+ from S3 over S2, and values from S2 over S1 for all keys
+ emitted from all sources.]]>
+
+ [,,...,]]]>
+
+
+
+
+
+
+ out.
+ TupleWritable format:
+ {@code
+ ......
+ }]]>
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Mapper leverages the
+ assumed semantics that the key and values are not modified by the collector,
+ 'by value' must be used. If the Mapper does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Mapper the configuration given for it,
+ mapperConf, has precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain
+
+
+ @param job job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overridden, super.configure(...) should be
+ invoked at the beginning of the overriding method.]]>
+
+
+
+
+
+
+
+
+
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overridden, super.close() should be
+ invoked at the end of the overriding method.]]>
+
+
+
+
+ The Mapper classes are invoked in a chained (or piped) fashion: the output of
+ the first becomes the input of the second, and so on until the last Mapper;
+ the output of the last Mapper will be written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do not
+ need to be aware that they are executed in a chain. This enables having
+ reusable specialized Mappers that can be combined to perform composite
+ operations within a single task.
+
+ Special care has to be taken when creating chains, as the key/values output
+ by a Mapper must be valid for the following Mapper in the chain. It is assumed
+ all Mappers and the Reduce in the chain use matching output and input key and
+ value classes, as no conversion is done by the chaining code.
+
+ Using the ChainMapper and the ChainReducer classes it is possible to compose
+ Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain.
+
+ ChainMapper usage pattern:
+
+
]]>
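+
+ A sketch of the pattern (ChainJob, AMap and BMap stand in for user-written
+ classes):
+
+ JobConf conf = new JobConf(new Configuration(), ChainJob.class);
+ conf.setJobName("chain");
+
+ JobConf mapAConf = new JobConf(false);
+ ChainMapper.addMapper(conf, AMap.class, LongWritable.class, Text.class,
+ Text.class, Text.class, true, mapAConf);
+
+ JobConf mapBConf = new JobConf(false);
+ ChainMapper.addMapper(conf, BMap.class, Text.class, Text.class,
+ LongWritable.class, Text.class, false, mapBConf);
+
+ JobClient.runJob(conf);
+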
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Reducer leverages the
+ assumed semantics that the key and values are not modified by the collector,
+ 'by value' must be used. If the Reducer does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Reducer the configuration given for it,
+ reducerConf, has precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainReducer, this is done by the setReducer or the addMapper for the last
+ element in the chain.
+
+ @param job job's JobConf to add the Reducer class.
+ @param klass the Reducer class to add.
+ @param inputKeyClass reducer input key class.
+ @param inputValueClass reducer input value class.
+ @param outputKeyClass reducer output key class.
+ @param outputValueClass reducer output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param reducerConf a JobConf with the configuration for the Reducer
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Mapper leverages the
+ assumed semantics that the key and values are not modified by the collector
+ 'by value' must be used. If the Mapper does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Mapper the configuration given for it,
+ mapperConf, has precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain
+ .
+
+ @param job chain job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overridden, super.configure(...) should be
+ invoked at the beginning of the overriding method.]]>
+
+
+
+
+
+
+
+
+
+ reduce(...) method of the Reducer with the
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overridden, super.close() should be
+ invoked at the end of the overriding method.]]>
+
+
+
+
+ For each record output by the Reducer, the Mapper classes are invoked in a
+ chained (or piped) fashion: the output of the first becomes the input of the
+ second, and so on until the last Mapper; the output of the last Mapper will
+ be written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do not
+ need to be aware that they are executed after the Reducer or in a chain.
+ This enables having reusable specialized Mappers that can be combined to
+ perform composite operations within a single task.
+
+ Special care has to be taken when creating chains, as the key/values output
+ by a Mapper must be valid for the following Mapper in the chain. It is assumed
+ all Mappers and the Reduce in the chain use matching output and input key and
+ value classes, as no conversion is done by the chaining code.
+
+ Using the ChainMapper and the ChainReducer classes it is possible to compose
+ Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainReducer, this is done by the setReducer or the addMapper for the last
+ element in the chain.
+
+ ChainReducer usage pattern:
+
+
]]>
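+
+ A sketch of the pattern, continuing the conf from the ChainMapper example
+ (XReduce and CMap stand in for user-written classes):
+
+ JobConf reduceConf = new JobConf(false);
+ ChainReducer.setReducer(conf, XReduce.class, LongWritable.class, Text.class,
+ Text.class, Text.class, true, reduceConf);
+
+ ChainReducer.addMapper(conf, CMap.class, Text.class, Text.class,
+ LongWritable.class, Text.class, false, null);
+
+ JobClient.runJob(conf);
+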
+
+ all splits.
+ @param freq The frequency with which records will be emitted.]]>
+
+ all splits.
+ This will read every split at the client, which is very expensive.
+ @param freq Probability with which a key will be chosen.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+ all splits.
+ Takes the first numSamples / numSplits records from each split.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+ true if the name output is multi, false
+ if it is single. If the name output is not defined it returns
+ false]]>
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name, it has to be a word, letters
+ and numbers only, cannot be the word 'part' as
+ that is reserved for the
+ default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name, it has to be a word, letters
+ and numbers only, cannot be the word 'part' as
+ that is reserved for the
+ default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters; by default they are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, an underscore '_' and the multiname.
+
+ @param conf job conf in which to enable the named output counters.
+ @param enabled indicates if the counters will be enabled or not.]]>
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters; by default they are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, an underscore '_' and the multiname.
+
+
+ @param conf job conf in which to check the named output counters.
+ @return TRUE if the counters are enabled, FALSE if they are disabled.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+ @param namedOutput the named output name
+ @param multiName the multi name part
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+ If overridden, subclasses must invoke super.close() at the
+ end of their close()
+
+ @throws java.io.IOException thrown if any of the MultipleOutput files
+ could not be closed properly.]]>
+
+
+
+ OutputCollector passed to
+ the map() and reduce() methods of the
+ Mapper and Reducer implementations.
+
+ Each additional output, or named output, may be configured with its own
+ OutputFormat, with its own key class and with its own value
+ class.
+
+ A named output can be a single file or a multi file. The latter is referred
+ to as a multi named output.
+
+ A multi named output is an unbound set of files all sharing the same
+ OutputFormat, key class and value class configuration.
+
+ When named outputs are used within a Mapper implementation,
+ key/values written to a named output are not part of the reduce phase, only
+ key/values written to the job OutputCollector are part of the
+ reduce phase.
+
+ MultipleOutputs supports counters; by default they are disabled. The counters
+ group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, an underscore '_' and the multiname.
+
+ Job configuration usage pattern is:
+
+
+ JobConf conf = new JobConf();
+
+ conf.setInputPath(inDir);
+ FileOutputFormat.setOutputPath(conf, outDir);
+
+ conf.setMapperClass(MOMap.class);
+ conf.setReducerClass(MOReduce.class);
+ ...
+
+ // Defines additional single text based output 'text' for the job
+ MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
+ LongWritable.class, Text.class);
+
+ // Defines additional multi sequencefile based output 'sequence' for the
+ // job
+ MultipleOutputs.addMultiNamedOutput(conf, "seq",
+ SequenceFileOutputFormat.class,
+ LongWritable.class, Text.class);
+ ...
+
+ JobClient jc = new JobClient();
+ RunningJob job = jc.submitJob(conf);
+
+ ...
+
+
+ Usage in Reducer:
+
+
+ public class MOReduce implements
+ Reducer<WritableComparable, Writable> {
+ private MultipleOutputs mos;
+
+ public void configure(JobConf conf) {
+ ...
+ mos = new MultipleOutputs(conf);
+ }
+
+ public void reduce(WritableComparable key, Iterator<Writable> values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ ...
+ mos.getCollector("text", reporter).collect(key, new Text("Hello"));
+ mos.getCollector("seq", "A", reporter).collect(key, new Text("Bye"));
+ mos.getCollector("seq", "B", reporter).collect(key, new Text("Chau"));
+ ...
+ }
+
+ public void close() throws IOException {
+ mos.close();
+ ...
+ }
+
+ }
+
]]>
+
+ It can be used instead of the default implementation,
+ {@link org.apache.hadoop.mapred.MapRunner}, when the Map operation is not CPU
+ bound in order to improve throughput.
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of threads the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property; its default
+ value is 10 threads.
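+
+ A sketch of the configuration (the thread count of 20 is illustrative):
+
+ conf.setMapRunnerClass(MultithreadedMapRunner.class);
+ conf.setInt("mapred.map.multithreadedrunner.threads", 20);
+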
+
+ Alternatively, the properties can be set in the configuration with proper
+ values.
+
+ @see DBConfiguration#configureDB(JobConf, String, String, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String, String, String...)
+ @see DBOutputFormat#setOutput(JobConf, String, String...)]]>
+
+ 20070101 AND length > 0)'
+ @param orderBy the fieldNames in the orderBy clause.
+ @param fieldNames The field names in the table
+ @see #setInput(JobConf, Class, String, String)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBInputFormat emits LongWritables containing the record number as
+ key and DBWritables as value.
+
+ The SQL query and input class can be set using one of the two
+ setInput methods.]]>
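+
+ A minimal job-setup sketch (MyJob and the JDBC driver, URL and credentials
+ are illustrative; MyTable and MyWritable refer to the DBWritable example
+ further below):
+
+ JobConf job = new JobConf(MyJob.class);
+ job.setInputFormat(DBInputFormat.class);
+ DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
+ "jdbc:mysql://localhost/mydb", "user", "password");
+ // Read all rows of MyTable, ordered by the counter column.
+ DBInputFormat.setInput(job, MyWritable.class,
+ "MyTable", null, "counter", "counter", "timestamp");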
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {@link DBOutputFormat} accepts <key,value> pairs, where
+ key has a type extending DBWritable. Returned {@link RecordWriter}
+ writes only the key to the database with a batch SQL query.]]>
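+
+ A minimal job-setup sketch under the same illustrative assumptions as the
+ DBInputFormat example above (driver, URL, credentials and table are
+ placeholders):
+
+ JobConf job = new JobConf(MyJob.class);
+ job.setOutputFormat(DBOutputFormat.class);
+ DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
+ "jdbc:mysql://localhost/mydb", "user", "password");
+ // Field names must match the order used in
+ // DBWritable.write(PreparedStatement).
+ DBOutputFormat.setOutput(job, "MyTable", "counter", "timestamp");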
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBWritable. DBWritable is similar to {@link Writable}
+ except that the {@link #write(PreparedStatement)} method takes a
+ {@link PreparedStatement}, and {@link #readFields(ResultSet)}
+ takes a {@link ResultSet}.
+
+ Implementations are responsible for writing the fields of the object
+ to PreparedStatement, and reading the fields of the object from the
+ ResultSet.
+
+
Example:
+ If we have the following table in the database:
+
+ CREATE TABLE MyTable (
+ counter INTEGER NOT NULL,
+ timestamp BIGINT NOT NULL
+ );
+
+ then we can read/write the tuples from/to the table with:
+
+ public class MyWritable implements Writable, DBWritable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ //Writable#write() implementation
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ //Writable#readFields() implementation
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public void write(PreparedStatement statement) throws SQLException {
+ statement.setInt(1, counter);
+ statement.setLong(2, timestamp);
+ }
+
+ public void readFields(ResultSet resultSet) throws SQLException {
+ counter = resultSet.getInt(1);
+ timestamp = resultSet.getLong(2);
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
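+
+ A minimal sketch of that procedure, using the "diskStats" example above
+ (the context name and values are illustrative):
+
+ MetricsContext context = MetricsUtil.getContext("myContext");
+ MetricsRecord diskStats = MetricsUtil.createRecord(context, "diskStats");
+ diskStats.setTag("diskName", "/dev/sda1");
+ diskStats.setMetric("diskPercentFull", 42);
+ // Buffered locally; emitted to the metrics system on the next
+ // timer period.
+ diskStats.update();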
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
+ myContextName.fileName=/tmp/metrics.log
+ myContextName.period=5
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
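+
+ A minimal sketch of such a subclass (the class name is illustrative; only
+ the abstract emitRecord method named above is overridden):
+
+ public class StdoutContext extends AbstractMetricsContext {
+ protected void emitRecord(String contextName, String recordName,
+ OutputRecord outRec) throws IOException {
+ // Print each buffered row instead of sending it to a real sink.
+ System.out.println(contextName + "." + recordName + ": "
+ + outRec.getMetricNames());
+ }
+ }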
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ "hadoop:service=<serviceName>,name=<nameName>"
+ Where the <serviceName> and <nameName> are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the name used to register the MBean]]>
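+
+ A minimal sketch of a call matching the format above (the service and
+ name strings are illustrative):
+
+ ObjectName beanName =
+ MBeanUtil.registerMBean("MyService", "MyServiceStats", statsMbean);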
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
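+
+ A minimal sketch of the lookup just described (MyProtocol is an
+ illustrative protocol class):
+
+ SocketFactory factory = NetUtils.getSocketFactory(conf, MyProtocol.class);
+ Socket socket = factory.createSocket();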
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ <host>:<port>
+ <fs>://<host>:<port>/<path>]]>
+
+
+
+
+
+
+
+ <host>:<port>
+ <fs>://<host>:<port>/<path>]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
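+
+ A minimal sketch of the pattern described above (the 30 second timeout is
+ illustrative):
+
+ InputStream in = NetUtils.getInputStream(socket, 30000);
+ OutputStream out = NetUtils.getOutputStream(socket, 30000);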
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. Zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior; default is true).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ in]]>
+
+
+
+
+
+
+ out.]]>
+
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
Generic command line arguments might modify
+ the Configuration objects given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param The type of the argument
+ @param t the object to get its class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ io.file.buffer.size specified in the given
+ Configuration.
+ @param in input stream
+ @param conf configuration
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take a significant amount of time since,
+ in-lieu of the reported progress, the framework has to assume that an error
+ has occurred and time-out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform; otherwise
+ (e.g. in Cygwin/Windows) it returns null.
+ @param conf configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
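+
+ A minimal sketch of a call (the command is illustrative):
+
+ String output = Shell.execCommand("ls", "-l");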
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutor should be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remain unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
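+
+ A minimal sketch of such a use (the command is illustrative):
+
+ ShellCommandExecutor executor =
+ new ShellCommandExecutor(new String[]{"df", "-k"});
+ executor.execute();
+ String output = executor.getOutput();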
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
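+
+ A minimal sketch (the input string and characters are illustrative):
+
+ // Yields "a\,b" so the comma survives a later split on ','.
+ String escaped = StringUtils.escapeString("a,b", '\\', ',');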
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.1.xml b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.1.xml
new file mode 100644
index 0000000000..92bdd2c799
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.19.1.xml
@@ -0,0 +1,44195 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param in InputStream to deserialize the object from.]]>
+
+
+
+
+
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
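+
+ A minimal sketch of the set/get pattern described above (the property
+ name is illustrative):
+
+ Configuration conf = new Configuration();
+ conf.setInt("my.example.count", 5);
+ // Returns 5; the second argument is the default if the key is absent
+ // or not a valid int.
+ int count = conf.getInt("my.example.count", 1);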
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then an empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property
+ as an array of Class.
+ The value of the property specifies a list of comma separated class names.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the property name.
+ @param defaultValue default value.
+ @return property value as a Class[],
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
Unless explicitly turned off, Hadoop by default specifies two
+ resources, loaded in-order from the classpath:
+ 1. hadoop-default.xml: Read-only defaults for hadoop.
+ 2. hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+ <property>
+ <name>dfs.client.buffer.dir</name>
+ <value>/tmp/hadoop/dfs/client</value>
+ <final>true</final>
+ </property>
+
+ Value strings are first processed for variable expansion. For example,
+ a configuration resource might define:
+
+ <property>
+ <name>basedir</name>
+ <value>/user/${user.name}</value>
+ </property>
+
+ <property>
+ <name>tempdir</name>
+ <value>${basedir}/tmp</value>
+ </property>
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link org.apache.hadoop.mapred.JobConf}.
+ The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip"), job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz"), job);
+
+ 3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
+ or {@link org.apache.hadoop.mapred.Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a path pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is DistributedFileSystem.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf destination buffer
+ @param offset offset in buf at which to store data
+ @param len maximum number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the position to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destination buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksummed. The buffer gets checksummed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and
+ requested length is at least as large as the size of next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids unnecessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
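+
+ A minimal sketch of the store/load round trip (the key name is
+ illustrative):
+
+ Configuration conf = new Configuration();
+ DefaultStringifier.store(conf, new IntWritable(42), "my.stored.object");
+ IntWritable restored =
+ DefaultStringifier.load(conf, "my.stored.object", IntWritable.class);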
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have same Key type but different Value
+ types, are mapped out to reduce, multiple Value types are not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more effective,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), which defines
+ the classes that will be wrapped in GenericObject in the application.
+ Attention: the classes defined in the getTypes() method must
+ implement the Writable interface.
+
+
+ @since Nov 8, 2006]]>
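+
+ A minimal sketch of the two steps above (the class and wrapped types are
+ illustrative):
+
+ public class GenericObject extends GenericWritable {
+ private static Class<? extends Writable>[] TYPES =
+ new Class[] { IntWritable.class, Text.class };
+
+ protected Class<? extends Writable>[] getTypes() {
+ return TYPES;
+ }
+ }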
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStrem to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
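+
+ A minimal sketch of a copy (the paths are illustrative; both streams are
+ closed when the copy finishes):
+
+ IOUtils.copyBytes(fs.open(src), fs.create(dst), conf);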
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
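+
+ A minimal write/read sketch (the path and entries are illustrative; keys
+ must be appended in sorted order):
+
+ MapFile.Writer writer = new MapFile.Writer(conf, fs, "/tmp/mymap",
+ Text.class, IntWritable.class);
+ writer.append(new Text("apple"), new IntWritable(1));
+ writer.append(new Text("banana"), new IntWritable(2));
+ writer.close();
+
+ MapFile.Reader reader = new MapFile.Reader(fs, "/tmp/mymap", conf);
+ IntWritable value = new IntWritable();
+ reader.get(new Text("banana"), value);
+ reader.close();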
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to choose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
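+
+ A minimal sketch using the recommended static factory (the path,
+ key/value types and compression type are illustrative):
+
+ SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
+ new Path("/tmp/data.seq"), Text.class, IntWritable.class,
+ SequenceFile.CompressionType.BLOCK);
+ writer.append(new Text("key"), new IntWritable(1));
+ writer.close();
+
+ SequenceFile.Reader reader =
+ new SequenceFile.Reader(fs, new Path("/tmp/data.seq"), conf);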
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting at position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
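+
+ A minimal sketch of a find call (the strings are illustrative):
+
+ Text text = new Text("hadoop");
+ int pos = text.find("doo"); // byte offset 2, or -1 if absent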
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOuput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deseriablize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.counter;
+ int thatValue = w.counter;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input streamin
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when fewer than len bytes could be skipped]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
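+
+ A minimal sketch of the borrow/return cycle (codec, fsOut and data are
+ assumed to exist already); the returnCompressor call in the finally
+ block is what keeps the pool from leaking:
+
+ Compressor compressor = CodecPool.getCompressor(codec);
+ try {
+ OutputStream out = codec.createOutputStream(fsOut, compressor);
+ out.write(data);
+ out.close();
+ } finally {
+ CodecPool.returnCompressor(compressor);
+ }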
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
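+
+ A minimal sketch of driving a Compressor by hand (codec, data and rawOut
+ are assumed to exist; the buffer size is arbitrary): feed the input,
+ signal finish(), then drain until finished() reports true.
+
+ Compressor compressor = codec.createCompressor();
+ compressor.setInput(data, 0, data.length);
+ compressor.finish();
+ byte[] buf = new byte[64 * 1024];
+ while (!compressor.finished()) {
+ int n = compressor.compress(buf, 0, buf.length);
+ rawOut.write(buf, 0, n);
+ }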
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair compatible with lzop.
+ http://www.lzop.org/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FIXME: This array should be in a private or package private location,
+ since it could be modified by malicious code.
+ ]]>
+
+
+
+
+ This interface is public for historical purposes. You should have no need to
+ use it.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Although BZip2 headers are marked with the magic "BZ" this
+ constructor expects the next byte in the stream to be the first one after
+ the magic. Thus callers have to skip the first two bytes. Otherwise this
+ constructor will throw an exception.
+
+
+ @throws IOException
+ if the stream content is malformed or an I/O error occurs.
+ @throws NullPointerException
+ if in == null]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The decompression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2InputStream to release the allocated memory. See
+ {@link CBZip2OutputStream CBZip2OutputStream} for information about memory
+ usage.
+
+
+
+ CBZip2InputStream reads bytes from the compressed source stream via
+ the single byte {@link java.io.InputStream#read() read()} method exclusively.
+ Thus you should consider using a buffered source stream.
+
+
+
+ Instances of this class are not threadsafe.
+
]]>
+
+
+
+
+
+
+
+
+
+ CBZip2OutputStream with a blocksize of 900k.
+
+
+ Attention: The caller is responsible for writing the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+ @param out
+ the destination stream.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws NullPointerException
+ if out == null.]]>
+
+
+
+
+
+ CBZip2OutputStream with specified blocksize.
+
+
+ Attention: The caller is responsible for writing the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+
+ @param out
+ the destination stream.
+ @param blockSize
+ the blockSize as 100k units.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws IllegalArgumentException
+ if (blockSize < 1) || (blockSize > 9).
+ @throws NullPointerException
+ if out == null.
+
+ @see #MIN_BLOCKSIZE
+ @see #MAX_BLOCKSIZE]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ inputLength this method always returns
+ MAX_BLOCKSIZE.
+
+ @param inputLength
+ The length of the data which will be compressed by
+ CBZip2OutputStream.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ == 1.]]>
+
+
+
+
+ == 9.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If you are ever unlucky/improbable enough to get a stack overflow whilst
+ sorting, increase the following constant and try again. In practice I
+ have never seen the stack go above 27 elems, so the following limit seems
+ very generous.
+ ]]>
+
+
+
+
+ The compression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2OutputStream to release the allocated memory.
+
+
+
+ You can shrink the amount of allocated memory and maybe raise the compression
+ speed by choosing a lower blocksize, which in turn may cause a lower
+ compression ratio. You can avoid unnecessary memory allocation by avoiding
+ using a blocksize which is bigger than the size of the input.
+
+
+
+ You can compute the memory usage for compressing by the following formula:
+
+
+
+ <code>400k + (9 * blocksize)</code>.
+
+
+
+ To get the memory required for decompression by {@link CBZip2InputStream
+ CBZip2InputStream} use
+
+
+
+ <code>65k + (5 * blocksize)</code>.
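+
+ As a quick check of these formulas: for the default 900k blocksize,
+ compression needs roughly 400k + (9 * 900k) = 8500k and decompression
+ 65k + (5 * 900k) = 4565k, matching the last row of the table below.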
+
+
+
+
+
+
+
Memory usage by blocksize:
+
+ Blocksize    Compression memory usage    Decompression memory usage
+ 100k         1300k                        565k
+ 200k         2200k                       1065k
+ 300k         3100k                       1565k
+ 400k         4000k                       2065k
+ 500k         4900k                       2565k
+ 600k         5800k                       3065k
+ 700k         6700k                       3565k
+ 800k         7600k                       4065k
+ 900k         8500k                       4565k
+
+
+
+
+ For decompression CBZip2InputStream allocates less memory if the
+ bzipped input is smaller than one block.
+
+
+
+ Instances of this class are not threadsafe.
+
+
+
+ TODO: Update to BZip2 1.0.1
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
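+
+ A minimal sketch (MyService and MyServiceImpl are hypothetical names)
+ of wrapping an implementation so every failed call is retried up to 4
+ times with a one-second pause between attempts:
+
+ RetryPolicy policy =
+ RetryPolicies.retryUpToMaximumCountWithFixedSleep(4, 1, TimeUnit.SECONDS);
+ MyService service =
+ (MyService) RetryProxy.create(MyService.class, new MyServiceImpl(), policy);
+ service.doWork(); // transparently retried on exception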
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type T from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
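+
+ A minimal configuration sketch: register the serializations, then let
+ the factory pick the right one per class. WritableSerialization is the
+ stock implementation; JavaSerialization is the experimental
+ Serializable-based one noted elsewhere in this package.
+
+ Configuration conf = new Configuration();
+ conf.setStrings("io.serializations",
+ "org.apache.hadoop.io.serializer.WritableSerialization",
+ "org.apache.hadoop.io.serializer.JavaSerialization");
+ SerializationFactory factory = new SerializationFactory(conf);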
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type T to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
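+
+ A minimal sketch of a conforming protocol (EchoProtocol, EchoImpl,
+ host and port are made-up placeholders). Every parameter and return
+ type is a String, Writable, primitive, or array of these:
+
+ public interface EchoProtocol extends VersionedProtocol {
+ long VERSION = 1L;
+ Text echo(Text message) throws IOException;
+ }
+
+ // Server side: RPC.getServer(new EchoImpl(), "0.0.0.0", 9000, conf).start();
+ // Client side: EchoProtocol proxy = (EchoProtocol) RPC.getProxy(
+ //     EchoProtocol.class, EchoProtocol.VERSION,
+ //     new InetSocketAddress("host", 9000), conf);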
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
Group handles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splittable?]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
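+
+ A minimal sketch of that override (the class name is illustrative and
+ LineRecordReader is just a stand-in record reader; the usual
+ org.apache.hadoop.mapred imports are assumed):
+
+ public class WholeFileInputFormat
+ extends FileInputFormat<LongWritable, Text> {
+
+ protected boolean isSplitable(FileSystem fs, Path filename) {
+ return false; // one Mapper consumes each file whole
+ }
+
+ public RecordReader<LongWritable, Text> getRecordReader(
+ InputSplit split, JobConf job, Reporter reporter) throws IOException {
+ return new LineRecordReader(job, (FileSplit) split);
+ }
+ }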
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Note: The following is valid only if the {@link OutputCommitter}
+ is {@link FileOutputCommitter}. If OutputCommitter is not
+ a FileOutputCommitter, the task's temporary output
+ directory is the same as {@link #getOutputPath(JobConf)} i.e.
+ ${mapred.output.dir}
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of their reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus they don't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
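+
+ A minimal sketch of creating a side-file from a task (the file name
+ and contents are illustrative); because it is created under the work
+ output path, the framework promotes it exactly as described above:
+
+ Path workDir = FileOutputFormat.getWorkOutputPath(job);
+ Path sideFile = new Path(workDir, "side-data");
+ FSDataOutputStream out = FileSystem.get(job).create(sideFile);
+ out.writeBytes("extra output\n");
+ out.close();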
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The generated name can be used to create custom files from within the
+ different tasks for the job, the names for different tasks will not collide
+ with each other.
+
+
The given name is postfixed with the task type, 'm' for maps, 'r' for
+ reduces, and the task partition number. For example, given a name 'test'
+ running on the first map of the job, the generated name will be
+ 'test-m-00000'.
+
+ @param conf the configuration for the job.
+ @param name the name to make unique.
+ @return a unique name across all tasks of the job.]]>
+
+
+
+
+
+
+ The path can be used to create custom files from within the map and
+ reduce tasks. The path name will be unique for each task. The path parent
+ will be the job output directory.
+
+
This method uses the {@link #getUniqueName} method to make the file name
+ unique for the task.
+
+ @param conf the configuration for the job.
+ @param name the name for the file.
+ @return a unique path across all tasks of the job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For example, a split could
+ be an <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size are insufficient for many
+ applications since record boundaries are to be respected. In such cases, the
+ application also has to implement a {@link RecordReader} on whom lies the
+ responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ its status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
+
+
+ @see JobConf
+ @see ClusterStatus
+ @see Tool
+ @see DistributedCache]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If the parameter {@code loadDefaults} is false, the new instance
+ will not load resources from the default files.
+
+ @param loadDefaults specifies whether to load from the default files]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if framework should keep the intermediate files
+ for failed tasks, false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the outputs of the maps are to be compressed,
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This comparator should be provided if the equivalence rules for keys
+ for sorting the intermediates are different from those for grouping keys
+ before each call to
+ {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.
+
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
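+
+ A minimal sketch of wiring the two comparators together (job is the
+ JobConf; both comparator classes are hypothetical): sort by the full
+ composite key, but group reduce inputs by its natural part only.
+
+ job.setOutputKeyComparatorClass(CompositeKeyComparator.class);
+ job.setOutputValueGroupingComparator(NaturalKeyGroupingComparator.class);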
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is the same as
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is the same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes a while, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
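+
+ Checking that estimate: 10TB is 10 * 1024 * 1024 = 10,485,760MB, and
+ 10,485,760MB / 128MB per split = 81,920 maps, i.e. the 82,000 quoted
+ above.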
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks goes directly to the distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing them out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
+ The debug command, run on the node where the reduce failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked.
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is also available as a System property.
+
+ @return The localized job specific shared directory]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly
+ with the rest of the framework and/or job-configuration and are relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debugability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}),
+ for doing post-processing on task logs, task's stdout, stderr, syslog,
+ etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see JobTracker#getNewJobId()
+ @see JobTracker#getStartTime()]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the outputrecords.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s that,
+ for example, read one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
The Map-Reduce framework relies on the OutputCommitter of
+ the job to:
+
+
+ Setup the job during initialization. For example, create the temporary
+ output directory for the job during the initialization of the job.
+
+
+ Cleanup the job after the job completion. For example, remove the
+ temporary output directory after the job completion.
+
+
+ Setup the task temporary output.
+
+
+ Check whether a task needs a commit. This is to avoid the commit
+ procedure if a task does not need commit.
+
+
+ Commit of the task output.
+
+
+ Discard the task commit.
+
+
+
+ @see FileOutputCommitter
+ @see JobContext
+ @see TaskAttemptContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when it is
+ submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
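+
+ A minimal sketch of such a hash-based partitioner (essentially what the
+ stock HashPartitioner does; the class name is illustrative); masking
+ with Integer.MAX_VALUE keeps the result non-negative for negative hash
+ codes:
+
+ public class MyPartitioner<K, V> implements Partitioner<K, V> {
+ public void configure(JobConf job) { }
+
+ public int getPartition(K key, V value, int numPartitions) {
+ return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
+ }
+ }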
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore the application should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Counter of the given group/name.]]>
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's cleanup-tasks, as a float between 0.0
+ and 1.0. When all cleanup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's cleanup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's setup-tasks, as a float between 0.0
+ and 1.0. When all setup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's setup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output value class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f. The filtering criteria is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_MAP_PROCESSED_RECORDS}.
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_REDUCE_PROCESSED_GROUPS}.
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop provides an optional mode of execution in which the bad records
+ are detected and skipped in further attempts.
+
+
+ This feature can be used when map/reduce tasks crash deterministically on
+ certain input. This happens due to bugs in the map/reduce function. The usual
+ course would be to fix these bugs. But sometimes this is not possible;
+ perhaps the bug is in third party libraries for which the source code is
+ not available. Due to this, the task never reaches completion even with
+ multiple attempts, and the complete data for that task is lost.
+
+
With this feature, only a small portion of data is lost surrounding
+ the bad record, which may be acceptable for some user applications.
+ see {@link SkipBadRecords#setMapperMaxSkipRecords(Configuration, long)}
+
+
+ The skipping mode gets kicked off after a certain number of failures;
+ see {@link SkipBadRecords#setAttemptsToStartSkipping(Configuration, int)}
+
+
In the skipping mode, the map/reduce task maintains the record range which
+ is getting processed at all times. Before giving the input to the
+ map/reduce function, it sends this record range to the Task tracker.
+ If the task crashes, the Task tracker knows which was the last reported
+ range. On further attempts that range gets skipped.
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
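+
+ A minimal sketch of turning skipping mode on via the setters cited
+ above (the thresholds are illustrative; conf is the job Configuration):
+ start skipping after two failed attempts, and accept losing at most
+ one record around each bad one.
+
+ SkipBadRecords.setAttemptsToStartSkipping(conf, 2);
+ SkipBadRecords.setMapperMaxSkipRecords(conf, 1);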
+
+
+
+
+ An example TaskAttemptID is :
+ attempt_200707121733_0003_m_000005_0 , which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+ An example TaskID is :
+ task_200707121733_0003_m_000005 , which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+ mapred.join.define.<ident> to a classname. In the expression
+ mapred.join.expr, the identifier will be assumed to be a
+ ComposableRecordReader.
+ mapred.join.keycomparator can be a classname used to compare keys
+ in the join.
+ @see JoinRecordReader
+ @see MultiFilterRecordReader]]>
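+
+ A minimal sketch of building a join expression (the paths are
+ placeholders); CompositeInputFormat.compose renders the expression
+ string so it does not have to be written by hand:
+
+ jobConf.setInputFormat(CompositeInputFormat.class);
+ jobConf.set("mapred.join.expr", CompositeInputFormat.compose(
+ "inner", SequenceFileInputFormat.class,
+ new Path("/data/a"), new Path("/data/b")));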
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ......
+ }]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ capacity children to position
+ id in the parent reader.
+ The id of a root CompositeRecordReader is -1 by convention, but relying
+ on this is not recommended.]]>
+
+ override(S1,S2,S3) will prefer values
+ from S3 over S2, and values from S2 over S1 for all keys
+ emitted from all sources.]]>
+
+ [<child1>,<child2>,...,<childn>]]]>
+
+
+
+
+
+
+ out.
+ TupleWritable format:
+ {@code
+ ......
+ }]]>
+
+ It must be specified how key and values are passed from one element of
+ the chain to the next: by value or by reference. If a Mapper relies on the
+ assumption that the key and values are not modified by the collector,
+ 'by value' must be used. If the Mapper does not rely on this assumption,
+ 'by reference' can be used as an optimization to avoid serialization and
+ deserialization.
+
+ For the added Mapper, the configuration given for it,
+ mapperConf, takes precedence over the job's JobConf. This
+ precedence is in effect while the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for
+ the ChainMapper; this is done by the addMapper call for the last mapper
+ in the chain.
+
+
+ @param job job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overridden, super.configure(...) should be
+ invoked at the beginning of the overriding method.]]>
+
+
+
+
+
+
+
+
+
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overridden, super.close() should be
+ invoked at the end of the overriding method.]]>
+
+
+
+
+ The Mapper classes are invoked in a chained (or piped) fashion: the output
+ of the first becomes the input of the second, and so on until the last
+ Mapper; the output of the last Mapper is written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do
+ not need to be aware that they are executed in a chain. This enables having
+ reusable specialized Mappers that can be combined to perform composite
+ operations within a single task.
+
+ Special care has to be taken when creating chains, so that the key/values
+ output by a Mapper are valid for the following Mapper in the chain. It is
+ assumed all Mappers and the Reducer in the chain use matching output and
+ input key and value classes, as no conversion is done by the chaining code.
+
+ Using the ChainMapper and the ChainReducer classes it is possible to
+ compose Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for
+ the ChainMapper; this is done by the addMapper call for the last mapper in
+ the chain.
+
+ ChainMapper usage pattern:
+
+
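+ (The original snippet did not survive extraction; a representative
+ sketch based on the addMapper API described above. AMap and BMap are
+ hypothetical Mapper implementations.)
+
+   JobConf conf = new JobConf(getConf());
+   conf.setJobName("chain");
+   conf.setInputFormat(TextInputFormat.class);
+   conf.setOutputFormat(TextOutputFormat.class);
+
+   JobConf mapAConf = new JobConf(false);
+   ChainMapper.addMapper(conf, AMap.class, LongWritable.class, Text.class,
+       Text.class, Text.class, true, mapAConf);
+   ChainMapper.addMapper(conf, BMap.class, Text.class, Text.class,
+       LongWritable.class, Text.class, false, null);
+
+   JobClient jc = new JobClient(conf);
+   RunningJob job = jc.submitJob(conf);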
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It must be specified how key and values are passed from one element of
+ the chain to the next: by value or by reference. If a Reducer relies on the
+ assumption that the key and values are not modified by the collector,
+ 'by value' must be used. If the Reducer does not rely on this assumption,
+ 'by reference' can be used as an optimization to avoid serialization and
+ deserialization.
+
+ For the added Reducer, the configuration given for it,
+ reducerConf, takes precedence over the job's JobConf. This
+ precedence is in effect while the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for
+ the ChainReducer; this is done by the setReducer or the addMapper call for
+ the last element in the chain.
+
+ @param job job's JobConf to add the Reducer class.
+ @param klass the Reducer class to add.
+ @param inputKeyClass reducer input key class.
+ @param inputValueClass reducer input value class.
+ @param outputKeyClass reducer output key class.
+ @param outputValueClass reducer output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param reducerConf a JobConf with the configuration for the Reducer
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It must be specified how key and values are passed from one element of
+ the chain to the next: by value or by reference. If a Mapper relies on the
+ assumption that the key and values are not modified by the collector,
+ 'by value' must be used. If the Mapper does not rely on this assumption,
+ 'by reference' can be used as an optimization to avoid serialization and
+ deserialization.
+
+ For the added Mapper, the configuration given for it,
+ mapperConf, takes precedence over the job's JobConf. This
+ precedence is in effect while the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for
+ the ChainMapper; this is done by the addMapper call for the last mapper in
+ the chain.
+
+ @param job chain job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overridden, super.configure(...) should be
+ invoked at the beginning of the overriding method.]]>
+
+
+
+
+
+
+
+
+
+ reduce(...) method of the Reducer with the
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overridden, super.close() should be
+ invoked at the end of the overriding method.]]>
+
+
+
+
+ For each record output by the Reducer, the Mapper classes are invoked in a
+ chained (or piped) fashion: the output of the first becomes the input of
+ the second, and so on until the last Mapper; the output of the last Mapper
+ is written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do
+ not need to be aware that they are executed after the Reducer or in a
+ chain. This enables having reusable specialized Mappers that can be
+ combined to perform composite operations within a single task.
+
+ Special care has to be taken when creating chains, so that the key/values
+ output by a Mapper are valid for the following Mapper in the chain. It is
+ assumed all Mappers and the Reducer in the chain use matching output and
+ input key and value classes, as no conversion is done by the chaining code.
+
+ Using the ChainMapper and the ChainReducer classes it is possible to
+ compose Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for
+ the ChainReducer; this is done by the setReducer or the addMapper call for
+ the last element in the chain.
+
+ ChainReducer usage pattern:
+
+
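+ (The original snippet did not survive extraction; a representative
+ sketch based on the setReducer/addMapper API described above. XReduce,
+ CMap and DMap are hypothetical implementations.)
+
+   ChainReducer.setReducer(conf, XReduce.class, LongWritable.class,
+       Text.class, Text.class, Text.class, true, null);
+   ChainReducer.addMapper(conf, CMap.class, Text.class, Text.class,
+       LongWritable.class, Text.class, false, null);
+   ChainReducer.addMapper(conf, DMap.class, LongWritable.class, Text.class,
+       LongWritable.class, LongWritable.class, true, null);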
]]>
+
+ all splits.
+ @param freq The frequency with which records will be emitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ This will read every split at the client, which is very expensive.
+ @param freq Probability with which a key will be chosen.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ Takes the first numSamples / numSplits records from each split.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+ true if the named output is multi, false
+ if it is single. If the named output is not defined, it returns
+ false.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name; it must be a word of letters
+ and numbers only, and cannot be the word 'part', as
+ that is reserved for the default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+
+
+
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name; it must be a word of letters
+ and numbers only, and cannot be the word 'part', as
+ that is reserved for the default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters; by default they are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs, the name of the counter is the concatenation of the named
+ output, an underscore '_' and the multiname.
+
+ @param conf job conf in which to enable the counters.
+ @param enabled indicates if the counters will be enabled or not.]]>
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters; by default they are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs, the name of the counter is the concatenation of the named
+ output, an underscore '_' and the multiname.
+
+ @param conf job conf to check for the counters setting.
+ @return TRUE if the counters are enabled, FALSE if they are disabled.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param multiName the multi name part
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+ If overridden, subclasses must invoke super.close() at the
+ end of their close().
+
+ @throws java.io.IOException thrown if any of the MultipleOutput files
+ could not be closed properly.]]>
+
+
+
+ OutputCollector passed to
+ the map() and reduce() methods of the
+ Mapper and Reducer implementations.
+
+ Each additional output, or named output, may be configured with its own
+ OutputFormat, with its own key class and with its own value
+ class.
+
+ A named output can be a single file or a multi file. The latter is
+ referred to as a multi named output.
+
+ A multi named output is an unbounded set of files all sharing the same
+ OutputFormat, key class and value class configuration.
+
+ When named outputs are used within a Mapper implementation,
+ key/values written to a named output are not part of the reduce phase;
+ only key/values written to the job OutputCollector are part of the
+ reduce phase.
+
+ MultipleOutputs supports counters; by default they are disabled. The
+ counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs, the name of the counter is the concatenation of the named
+ output, an underscore '_' and the multiname.
+
+ Job configuration usage pattern is:
+
+
+ JobConf conf = new JobConf();
+
+ FileInputFormat.setInputPaths(conf, inDir);
+ FileOutputFormat.setOutputPath(conf, outDir);
+
+ conf.setMapperClass(MOMap.class);
+ conf.setReducerClass(MOReduce.class);
+ ...
+
+ // Defines additional single text based output 'text' for the job
+ MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
+ LongWritable.class, Text.class);
+
+ // Defines additional multi sequencefile based output 'sequence' for the
+ // job
+ MultipleOutputs.addMultiNamedOutput(conf, "seq",
+ SequenceFileOutputFormat.class,
+ LongWritable.class, Text.class);
+ ...
+
+ JobClient jc = new JobClient();
+ RunningJob job = jc.submitJob(conf);
+
+ ...
+
+
+ Usage pattern in a Reducer is:
+
+
+ public class MOReduce implements
+ Reducer<WritableComparable, Writable> {
+ private MultipleOutputs mos;
+
+ public void configure(JobConf conf) {
+ ...
+ mos = new MultipleOutputs(conf);
+ }
+
+ public void reduce(WritableComparable key, Iterator<Writable> values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ ...
+ mos.getCollector("text", reporter).collect(key, new Text("Hello"));
+ mos.getCollector("seq", "A", reporter).collect(key, new Text("Bye"));
+ mos.getCollector("seq", "B", reporter).collect(key, new Text("Chau"));
+ ...
+ }
+
+ public void close() throws IOException {
+ mos.close();
+ ...
+ }
+
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It can be used instead of the default implementation,
+ {@link org.apache.hadoop.mapred.MapRunner}, when the Map operation is not
+ CPU bound, in order to improve throughput.
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class
+ (using the JobConf.setMapRunnerClass method) and
+ the number of threads the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property; its default
+ value is 10 threads.
+
+ Alternatively, the properties can be set in the configuration with proper
+ values.
+
+ @see DBConfiguration#configureDB(JobConf, String, String, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String, String, String...)
+ @see DBOutputFormat#setOutput(JobConf, String, String...)]]>
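+
+ A minimal configuration sketch for the multithreaded runner described
+ above (assumes an existing JobConf named conf):
+
+   conf.setMapRunnerClass(
+       org.apache.hadoop.mapred.lib.MultithreadedMapRunner.class);
+   // Use 20 threads instead of the default 10
+   conf.setInt("mapred.map.multithreadedrunner.threads", 20);
+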
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 20070101 AND length > 0)'
+ @param orderBy the fieldNames in the orderBy clause.
+ @param fieldNames The field names in the table
+ @see #setInput(JobConf, Class, String, String)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBInputFormat emits LongWritables containing the record number as
+ key and DBWritables as value.
+
+ The SQL query and input class can be specified using one of the two
+ setInput methods.]]>
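+
+ A brief wiring sketch (connection string, table and field names are
+ illustrative; MyRecord is a hypothetical DBWritable implementation):
+
+   DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
+       "jdbc:mysql://localhost/mydb", "user", "password");
+   DBInputFormat.setInput(conf, MyRecord.class, "employees",
+       null /* conditions */, "id" /* orderBy */, "id", "name");
+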
+
+ {@link DBOutputFormat} accepts <key,value> pairs, where
+ key has a type extending DBWritable. Returned {@link RecordWriter}
+ writes only the key to the database with a batch SQL query.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBWritable is similar to {@link Writable},
+ except that the {@link #write(PreparedStatement)} method takes a
+ {@link PreparedStatement}, and {@link #readFields(ResultSet)}
+ takes a {@link ResultSet}.
+
+ Implementations are responsible for writing the fields of the object
+ to the PreparedStatement, and reading the fields of the object from the
+ ResultSet.
+
+
Example:
+ If we have the following table in the database:
+
+ CREATE TABLE MyTable (
+ counter INTEGER NOT NULL,
+ timestamp BIGINT NOT NULL
+ );
+
+ then we can read/write the tuples from/to the table with:
+
+ public class MyWritable implements Writable, DBWritable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ //Writable#write() implementation
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ //Writable#readFields() implementation
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public void write(PreparedStatement statement) throws SQLException {
+ statement.setInt(1, counter);
+ statement.setLong(2, timestamp);
+ }
+
+ public void readFields(ResultSet resultSet) throws SQLException {
+ counter = resultSet.getInt(1);
+ timestamp = resultSet.getLong(2);
+ }
+ }
+
]]>
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
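+
+ A short usage sketch continuing the "diskStats" example above (the
+ context name and values are illustrative):
+
+   import org.apache.hadoop.metrics.MetricsContext;
+   import org.apache.hadoop.metrics.MetricsRecord;
+   import org.apache.hadoop.metrics.MetricsUtil;
+
+   MetricsContext ctx = MetricsUtil.getContext("myContext");
+   MetricsRecord diskStats = MetricsUtil.createRecord(ctx, "diskStats");
+   diskStats.setTag("diskName", "/dev/sda1");
+   diskStats.setMetric("diskPercentFull", 42);
+   diskStats.update();  // sent to the metrics system on the next period
+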
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+ update
+ and remove().]]>
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The MBean is registered using the standard MBean name format
+ "hadoop:service=<serviceName>,name=<nameName>",
+ where <serviceName> and <nameName> are the supplied parameters.
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the name used to register the MBean]]>
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ <host>:<port>
+ <fs>://<host>:<port>/<path>]]>
+
+
+
+
+
+
+
+ <host>:<port>
+ <fs>://<host>:<port>/<path>]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link NetUtils}
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link NetUtils}
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)}:
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior; default is true).
+
+ in]]>
+
+
+
+
+
+
+ out.]]>
+
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
Generic command line arguments might modify
+ Configuration objects, given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param <T> the type of the argument
+ @param t the object whose class is to be obtained
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+ io.file.buffer.size specified in the given
+ Configuration.
+ @param in input stream
+ @param conf configuration
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take a significant amount of time, since,
+ in lieu of the reported progress, the framework has to assume that an error
+ has occurred and time-out the operation.]]>
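+
+ A tiny sketch of the reporting pattern (hasMoreWork and doNextChunk are
+ hypothetical):
+
+   void process(Progressable progress) throws IOException {
+     while (hasMoreWork()) {
+       doNextChunk();        // a potentially long-running step
+       progress.progress();  // tell the framework we are still alive
+     }
+   }
+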
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform;
+ otherwise (e.g. in Cygwin/Windows) it returns null.
+ @param conf configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutor should be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remain unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
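+
+ A minimal sketch of running a command and reading its output:
+
+   import org.apache.hadoop.util.Shell.ShellCommandExecutor;
+
+   ShellCommandExecutor exec =
+       new ShellCommandExecutor(new String[] {"df", "-k", "/tmp"});
+   exec.execute();                  // runs the command and waits for exit
+   String output = exec.getOutput();
+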
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
diff --git a/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.20.0.xml b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.20.0.xml
new file mode 100644
index 0000000000..9067cf1158
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/jdiff/hadoop_0.20.0.xml
@@ -0,0 +1,52140 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param in InputStream to deserialize the object from.]]>
+
+
+
+
+
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a float.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then an empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property
+ as an array of Class.
+ The value of the property specifies a list of comma separated class names.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the property name.
+ @param defaultValue default value.
+ @return property value as a Class[],
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
+ Unless explicitly turned off, Hadoop by default specifies two
+ resources, loaded in-order from the classpath:
+ core-default.xml: Read-only defaults for hadoop.
+ core-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+
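+ (The XML snippets were lost in extraction; representative examples with
+ illustrative names follow.)
+
+   <property>
+     <name>dfs.client.buffer.dir</name>
+     <value>/tmp/hadoop/dfs/client</value>
+     <final>true</final>
+   </property>
+
+ Value strings may also contain variable expansions, for example:
+
+   <property>
+     <name>basedir</name>
+     <value>/user/${user.name}</value>
+   </property>
+   <property>
+     <name>tempdir</name>
+     <value>${basedir}/tmp</value>
+   </property>
+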
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link org.apache.hadoop.mapred.JobConf}.
+ The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+
+ 3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
+ or {@link org.apache.hadoop.mapred.Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+   ?             Matches any single character.
+   *             Matches zero or more characters.
+   [abc]         Matches a single character from character set {a,b,c}.
+   [a-b]         Matches a single character from the character range
+                 {a...b}. Note that character a must be
+                 lexicographically less than or equal to character b.
+   [^a]          Matches a single character that is not from character set
+                 or range {a}. Note that the ^ character must occur
+                 immediately to the right of the opening bracket.
+   \c            Removes (escapes) any special meaning of character c.
+   {ab,cd}       Matches a string from the string set {ab, cd}.
+   {ab,c{de,fh}} Matches a string from the string set {ab, cde, cfh}.
+
+ @param pathPattern a regular expression specifying a path pattern
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
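+
+ A short sketch of glob matching (the paths are illustrative):
+
+   FileSystem fs = FileSystem.get(conf);
+   FileStatus[] matches =
+       fs.globStatus(new Path("/logs/2009-0[1-6]/*.log"));
+   for (FileStatus stat : matches) {
+     System.out.println(stat.getPath());
+   }
+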
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is DistributedFileSystem.]]>
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read; therefore, it should be
+ optimized for sequential reading.
+ @param pos chunkPos
+ @param buf destination buffer
+ @param offset offset in buf at which to store data
+ @param len maximum number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the position to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destination buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksummed. The buffer gets checksummed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and the
+ requested length is at least as large as the size of the next checksum
+ chunk, this method will checksum and write the chunk directly
+ to the underlying output stream, avoiding an unnecessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
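+
+ A short usage sketch of the store/load round trip described above, assuming
+ the default io.serializations setting; the key name "my.stored.text" is
+ arbitrary:
+
+ Configuration conf = new Configuration();
+ DefaultStringifier.store(conf, new Text("hello"), "my.stored.text");
+ // later, possibly in another process that received the same configuration:
+ Text restored = DefaultStringifier.load(conf, "my.stored.text", Text.class);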
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
When two sequence files, which have the same Key type but different Value
+ types, are mapped out to reduce, multiple Value types are not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more efficient,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ How to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), which defines
+ the classes that will be wrapped in GenericObject by the application
+ (see the sketch after this block).
+ Attention: the classes defined in the getTypes() method must
+ implement the Writable interface.
+
+
+ @since Nov 8, 2006]]>
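+
+ The sketch referenced above, following the how-to steps (GenericObject is
+ the hypothetical class name from the text; the wrapped types are
+ illustrative):
+
+ public class GenericObject extends GenericWritable {
+
+   private static Class<? extends Writable>[] CLASSES =
+       (Class<? extends Writable>[]) new Class[] {
+         IntWritable.class,
+         Text.class,
+       };
+
+   // every class listed here must implement Writable
+   protected Class<? extends Writable>[] getTypes() {
+     return CLASSES;
+   }
+ }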
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Closes the input and output streams
+ at the end.
+ @param in InputStream to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
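+
+ A brief usage sketch (file names are illustrative); note that both streams
+ are closed by the call itself, as described above:
+
+ InputStream in = new FileInputStream("input.txt");
+ OutputStream out = new FileOutputStream("copy.txt");
+ IOUtils.copyBytes(in, out, new Configuration());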
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
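+
+ A minimal write/read sketch for the map layout described above (the path
+ and key/value types are illustrative):
+
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.getLocal(conf);
+
+ MapFile.Writer writer =
+     new MapFile.Writer(conf, fs, "example.map", IntWritable.class, Text.class);
+ writer.append(new IntWritable(1), new Text("one"));  // keys must be in order
+ writer.append(new IntWritable(2), new Text("two"));
+ writer.close();
+
+ MapFile.Reader reader = new MapFile.Reader(fs, "example.map", conf);
+ Text value = new Text();
+ reader.get(new IntWritable(2), value);  // random access via the index file
+ reader.close();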
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to choose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName - key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
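+
+ A small sketch of writing and then reading an uncompressed SequenceFile with
+ the recommended createWriter factory (the path is illustrative):
+
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.getLocal(conf);
+ Path path = new Path("example.seq");
+
+ SequenceFile.Writer writer =
+     SequenceFile.createWriter(fs, conf, path, LongWritable.class, Text.class);
+ writer.append(new LongWritable(1), new Text("first record"));
+ writer.close();
+
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+ LongWritable key = new LongWritable();
+ Text val = new Text();
+ while (reader.next(key, val)) {  // the Reader can read any of the formats
+   System.out.println(key + "\t" + val);
+ }
+ reader.close();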
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than the key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting at position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stored at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOutput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deserialize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.counter;
+ int thatValue = w.counter;
+ return (thisValue < thatValue ? -1 : (thisValue == thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
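+
+ A hedged sketch of that optimization for the example class above: compare
+ the serialized counter fields directly with WritableComparator's static
+ helpers, without deserializing either object:
+
+ public static class Comparator extends WritableComparator {
+   public Comparator() {
+     super(MyWritableComparable.class);
+   }
+
+   public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+     int thisValue = readInt(b1, s1);  // counter was written first, as an int
+     int thatValue = readInt(b2, s2);
+     return (thisValue < thatValue ? -1 : (thisValue == thatValue ? 0 : 1));
+   }
+ }
+
+ static {  // register the optimized comparator for the type
+   WritableComparator.define(MyWritableComparable.class, new Comparator());
+ }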
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input stream in
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException if fewer than len bytes are skipped]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
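+
+ A usage sketch of the pooling calls described above; GzipCodec is one
+ concrete CompressionCodec, and the file name is illustrative:
+
+ Configuration conf = new Configuration();
+ CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
+ Compressor compressor = CodecPool.getCompressor(codec);
+ try {
+   OutputStream out =
+       codec.createOutputStream(new FileOutputStream("data.gz"), compressor);
+   out.write("hello".getBytes("UTF-8"));
+   out.close();
+ } finally {
+   CodecPool.returnCompressor(compressor);  // always return it to the pool
+ }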
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FIXME: This array should be in a private or package private location,
+ since it could be modified by malicious code.
+ ]]>
+
+
+
+
+ This interface is public for historical purposes. You should have no need to
+ use it.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Although BZip2 headers are marked with the magic "BZ", this
+ constructor expects the next byte in the stream to be the first one after
+ the magic. Thus callers have to skip the first two bytes. Otherwise this
+ constructor will throw an exception.
+
+
+ @throws IOException
+ if the stream content is malformed or an I/O error occurs.
+ @throws NullPointerException
+ if in == null]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The decompression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2InputStream to release the allocated memory. See
+ {@link CBZip2OutputStream CBZip2OutputStream} for information about memory
+ usage.
+
+
+
+ CBZip2InputStream reads bytes from the compressed source stream via
+ the single byte {@link java.io.InputStream#read() read()} method exclusively.
+ Thus you should consider using a buffered source stream.
+
+
+
+ Instances of this class are not threadsafe.
+
]]>
+
+
+
+
+
+
+
+
+
+ CBZip2OutputStream with a blocksize of 900k.
+
+
+ Attention: The caller is responsible for writing the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+ @param out
+ the destination stream.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws NullPointerException
+ if out == null.]]>
+
+
+
+
+
+ CBZip2OutputStream with specified blocksize.
+
+
+ Attention: The caller is responsible for writing the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+
+ @param out
+ the destination stream.
+ @param blockSize
+ the blockSize as 100k units.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws IllegalArgumentException
+ if (blockSize < 1) || (blockSize > 9).
+ @throws NullPointerException
+ if out == null.
+
+ @see #MIN_BLOCKSIZE
+ @see #MAX_BLOCKSIZE]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ inputLength, this method always returns MAX_BLOCKSIZE.
+
+ @param inputLength
+ The length of the data which will be compressed by
+ CBZip2OutputStream.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ == 1.]]>
+
+
+
+
+ == 9.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If you are ever unlucky/improbable enough to get a stack overflow whilst
+ sorting, increase the following constant and try again. In practice I
+ have never seen the stack go above 27 elems, so the following limit seems
+ very generous.
+ ]]>
+
+
+
+
+ The compression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2OutputStream to release the allocated memory.
+
+
+
+ You can shrink the amount of allocated memory and maybe raise the compression
+ speed by choosing a lower blocksize, which in turn may cause a lower
+ compression ratio. You can avoid unnecessary memory allocation by avoiding
+ using a blocksize which is bigger than the size of the input.
+
+
+
+ You can compute the memory usage for compressing by the following formula:
+
+
+
+ <code>400k + (9 * blocksize)</code>.
+
+
+
+ To get the memory required for decompression by {@link CBZip2InputStream
+ CBZip2InputStream} use
+
+
+
+ <code>65k + (5 * blocksize)</code>.
+
+
+
+
+
+
+
+ Memory usage by blocksize:
+
+ Blocksize   Compression memory usage   Decompression memory usage
+ 100k        1300k                      565k
+ 200k        2200k                      1065k
+ 300k        3100k                      1565k
+ 400k        4000k                      2065k
+ 500k        4900k                      2565k
+ 600k        5800k                      3065k
+ 700k        6700k                      3565k
+ 800k        7600k                      4065k
+ 900k        8500k                      4565k
+
+
+
+
+ For decompression CBZip2InputStream allocates less memory if the
+ bzipped input is smaller than one block.
+
+
+
+ Instances of this class are not threadsafe.
+
+
+
+ TODO: Update to BZip2 1.0.1
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range [0, 2 to the power of the number of retries).
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
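+
+ A hedged sketch of creating such a proxy; MyProtocol and MyProtocolImpl are
+ hypothetical names:
+
+ RetryPolicy policy =
+     RetryPolicies.retryUpToMaximumCountWithFixedSleep(5, 2, TimeUnit.SECONDS);
+ MyProtocol proxy =
+     (MyProtocol) RetryProxy.create(MyProtocol.class,
+                                    new MyProtocolImpl(), policy);
+ proxy.doWork();  // failures are retried up to 5 times, sleeping 2s between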
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.
+ @deprecated Use {@link #call(Writable, InetSocketAddress, Class, UserGroupInformation)} instead]]>
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address with the ticket credentials, returning
+ the value.
+ Throws exceptions if there are network problems or if the remote code
+ threw an exception.
+ @deprecated Use {@link #call(Writable, InetSocketAddress, Class, UserGroupInformation)} instead]]>
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address which is servicing the protocol protocol,
+ with the ticket credentials, returning the value.
+ Throws exceptions if there are network problems or if the remote code
+ threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name=RpcActivityForPort"
+
+ Many of the activity metrics are sampled and averaged on an interval
+ which can be specified in the metrics config file.
+
+ For the metrics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most metrics contexts do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically
+
+
+
+ Impl details: We use a dynamic mbean that gets the list of the metrics
+ from the metrics registry passed as an argument to the constructor]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
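+
+ A sketch of the fill-then-update cycle described above, reusing the
+ "diskStats" example from the text (the context name is illustrative):
+
+ MetricsContext context = MetricsUtil.getContext("myContext");
+ MetricsRecord diskStats = MetricsUtil.createRecord(context, "diskStats");
+ diskStats.setTag("diskName", "/dev/sda1");
+ diskStats.setMetric("diskPercentFull", 42);
+ diskStats.update();  // queues the row; it is emitted on each timer period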
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name="
+ Where the and are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getOutputStream()}.
+
+ @see #getOutputStream(Socket, long)
+
+ @param socket
+ @return OutputStream for writing to the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getOutputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return OutputStream for writing to the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ socket.connect(endpoint, timeout). If
+ socket.getChannel() returns a non-null channel,
+ connect is implemented using Hadoop's selectors. This is done mainly
+ to prevent Sun's connect implementation from creating thread-local
+ selectors, since Hadoop does not have control over when these are closed
+ and could end up taking all the available file descriptors.
+
+ @see java.net.Socket#connect(java.net.SocketAddress, int)
+
+ @param socket
+ @param endpoint
+ @param timeout - timeout in milliseconds]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ node
+
+ @param node
+ a node
+ @return true if node is already in the tree; false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ scope
+ if scope starts with ~, choose one from all nodes except for the
+ ones in scope; otherwise, choose one from scope
+ @param scope range of nodes from which a node will be chosen
+ @return the chosen node]]>
+
+
+
+
+
+
+ scope but not in excludedNodes
+ if scope starts with ~, return the number of nodes that are not
+ in scope and excludedNodes;
+ @param scope a path string that may start with ~
+ @param excludedNodes a list of nodes
+ @return number of available nodes]]>
+
+
+
+
+
+
+
+
+
+
+
+ reader
+ It linearly scans the array; if a local node is found, it is swapped with
+ the first element of the array.
+ If a local rack node is found, it is swapped with the first element following
+ the local node.
+ If neither a local node nor a local rack node is found, a random replica
+ location is put at position 0.
+ It leaves the rest of the nodes untouched.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a new input stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+
+ @see SocketInputStream#SocketInputStream(ReadableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @param timeout timeout in milliseconds. Must not be negative.
+ @throws IOException]]>
+
+
+
+
+
+
+
+ Create a new input stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+ @see SocketInputStream#SocketInputStream(ReadableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a new output stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+
+ @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @param timeout timeout in milliseconds. Must not be negative.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ = getCount().
+ @param newCapacity The new capacity in bytes.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Index idx = startVector(...);
+ while (!idx.done()) {
+ .... // read element of a vector
+ idx.incr();
+ }
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This task takes the given record definition files and compiles them into
+ java or c++
+ files. It is then up to the user to compile the generated files.
+
+
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior; default is true).
+
]]>
+
+
+
+
+
+
+
+
+
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (cause==null ? null : cause.toString()) (which
+ typically contains the class and detail message of cause).
+ @param cause the cause (which is saved for later retrieval by the
+ {@link #getCause()} method). (A null value is
+ permitted, and indicates that the cause is nonexistent or
+ unknown.)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ Group with the given groupname.
+ @param group group name]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ugi.
+ @param ugi user
+ @return the {@link Subject} for the user identified by ugi]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ugi as a comma separated string in
+ conf as a property attr
+
+ The String starts with the user name followed by the default group names,
+ and other group names.
+
+ @param conf configuration
+ @param attr property name
+ @param ugi a UnixUserGroupInformation]]>
+
+
+
+
+
+
+
+ conf
+
+ The configuration is expected to store, under the property name attr,
+ a comma separated string that starts
+ with the user name followed by group names.
+ If the property name is not defined, return null.
+ It's assumed that there is only one UGI per user. If this user already
+ has a UGI in the ugi map, return the ugi in the map.
+ Otherwise, construct a UGI from the configuration, store it in the
+ ugi map and return it.
+
+ @param conf configuration
+ @param attr property name
+ @return a UnixUGI
+ @throws LoginException if the stored string is ill-formatted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ User with the given username.
+ @param user user name]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (cause==null ? null : cause.toString()) (which
+ typically contains the class and detail message of cause).
+ @param cause the cause (which is saved for later retrieval by the
+ {@link #getCause()} method). (A null value is
+ permitted, and indicates that the cause is nonexistent or
+ unknown.)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ does not provide the stack trace for security purposes.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ service as related to
+ Service Level Authorization for Hadoop.
+
+ Each service defines its configuration key and also the necessary
+ {@link Permission} required to access the service.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ in]]>
+
+
+
+
+
+
+ out.]]>
+
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or an empty array if commandLine was not defined.]]>
+
+
+
+
+
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then the returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
Generic command line arguments might modify
+ Configuration objects given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
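+
+ A minimal usage sketch (illustrative only - it assumes a String[] args
+ from main and uses only the constructor and getRemainingArgs()
+ documented above):
+
+ Configuration conf = new Configuration();
+ GenericOptionsParser parser = new GenericOptionsParser(conf, args);
+ // conf now reflects -D, -fs, -jt, etc.; the rest belongs to the application
+ String[] remaining = parser.getRemainingArgs();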
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param The type of the argument
+ @param t the object to get its class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
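+
+ For illustration, a hypothetical call (assuming a non-empty list):
+
+ List<String> names = Arrays.asList("alice", "bob");
+ // the Class object is needed because of type erasure
+ String[] arr = GenericsUtil.toArray(String.class, names);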
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ io.file.buffer.size specified in the given
+ Configuration.
+ @param in input stream
+ @param conf configuration
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take a significant amount of time since,
+ in lieu of the reported progress, the framework has to assume that an error
+ has occurred and time-out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform;
+ otherwise (e.g. on Cygwin/Windows) it returns null.
+ @param conf configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+ Shell interface.
+ @param env the map of environment key=value
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
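+
+ A minimal sketch, assuming the static execCommand(String...) helper
+ documented above:
+
+ // runs "df -k" and returns its stdout as a single String
+ String dfOutput = Shell.execCommand("df", "-k");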
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutor should be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remain unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
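+
+ A minimal sketch, assuming the String[]-based constructor:
+
+ ShellCommandExecutor exec =
+     new ShellCommandExecutor(new String[] {"ls", "-l"});
+ exec.execute();                  // runs the command, buffering its stdout
+ String listing = exec.getOutput();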
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
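+
+ For example (a hypothetical call, with '\\' as the escape char and ','
+ as the char to escape):
+
+ // yields "a\,b", so the comma survives a later split on unescaped commas
+ String escaped = StringUtils.escapeString("a,b", '\\', ',');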
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic Hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ this filter.
+ @param nbHash The number of hash functions to consider.
+ @param hashType type of the hashing function (see
+ {@link org.apache.hadoop.util.hash.Hash}).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Bloom filter, as defined by Bloom in 1970.
+
+ The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by
+ the networking research community in the past decade thanks to the bandwidth efficiencies that it
+ offers for the transmission of set membership information between networked hosts. A sender encodes
+ the information into a bit vector, the Bloom filter, that is more compact than a conventional
+ representation. Computation and space costs for construction are linear in the number of elements.
+ The receiver uses the filter to test whether various elements are members of the set. Though the
+ filter will occasionally return a false positive, it will never return a false negative. When creating
+ the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size.
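+
+ A sketch of the trade-off in code (the vector size and hash count are
+ illustrative values, not recommendations):
+
+ // 1024-bit vector, 5 hash functions, Murmur hashing
+ BloomFilter filter = new BloomFilter(1024, 5, Hash.MURMUR_HASH);
+ filter.add(new Key("hello".getBytes()));
+ // true may be a false positive; false is always correct
+ boolean maybeMember = filter.membershipTest(new Key("hello".getBytes()));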
+
+
+
+
+
+
+
+
+
+
+
+
+
+ this filter.
+ @param nbHash The number of hash functions to consider.
+ @param hashType type of the hashing function (see
+ {@link org.apache.hadoop.util.hash.Hash}).]]>
+
+
+
+
+
+
+
+
+ this counting Bloom filter.
+
+ Invariant: nothing happens if the specified key does not belong to this counting Bloom filter.
+ @param key The key to remove.]]>
+
+
+
+
+
+
+
+
+
+
+
+ key -> count map.
+
NOTE: due to the bucket size of this filter, inserting the same
+ key more than 15 times will cause an overflow at all filter positions
+ associated with this key, and it will significantly increase the error
+ rate for this and other keys. For this reason the filter can only be
+ used to store small count values 0 <= N << 15.
+ @param key key to be tested
+ @return 0 if the key is not present. Otherwise, a positive value v will
+ be returned such that v == count with probability equal to the
+ error rate of this filter, and v > count otherwise.
+ Additionally, if the filter experienced an underflow as a result of
+ {@link #delete(Key)} operation, the return value may be lower than the
+ count with the probability of the false negative rate of such
+ filter.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ counting Bloom filter, as defined by Fan et al. in a ToN
+ 2000 paper.
+
+ A counting Bloom filter is an improvement to a standard Bloom filter as it
+ allows dynamic additions and deletions of set membership information. This
+ is achieved through the use of a counting vector instead of a bit vector.
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Builds an empty Dynamic Bloom filter.
+ @param vectorSize The number of bits in the vector.
+ @param nbHash The number of hash functions to consider.
+ @param hashType type of the hashing function (see
+ {@link org.apache.hadoop.util.hash.Hash}).
+ @param nr The threshold for the maximum number of keys to record in a
+ dynamic Bloom filter row.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ dynamic Bloom filter, as defined in the INFOCOM 2006 paper.
+
+ A dynamic Bloom filter (DBF) makes use of a s * m bit matrix but
+ each of the s rows is a standard Bloom filter. The creation
+ process of a DBF is iterative. At the start, the DBF is a 1 * m
+ bit matrix, i.e., it is composed of a single standard Bloom filter.
+ It assumes that nr elements are recorded in the
+ initial bit vector, where nr <= n (n is
+ the cardinality of the set A to record in the filter).
+
+ As the size of A grows during the execution of the application,
+ several keys must be inserted in the DBF. When inserting a key into the DBF,
+ one must first get an active Bloom filter in the matrix. A Bloom filter is
+ active when the number of recorded keys, nr, is
+ strictly less than the current cardinality of A, n.
+ If an active Bloom filter is found, the key is inserted and
+ nr is incremented by one. On the other hand, if there
+ is no active Bloom filter, a new one is created (i.e., a new row is added to
+ the matrix) according to the current size of A and the element
+ is added in this new Bloom filter and the nr value of
+ this new Bloom filter is set to one. A given key is said to belong to the
+ DBF if the k positions are set to one in one of the matrix rows.
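+
+ A sketch of the constructor described above (all values are illustrative):
+
+ // 1024 bits and 5 hash functions per row; a new row is added once
+ // 100 keys have been recorded in the active filter
+ DynamicBloomFilter dbf = new DynamicBloomFilter(1024, 5, Hash.MURMUR_HASH, 100);
+ dbf.add(new Key("k1".getBytes()));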
+
+
+
+
+
+
+
+
+
+
+ this filter.
+ @param nbHash The number of hash functions to consider.
+ @param hashType type of the hashing function (see {@link Hash}).]]>
+
+
+
+
+
+ this filter.
+ @param key The key to add.]]>
+
+
+
+
+
+ this filter.
+ @param key The key to test.
+ @return boolean True if the specified key belongs to this filter.
+ False otherwise.]]>
+
+
+
+
+
+ this filter and a specified filter.
+
+ Invariant: The result is assigned to this filter.
+ @param filter The filter to AND with.]]>
+
+
+
+
+
+ this filter and a specified filter.
+
+ Invariant: The result is assigned to this filter.
+ @param filter The filter to OR with.]]>
+
+
+
+
+
+ this filter and a specified filter.
+
+ Invariant: The result is assigned to this filter.
+ @param filter The filter to XOR with.]]>
+
+
+
+
+ this filter.
+
+ The result is assigned to this filter.]]>
+
+
+
+
+
+ this filter.
+ @param keys The list of keys.]]>
+
+
+
+
+
+ this filter.
+ @param keys The collection of keys.]]>
+
+
+
+
+
+ this filter.
+ @param keys The array of keys.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ this filter.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A filter is a data structure which aims at offering a lossy summary of a set A. The
+ key idea is to map entries of A (also called keys) into several positions
+ in a vector through the use of several hash functions.
+
+ Typically, a filter will be implemented as a Bloom filter (or a Bloom filter extension).
+
+ It must be extended in order to define the real behavior.
+
+ @see Key The general behavior of a key
+ @see HashFunction A hash function]]>
+
+
+
+
+
+
+
+
+ Builds a hash function that must obey a given maximum number of returned values and a highest value.
+ @param maxValue The maximum highest returned value.
+ @param nbHash The number of resulting hashed values.
+ @param hashType type of the hashing function (see {@link Hash}).]]>
+
+
+
+
+ this hash function. A NOOP]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Builds a key with a default weight.
+ @param value The byte value of this key.]]>
+
+
+
+
+
+ Builds a key with a specified weight.
+ @param value The value of this key.
+ @param weight The weight associated to this key.]]>
+
+
+
+
+
+
+
+
+
+
+
+ this key.]]>
+
+
+
+
+ this key.]]>
+
+
+
+
+
+ this key with a specified value.
+ @param weight The increment.]]>
+
+
+
+
+ this key by one.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The idea is to randomly select a bit to reset.]]>
+
+
+
+
+
+ The idea is to select the bit to reset that will generate the minimum
+ number of false negatives.]]>
+
+
+
+
+
+ The idea is to select the bit to reset that will remove the maximum number
+ of false positives.]]>
+
+
+
+
+
+ The idea is to select the bit to reset that will, at the same time, remove
+ the maximum number of false positives while minimizing the number of false
+ negatives generated.]]>
+
+
+
+
+ Originally created by
+ European Commission One-Lab Project 034819.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ this filter.
+ @param nbHash The number of hash functions to consider.
+ @param hashType type of the hashing function (see
+ {@link org.apache.hadoop.util.hash.Hash}).]]>
+
+
+
+
+
+
+
+
+ this retouched Bloom filter.
+
+ Invariant: if the false positive is null, nothing happens.
+ @param key The false positive key to add.]]>
+
+
+
+
+
+ this retouched Bloom filter.
+ @param coll The collection of false positives.]]>
+
+
+
+
+
+ this retouched Bloom filter.
+ @param keys The list of false positives.]]>
+
+
+
+
+
+ this retouched Bloom filter.
+ @param keys The array of false positives.]]>
+
+
+
+
+
+
+ this retouched Bloom filter.
+ @param scheme The selective clearing scheme to apply.]]>
+
+
+
+
+
+
+
+
+
+
+
+ retouched Bloom filter, as defined in the CoNEXT 2006 paper.
+
+ It allows the removal of selected false positives at the cost of introducing
+ random false negatives, and with the benefit of eliminating some random false
+ positives at the same time.
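+
+ A sketch of the removal API (assuming the RemoveScheme constants defined
+ in this package):
+
+ RetouchedBloomFilter rbf = new RetouchedBloomFilter(1024, 5, Hash.MURMUR_HASH);
+ Key fp = new Key("known-false-positive".getBytes());
+ rbf.add(fp);
+ rbf.addFalsePositive(fp);                        // record the offending key
+ rbf.selectiveClearing(fp, RemoveScheme.RANDOM);  // clear a bit to remove it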
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ length, and
+ the provided seed value
+ @param bytes input bytes
+ @param length length of the valid bytes to consider
+ @param initval seed value
+ @return hash value]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The best hash table sizes are powers of 2. There is no need to do mod
+ a prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask.
+ For example, if you need only 10 bits, do
+ h = (h & hashmask(10));
+ In which case, the hash table should have hashsize(10) elements.
+
+
If you are hashing n strings byte[][] k, do it like this:
+ for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h);
+
+
By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this
+ code any way you wish, private, educational, or commercial. It's free.
+
+
Use for hash table lookup, or anything where one collision in 2^^32 is
+ acceptable. Do NOT use for cryptographic purposes.]]>
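+
+ For example (a non-cryptographic use, assuming the hash(byte[], int, int)
+ entry point documented above):
+
+ byte[] data = "example".getBytes();
+ Hash jenkins = Hash.getInstance(Hash.JENKINS_HASH);
+ int h = jenkins.hash(data, data.length, 0);  // 0 is the seed
+ int bucket = h & ((1 << 10) - 1);            // mask down to 10 bits, as advised above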
+
+
+
+
+
+
+
+
+
+
+ lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+
+ You can use this free for any purpose. It's in the public domain.
+ It has no warranty.
+
+
+ @see lookup3.c
+ @see Hash Functions (and how this
+ function compares to others such as CRC, MD?, etc.)
+ @see Has update on the
+ Dr. Dobbs Article]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The C version of MurmurHash 2.0 found at that site was ported
+ to Java by Andrzej Bialecki (ab at getopt org).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ JobTracker,
+ as {@link JobTracker.State}
+
+ @return the current state of the JobTracker.]]>
+
+
+
+
+ JobTracker
+
+ @return the size of heap memory used by the JobTracker]]>
+
+
+
+
+ JobTracker
+
+ @return the configured size of max heap memory that can be used by the JobTracker]]>
+
+
+
+
+
+
+
+
+
+
+
+ ClusterStatus provides clients with information such as:
+
+
+ Size of the cluster.
+
+
+ Name of the trackers.
+
+
+ Task capacity of the cluster.
+
+
+ The number of currently running map & reduce tasks.
+
+
+ State of the JobTracker.
+
+
+
+
Clients can query for the latest ClusterStatus, via
+ {@link JobClient#getClusterStatus()}.
 Counters are bunched into {@link Group}s, each comprising
+ counters from a particular Enum class.
+ @deprecated Use {@link org.apache.hadoop.mapreduce.Counters} instead.]]>
+
 Group handles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.
+ @deprecated Use {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}
+ instead.]]>
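+
+ A sketch of the isSplitable override mentioned above (the class and its
+ record reader are hypothetical names):
+
+ public class WholeFileInputFormat extends FileInputFormat<Text, BytesWritable> {
+   protected boolean isSplitable(FileSystem fs, Path file) {
+     return false;  // one mapper consumes each file end-to-end
+   }
+   public RecordReader<Text, BytesWritable> getRecordReader(
+       InputSplit split, JobConf job, Reporter reporter) throws IOException {
+     return new WholeFileRecordReader((FileSplit) split, job);  // hypothetical reader
+   }
+ }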
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Note: The following is valid only if the {@link OutputCommitter}
+ is {@link FileOutputCommitter}. If OutputCommitter is not
+ a FileOutputCommitter, the task's temporary output
+ directory is the same as {@link #getOutputPath(JobConf)}, i.e.
+ ${mapred.output.dir}
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of a reduce-task, i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus the writer doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
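+
+ A sketch of creating a side-file from within a task (the file name is
+ illustrative):
+
+ Path workDir = FileOutputFormat.getWorkOutputPath(job);
+ Path sideFile = new Path(workDir, "side-data");
+ FSDataOutputStream out = sideFile.getFileSystem(job).create(sideFile);
+ // ... write to it, then close; the framework promotes the file when
+ // the task-attempt succeeds
+ out.close();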
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The generated name can be used to create custom files from within the
+ different tasks for the job; the names for different tasks will not collide
+ with each other.
+
+
The given name is postfixed with the task type, 'm' for maps, 'r' for
+ reduces and the task partition number. For example, given a name 'test'
+ running on the first map of the job, the generated name will be
+ 'test-m-00000'.
+
+ @param conf the configuration for the job.
+ @param name the name to make unique.
+ @return a unique name across all tasks of the job.]]>
+
+
+
+
+
+
+ The path can be used to create custom files from within the map and
+ reduce tasks. The path name will be unique for each task. The path parent
+ will be the job output directory.
+
+
This method uses the {@link #getUniqueName} method to make the file name
+ unique for the task.
+
+ @param conf the configuration for the job.
+ @param name the name for the file.
+ @return a unique path across all tasks of the job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For example, a split could
+ be an <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
 Clearly, logical splits based on input-size are insufficient for many
+ applications since record boundaries must be respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+ responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat
+ @deprecated Use {@link org.apache.hadoop.mapreduce.InputFormat} instead.]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader
+ @deprecated Use {@link org.apache.hadoop.mapreduce.InputSplit} instead.]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ its status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
+
+
+ @see JobConf
+ @see ClusterStatus
+ @see Tool
+ @see DistributedCache]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If the parameter {@code loadDefaults} is false, the new instance
+ will not load resources from the default files.
+
+ @param loadDefaults specifies whether to load from the default files]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if framework should keep the intermediate files
+ for failed tasks, false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the outputs of the maps are to be compressed,
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This comparator should be provided if the equivalence rules for keys
+ for sorting the intermediates are different from those for grouping keys
+ before each call to
+ {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.
+
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
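+
+ A sketch of the secondary-sort wiring (both comparator classes are
+ hypothetical):
+
+ JobConf job = new JobConf(MyApp.class);
+ // sort by the full (primary, secondary) composite key ...
+ job.setOutputKeyComparatorClass(CompositeKeyComparator.class);
+ // ... but group reduce() calls by the primary part only
+ job.setOutputValueGroupingComparator(PrimaryKeyComparator.class);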
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is the same as
+ the {@link Reducer} for the job, i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is an application-specified aggregation operation, which
+ can help cut down the amount of data transferred between the
+ {@link Mapper} and the {@link Reducer}, leading to better performance.
+
+
The framework may invoke the combiner 0, 1, or multiple times, in both
+ the mapper and reducer tasks. In general, the combiner is called as the
+ sort/merge result is written to disk. The combiner must:
+
+
be side-effect free
+
have the same input and output key types and the same input and
+ output value types
+
+
+
 Typically the combiner is the same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution should be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution should be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution should be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of map tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes a while, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ improves load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
 In this case the output of the map-tasks goes directly to the distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing them out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
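+
+ A sketch of the 0.95 guideline above (the slots-per-node value stands in
+ for the configured mapred.tasktracker.reduce.tasks.maximum):
+
+ ClusterStatus cluster = new JobClient(job).getClusterStatus();
+ int reduceSlotsPerNode = 2;  // hypothetical configured maximum
+ int reduces = (int) (0.95 * cluster.getTaskTrackers() * reduceSlotsPerNode);
+ job.setNumReduceTasks(reduces);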
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
 The debug command, run on the node where the reduce failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked.
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
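+
+ For example (the host and path are placeholders):
+
+ job.setJobEndNotificationURI(
+     "http://myhost:8080/jobdone?id=$jobId&status=$jobStatus");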
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is also available as a System property.
+
+ @return The localized job specific shared directory]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If a job doesn't specify its virtual memory requirement by setting
+ {@link #MAPRED_TASK_MAXVMEM_PROPERTY} to {@link #DISABLED_MEMORY_LIMIT},
+ tasks are assured a memory limit set to this property. This property is
+ disabled by default, and if not explicitly set to a valid value by the
+ administrators and if a job doesn't specify its virtual memory
+ requirements, the job's tasks will not be assured anything and may be
+ killed by a TT that intends to control the total memory usage of the tasks
+ via memory management functionality.
+
+
+
+ This value should in general be less than the cluster-wide configuration
+ {@link #UPPER_LIMIT_ON_TASK_VMEM_PROPERTY}. If it is not, or if the latter
+ is not set, TaskTracker's memory management may be disabled and a scheduler's
+ memory based scheduling decisions will be affected. Please refer to the
+ documentation of the configured scheduler to see how this property is used.]]>
+
+
+
+
+
+
+ This value will be used by TaskTrackers for monitoring the memory usage of
+ tasks of this job. If a TaskTracker's memory management functionality is
+ enabled, each task of this job will be allowed to use a maximum virtual
+ memory specified by this property. If the task's memory usage goes over
+ this value, the task will be failed by the TT. If not set, the cluster-wide
+ configuration {@link #MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY} is used as the
+ default value for memory requirements. If this property, cascaded with
+ {@link #MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY}, becomes equal to -1, the job's
+ tasks will not be assured anything and may be killed by a TT that intends
+ to control the total memory usage of the tasks via memory management
+ functionality. If the memory management functionality is disabled on a TT,
+ this value is ignored.
+
+
+
+ This value should also be no more than the cluster-wide configuration
+ {@link #UPPER_LIMIT_ON_TASK_VMEM_PROPERTY} which has to be set by the site
+ administrators.
+
+
+
+ This value may be used by schedulers that support scheduling based on job's
+ memory requirements. In general, a task of this job will be scheduled on a
+ TaskTracker only if the amount of virtual memory still unoccupied on the
+ TaskTracker is greater than or equal to this value. But different
+ schedulers can take different decisions. Please refer to the documentation
+ of the scheduler being configured to see if it does memory based scheduling
+ and if it does, how this property is used by that scheduler.
+
+ @see #setMaxVirtualMemoryForTask(long)
+ @see #getMaxVirtualMemoryForTask()]]>
+
+
+
+
+
+
+ This value may be used by schedulers that support scheduling based on job's
+ memory requirements. In general, a task of this job will be scheduled on a
+ TaskTracker, only if the amount of physical memory still unoccupied on the
+ TaskTracker is greater than or equal to this value. But different
+ schedulers can take different decisions. Please refer to the documentation
+ of the scheduler being configured to see how it does memory based
+ scheduling and how this variable is used by that scheduler.
+
+ @see #setMaxPhysicalMemoryForTask(long)
+ @see #getMaxPhysicalMemoryForTask()]]>
+
+
+
+
+
+
+ If it is not set on a TaskTracker, TaskTracker's memory management will be
+ disabled.]]>
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly
+ with the rest of the framework and/or job-configuration and are relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debuggability via user-provided scripts
+ ({@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)})
+ for doing post-processing on task logs, task's stdout, stderr, syslog,
+ etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or the {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes a significant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
 Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("map.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the outputrecords.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper
+ @deprecated Use {@link org.apache.hadoop.mapreduce.Mapper} instead.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit
+ @deprecated Use {@link org.apache.hadoop.mapred.lib.CombineFileInputFormat} instead]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s, with
+ reading one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat
+ @deprecated Use {@link org.apache.hadoop.mapred.lib.CombineFileSplit} instead]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
The Map-Reduce framework relies on the OutputCommitter of
+ the job to:
+
+
+ Setup the job during initialization. For example, create the temporary
+ output directory for the job during the initialization of the job.
+
+
+ Cleanup the job after the job completion. For example, remove the
+ temporary output directory after the job completion.
+
+
+ Setup the task temporary output.
+
+
+ Check whether a task needs a commit. This is to avoid the commit
+ procedure if a task does not need commit.
+
+
+ Commit of the task output.
+
+
+ Discard the task commit.
+
+
+
+ @see FileOutputCommitter
+ @see JobContext
+ @see TaskAttemptContext
+ @deprecated Use {@link org.apache.hadoop.mapreduce.OutputCommitter} instead.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 This is to validate the output specification for the job when the
+ job is submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf
+ @deprecated Use {@link org.apache.hadoop.mapreduce.OutputFormat} instead.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer
+ @deprecated Use {@link org.apache.hadoop.mapreduce.Partitioner} instead.]]>
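+
+ A sketch of a custom partitioner (the tab-separated key format is
+ hypothetical):
+
+ public class FirstFieldPartitioner implements Partitioner<Text, Text> {
+   public void configure(JobConf job) { }
+   public int getPartition(Text key, Text value, int numPartitions) {
+     // route on the first tab-separated field of the key
+     String first = key.toString().split("\t", 2)[0];
+     return (first.hashCode() & Integer.MAX_VALUE) % numPartitions;
+   }
+ }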
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if there exists a key/value,
+ false otherwise.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RawKeyValueIterator is an iterator used to iterate over
+ the raw keys and values during sort/merge of intermediate data.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore applications should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes a significant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase
+ @deprecated Use {@link org.apache.hadoop.mapreduce.Reducer} instead.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Counter of the given group/name.]]>
+
+
+
+
+
+
+ Counter of the given group/name.]]>
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's cleanup-tasks, as a float between 0.0
+ and 1.0. When all cleanup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's cleanup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's setup-tasks, as a float between 0.0
+ and 1.0. When all setup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's setup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f. The filtering criteria is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_MAP_PROCESSED_RECORDS}.
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_REDUCE_PROCESSED_GROUPS}.
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop provides an optional mode of execution in which bad records
+ are detected and skipped in further attempts.
+
+
This feature can be used when map/reduce tasks crash deterministically on
+ certain input, typically due to bugs in the map/reduce function. The usual
+ course would be to fix these bugs, but sometimes that is not possible;
+ perhaps the bug is in third-party libraries for which the source code is
+ not available. In that case the task never reaches completion even with
+ multiple attempts, and the complete data for that task is lost.
+
+
With this feature, only a small portion of data surrounding the
+ bad record is lost, which may be acceptable for some applications;
+ see {@link SkipBadRecords#setMapperMaxSkipRecords(Configuration, long)}.
+
+
Skipping mode kicks in after a certain number of failures;
+ see {@link SkipBadRecords#setAttemptsToStartSkipping(Configuration, int)}.
+
+
In skipping mode, the map/reduce task maintains the record range being
+ processed at all times, and sends this range to the TaskTracker before
+ handing the input to the map/reduce function.
+ If the task crashes, the TaskTracker knows which range was last reported,
+ and on further attempts that range is skipped.
]]>
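+
+ A hedged sketch of enabling this mode with the SkipBadRecords helpers
+ named above (the concrete values are illustrative only):
+
+ JobConf conf = new JobConf(MyJob.class);            // hypothetical driver class
+ SkipBadRecords.setAttemptsToStartSkipping(conf, 2); // start skipping after 2 failed attempts
+ SkipBadRecords.setMapperMaxSkipRecords(conf, 1);    // tolerate losing at most 1 record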
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all task attempt IDs
+ of any jobtracker, in any job, of the first
+ map task, we would use :
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
+
+
+
+
+ An example TaskAttemptID is
+ attempt_200707121733_0003_m_000005_0, which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings,
+ but rather use appropriate constructors or the {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
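+
+ For example, the ID above can be round-tripped via forName (a sketch;
+ the printed form assumes the usual string layout):
+
+ TaskAttemptID id =
+     TaskAttemptID.forName("attempt_200707121733_0003_m_000005_0");
+ System.out.println(id.getJobID());  // job_200707121733_0003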
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+
+
+
+
+ An example TaskID is
+ task_200707121733_0003_m_000005, which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings,
+ but rather use appropriate constructors or the {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+ mapred.join.define.<ident> to a classname. In the expression
+ mapred.join.expr, the identifier will be assumed to be a
+ ComposableRecordReader.
+ mapred.join.keycomparator can be a classname used to compare keys
+ in the join.
+ @see JoinRecordReader
+ @see MultiFilterRecordReader]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ......
+ }]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ capacity children to position
+ id in the parent reader.
+ The id of a root CompositeRecordReader is -1 by convention, but relying
+ on this is not recommended.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ override(S1,S2,S3) will prefer values
+ from S3 over S2, and values from S2 over S1 for all keys
+ emitted from all sources.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ [,,...,]]]>
+
+
+
+
+
+
+ out.
+ TupleWritable format:
+ {@code
+ ......
+ }]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how keys and values are passed from one element of
+ the chain to the next: by value or by reference. If a Mapper relies on the
+ assumption that keys and values are not modified by the collector,
+ 'by value' must be used. If the Mapper does not rely on this assumption,
+ 'by reference' can be used as an optimization to avoid serialization and
+ deserialization.
+
+ For the added Mapper, the configuration given for it,
+ mapperConf, has precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper; this is done by the addMapper call for the last mapper in the chain
+
+
+ @param job job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overridden, super.configure(...) should be
+ invoked at the beginning of the overriding method.]]>
+
+
+
+
+
+
+
+
+
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overridden, super.close() should be
+ invoked at the end of the overriding method.]]>
+
+
+
+
+ The Mapper classes are invoked in a chained (or piped) fashion: the output of
+ the first becomes the input of the second, and so on until the last Mapper;
+ the output of the last Mapper is written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do not
+ need to be aware that they are executed in a chain. This enables having
+ reusable specialized Mappers that can be combined to perform composite
+ operations within a single task.
+
+ Special care has to be taken when creating chains to ensure that the key/values
+ output by a Mapper are valid for the following Mapper in the chain. It is
+ assumed all Mappers and the Reducer in the chain use matching output and input
+ key and value classes, as no conversion is done by the chaining code.
+
+ Using the ChainMapper and ChainReducer classes it is possible to compose
+ Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper; this is done by the addMapper call for the last mapper in the chain.
+
+ ChainMapper usage pattern:
+
+
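+ A possible wiring, reconstructed as a sketch from the addMapper signature
+ documented above; AMap, BMap and the chosen key/value classes are
+ illustrative assumptions:
+
+ JobConf conf = new JobConf(new Configuration());
+ conf.setJobName("chain");
+ conf.setInputFormat(TextInputFormat.class);
+ conf.setOutputFormat(TextOutputFormat.class);
+
+ JobConf mapAConf = new JobConf(false);
+ ChainMapper.addMapper(conf, AMap.class, LongWritable.class, Text.class,
+     Text.class, Text.class, true, mapAConf);
+
+ JobConf mapBConf = new JobConf(false);
+ ChainMapper.addMapper(conf, BMap.class, Text.class, Text.class,
+     LongWritable.class, Text.class, false, mapBConf);
+
+ JobClient.runJob(conf);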
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how keys and values are passed from one element of
+ the chain to the next: by value or by reference. If a Reducer relies on the
+ assumption that keys and values are not modified by the collector,
+ 'by value' must be used. If the Reducer does not rely on this assumption,
+ 'by reference' can be used as an optimization to avoid serialization and
+ deserialization.
+
+ For the added Reducer, the configuration given for it,
+ reducerConf, has precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainReducer; this is done by the setReducer or the addMapper call for the
+ last element in the chain.
+
+ @param job job's JobConf to add the Reducer class.
+ @param klass the Reducer class to add.
+ @param inputKeyClass reducer input key class.
+ @param inputValueClass reducer input value class.
+ @param outputKeyClass reducer output key class.
+ @param outputValueClass reducer output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param reducerConf a JobConf with the configuration for the Reducer
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how keys and values are passed from one element of
+ the chain to the next: by value or by reference. If a Mapper relies on the
+ assumption that keys and values are not modified by the collector,
+ 'by value' must be used. If the Mapper does not rely on this assumption,
+ 'by reference' can be used as an optimization to avoid serialization and
+ deserialization.
+
+ For the added Mapper, the configuration given for it,
+ mapperConf, has precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper; this is done by the addMapper call for the last mapper in the
+ chain.
+
+ @param job chain job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overridden, super.configure(...) should be
+ invoked at the beginning of the overriding method.]]>
+
+
+
+
+
+
+
+
+
+ reduce(...) method of the Reducer with the
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overridden, super.close() should be
+ invoked at the end of the overriding method.]]>
+
+
+
+
+ For each record output by the Reducer, the Mapper classes are invoked in a
+ chained (or piped) fashion: the output of the first becomes the input of the
+ second, and so on until the last Mapper; the output of the last Mapper is
+ written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do not
+ need to be aware that they are executed after the Reducer or in a chain.
+ This enables having reusable specialized Mappers that can be combined to
+ perform composite operations within a single task.
+
+ Special care has to be taken when creating chains to ensure that the key/values
+ output by a Mapper are valid for the following Mapper in the chain. It is
+ assumed all Mappers and the Reducer in the chain use matching output and input
+ key and value classes, as no conversion is done by the chaining code.
+
+ Using the ChainMapper and ChainReducer classes it is possible to compose
+ Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainReducer; this is done by the setReducer or the addMapper call for the
+ last element in the chain.
+
+ ChainReducer usage pattern:
+
+
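+ A possible wiring, reconstructed as a sketch from the setReducer and
+ addMapper signatures documented above; RedClass, CMap and DMap are
+ illustrative assumptions:
+
+ JobConf reduceConf = new JobConf(false);
+ ChainReducer.setReducer(conf, RedClass.class, Text.class, Text.class,
+     Text.class, Text.class, true, reduceConf);
+
+ ChainReducer.addMapper(conf, CMap.class, Text.class, Text.class,
+     LongWritable.class, Text.class, false, new JobConf(false));
+
+ ChainReducer.addMapper(conf, DMap.class, LongWritable.class, Text.class,
+     LongWritable.class, LongWritable.class, true, new JobConf(false));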
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RecordReader's for CombineFileSplit's.
+ @see CombineFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CombineFileSplit can be used to implement {@link org.apache.hadoop.mapred.RecordReader}'s
+ that read one record per file.
+ @see org.apache.hadoop.mapred.FileSplit
+ @see CombineFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ @param freq The frequency with which records will be emitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ This will read every split at the client, which is very expensive.
+ @param freq Probability with which a key will be chosen.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ Takes the first numSamples / numSplits records from each split.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the named output is multi, false
+ if it is single. If the named output is not defined, it returns
+ false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name, it has to be a word, letters
+ and numbers only, cannot be the word 'part' as
+ that is reserved for the
+ default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+
+
+
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name, it has to be a word, letters
+ and numbers only, cannot be the word 'part' as
+ that is reserved for the
+ default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters; by default they are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs, the name of the counter is the concatenation of the named
+ output, an underscore '_', and the multiname.
+
+ @param conf job conf in which to enable or disable the counters.
+ @param enabled indicates if the counters will be enabled or not.]]>
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters; by default they are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs, the name of the counter is the concatenation of the named
+ output, an underscore '_', and the multiname.
+
+
+ @param conf the job conf to check.
+ @return TRUE if the counters are enabled, FALSE if they are disabled.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param multiName the multi name part
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+ If overridden, subclasses must invoke super.close() at the
+ end of their close() method
+
+ @throws java.io.IOException thrown if any of the MultipleOutput files
+ could not be closed properly.]]>
+
+
+
+ OutputCollector passed to
+ the map() and reduce() methods of the
+ Mapper and Reducer implementations.
+
+ Each additional output, or named output, may be configured with its own
+ OutputFormat, with its own key class and with its own value
+ class.
+
+ A named output can be a single file or a multi file. The latter is referred
+ to as a multi named output.
+
+ A multi named output is an unbound set of files all sharing the same
+ OutputFormat, key class and value class configuration.
+
+ When named outputs are used within a Mapper implementation,
+ key/values written to a name output are not part of the reduce phase, only
+ key/values written to the job OutputCollector are part of the
+ reduce phase.
+
+ MultipleOutputs supports counters; by default they are disabled. The counters
+ group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs, the name of the counter is the concatenation of the named
+ output, an underscore '_', and the multiname.
+
+ Job configuration usage pattern is:
+
+
+ JobConf conf = new JobConf();
+
+ conf.setInputPath(inDir);
+ FileOutputFormat.setOutputPath(conf, outDir);
+
+ conf.setMapperClass(MOMap.class);
+ conf.setReducerClass(MOReduce.class);
+ ...
+
+ // Defines additional single text based output 'text' for the job
+ MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
+ LongWritable.class, Text.class);
+
+ // Defines additional multi sequencefile based output 'sequence' for the
+ // job
+ MultipleOutputs.addMultiNamedOutput(conf, "seq",
+ SequenceFileOutputFormat.class,
+ LongWritable.class, Text.class);
+ ...
+
+ JobClient jc = new JobClient();
+ RunningJob job = jc.submitJob(conf);
+
+ ...
+
+
+ Usage pattern in the Reducer is:
+
+
+ public class MOReduce implements
+ Reducer<WritableComparable, Writable> {
+ private MultipleOutputs mos;
+
+ public void configure(JobConf conf) {
+ ...
+ mos = new MultipleOutputs(conf);
+ }
+
+ public void reduce(WritableComparable key, Iterator<Writable> values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ ...
+ mos.getCollector("text", reporter).collect(key, new Text("Hello"));
+ mos.getCollector("seq", "A", reporter).collect(key, new Text("Bye"));
+ mos.getCollector("seq", "B", reporter).collect(key, new Text("Chau"));
+ ...
+ }
+
+ public void close() throws IOException {
+ mos.close();
+ ...
+ }
+
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It can be used instead of the default implementation,
+ {@link org.apache.hadoop.mapred.MapRunner}, when the Map operation is not CPU
+ bound, in order to improve throughput.
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of threads the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property; its default
+ value is 10 threads.
+
+ Alternatively, the properties can be set in the configuration with proper
+ values.
+
+ @see DBConfiguration#configureDB(JobConf, String, String, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String, String, String...)
+ @see DBOutputFormat#setOutput(JobConf, String, String...)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 20070101 AND length > 0)'
+ @param orderBy the fieldNames in the orderBy clause.
+ @param fieldNames The field names in the table
+ @see #setInput(JobConf, Class, String, String)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBInputFormat emits LongWritables containing the record number as
+ key and DBWritables as value.
+
+ The SQL query and the input class can be specified using one of the two
+ setInput methods.]]>
+
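+ A hedged sketch of wiring DBInputFormat with the methods referenced above;
+ the driver, URL, credentials, table and record class are illustrative
+ assumptions:
+
+ JobConf job = new JobConf(MyDbJob.class);           // hypothetical driver class
+ DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
+     "jdbc:mysql://localhost/mydb", "user", "password");
+ DBInputFormat.setInput(job, MyRecord.class, "MyTable",
+     null /* conditions */, "counter" /* orderBy */,
+     "counter", "timestamp");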
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {@link DBOutputFormat} accepts <key,value> pairs, where
+ key has a type extending DBWritable. Returned {@link RecordWriter}
+ writes only the key to the database with a batch SQL query.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBWritable. DBWritable, is similar to {@link Writable}
+ except that the {@link #write(PreparedStatement)} method takes a
+ {@link PreparedStatement}, and {@link #readFields(ResultSet)}
+ takes a {@link ResultSet}.
+
+ Implementations are responsible for writing the fields of the object
+ to PreparedStatement, and reading the fields of the object from the
+ ResultSet.
+
+
Example:
+ If we have the following table in the database :
+
+ CREATE TABLE MyTable (
+ counter INTEGER NOT NULL,
+ timestamp BIGINT NOT NULL
+ );
+
+ then we can read/write the tuples from/to the table with :
+
+ public class MyWritable implements Writable, DBWritable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ //Writable#write() implementation
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ //Writable#readFields() implementation
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public void write(PreparedStatement statement) throws SQLException {
+ statement.setInt(1, counter);
+ statement.setLong(2, timestamp);
+ }
+
+ public void readFields(ResultSet resultSet) throws SQLException {
+ counter = resultSet.getInt(1);
+ timestamp = resultSet.getLong(2);
+ }
+ }
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For example, a split could
+ be an <input-file-path, start, offset> tuple. The InputFormat
+ also creates the {@link RecordReader} to read the {@link InputSplit}.
+
+ @param context job configuration.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input size are insufficient for many
+ applications since record boundaries are to be respected. In such cases, the
+ application also has to implement a {@link RecordReader}, which takes on the
+ responsibility of respecting record boundaries and presenting a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see FileInputFormat]]>
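+
+ For instance, a hedged sketch of raising the lower bound on split size via
+ the property named above (the value is illustrative):
+
+ JobConf job = new JobConf();
+ job.setLong("mapred.min.split.size", 64L * 1024 * 1024);  // 64 MB minimum split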
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputFormat to use
+ @throws IllegalStateException if the job is submitted]]>
+
+
+
+
+
+
+ OutputFormat to use
+ @throws IllegalStateException if the job is submitted]]>
+
+
+
+
+
+
+ Mapper to use
+ @throws IllegalStateException if the job is submitted]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reducer to use
+ @throws IllegalStateException if the job is submitted]]>
+
+
+
+
+
+
+ Partitioner to use
+ @throws IllegalStateException if the job is submitted]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ JobTracker is lost]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1.
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An example JobID is
+ job_200707121733_0003, which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or the {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see org.apache.hadoop.mapred.JobTracker#getNewJobId()
+ @see org.apache.hadoop.mapred.JobTracker#getStartTime()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the key input type to the Mapper
+ @param the value input type to the Mapper
+ @param the key output type from the Mapper
+ @param the value output type from the Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link Configuration} for
+ the job via the {@link JobContext#getConfiguration()}.
+
+
The framework first calls
+ {@link #setup(org.apache.hadoop.mapreduce.Mapper.Context)}, followed by
+ {@link #map(Object, Object, Context)}
+ for each key/value pair in the InputSplit. Finally
+ {@link #cleanup(Context)} is called.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the sorting and grouping by
+ specifying two key {@link RawComparator} classes.
+
+
The Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link Job#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the Configuration.
+
+
If the job has zero
+ reduces then the output of the Mapper is directly written
+ to the {@link OutputFormat} without sorting by keys.
+
+
Example:
+
+ public class TokenCounterMapper
+ extends Mapper
+
+
Applications may override the {@link #run(Context)} method to exert
+ greater control over map processing, e.g. multi-threaded Mappers
+ etc.
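+
+ As a hedged sketch, such an override inside a Mapper subclass typically
+ mirrors the default setup/map/cleanup loop:
+
+ @Override
+ public void run(Context context) throws IOException, InterruptedException {
+   setup(context);
+   while (context.nextKeyValue()) {
+     map(context.getCurrentKey(), context.getCurrentValue(), context);
+   }
+   cleanup(context);
+ }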
The Map-Reduce framework relies on the OutputCommitter of
+ the job to:
+
+
+ Setup the job during initialization. For example, create the temporary
+ output directory for the job during the initialization of the job.
+
+
+ Cleanup the job after the job completion. For example, remove the
+ temporary output directory after the job completion.
+
+
+ Setup the task temporary output.
+
+
+ Check whether a task needs a commit. This is to avoid the commit
+ procedure if a task does not need commit.
+
+
+ Commit of the task output.
+
+
+ Discard the task commit.
+
+
+
+ @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
+ @see JobContext
+ @see TaskAttemptContext]]>
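+
+ A skeletal committer covering the duties listed above might look like this
+ sketch (the no-op bodies and class name are assumptions):
+
+ public class NullOutputCommitter extends OutputCommitter {
+   public void setupJob(JobContext context) { }           // job initialization
+   public void cleanupJob(JobContext context) { }         // job cleanup
+   public void setupTask(TaskAttemptContext context) { }  // task temporary output
+   public boolean needsTaskCommit(TaskAttemptContext context) {
+     return false;                                        // nothing to commit
+   }
+   public void commitTask(TaskAttemptContext context) { }
+   public void abortTask(TaskAttemptContext context) { }  // discard task commit
+ }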
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when the
+ job is submitted. Typically it checks that the output does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param context information about the job
+ @throws IOException when output should not be attempted]]>
+
+
+
+
+
+
+
+
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For example, check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
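+
+ A hash-based partitioner in the spirit described above might look like this
+ sketch (the class name is an assumption):
+
+ public class HashingPartitioner<K, V> extends Partitioner<K, V> {
+   @Override
+   public int getPartition(K key, V value, int numPartitions) {
+     // Mask off the sign bit so the modulus is never negative.
+     return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
+   }
+ }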
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RecordWriter to future operations.
+
+ @param context the context of the task
+ @throws IOException]]>
+
+
+
+ RecordWriter writes the output <key, value> pairs
+ to an output file.
+
+
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the input keys
+ @param the class of the input values
+ @param the class of the output keys
+ @param the class of the output values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reducer implementations
+ can access the {@link Configuration} for the job via the
+ {@link JobContext#getConfiguration()} method.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
The Reducer copies the sorted output from each
+ {@link Mapper} using HTTP across the network.
+
+
+
+
Sort
+
+
The framework merge sorts Reducer inputs by
+ keys
+ (since different Mappers may have output the same key).
+
+
The shuffle and sort phases occur simultaneously, i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
To achieve a secondary sort on the values returned by the value
+ iterator, the application should extend the key with the secondary
+ key and define a grouping comparator. The keys will be sorted using the
+ entire key, but will be grouped using the grouping comparator to decide
+ which keys and values are sent in the same call to reduce. The grouping
+ comparator is specified via
+ {@link Job#setGroupingComparatorClass(Class)}. The sort order is
+ controlled by
+ {@link Job#setSortComparatorClass(Class)}.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
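+ A sketch of this wiring with the new Job API (the partitioner and
+ comparator class names are hypothetical placeholders):
+
+ job.setPartitionerClass(ChecksumPartitioner.class);
+ job.setSortComparatorClass(ChecksumThenPagerankComparator.class);
+ job.setGroupingComparatorClass(ChecksumComparator.class);
+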
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterable, Context)}
+ method is called for each <key, (collection of values)> in
+ the sorted inputs.
+
The output of the reduce task is typically written to a
+ {@link RecordWriter} via
+ {@link Context#write(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class IntSumReducer<Key> extends Reducer<Key, IntWritable,
+ Key, IntWritable> {
+ private IntWritable result = new IntWritable();
+
+ public void reduce(Key key, Iterable<IntWritable> values,
+ Context context) throws IOException, InterruptedException {
+ int sum = 0;
+ for (IntWritable val : values) {
+ sum += val.get();
+ }
+ result.set(sum);
+ context.write(key, result);
+ }
+ }
+
+ Applications should never construct or parse TaskAttemptID strings,
+ but rather use appropriate constructors or the {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An example TaskID is
+ task_200707121733_0003_m_000005, which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings,
+ but rather use appropriate constructors or the {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the input key type for the task
+ @param the input value type for the task
+ @param the output key type for the task
+ @param the output value type for the task]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param context the job context
+ @param filename the file name to check
+ @return is this file splitable?]]>
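+
+ For example, a subclass that keeps whole files in a single map task might
+ look like this sketch (the class name is an assumption):
+
+ public class WholeFileTextInputFormat extends TextInputFormat {
+   @Override
+   protected boolean isSplitable(JobContext context, Path file) {
+     return false;  // never split: one map task per input file
+   }
+ }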
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobContext)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(JobContext, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the map's input key type
+ @param the map's input value type
+ @param the map's output key type
+ @param the map's output value type
+ @param job the job
+ @return the mapper class to run]]>
+
+
+
+
+
+
+ the map input key type
+ @param the map input value type
+ @param the map output key type
+ @param the map output value type
+ @param job the job to modify
+ @param cls the class to use as the mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ It can be used instead of the default implementation,
+ {@link org.apache.hadoop.mapred.MapRunner}, when the Map operation is not CPU
+ bound, in order to improve throughput.
+
+ Mapper implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured with the mapper to use via
+ {@link #setMapperClass(Configuration, Class)} and
+ the number of threads the thread-pool can use with the
+ {@link #getNumberOfThreads(Configuration)} method. The default
+ value is 10 threads.
+
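+ A hedged configuration sketch (the inner mapper class is hypothetical, and
+ the thread-count property is assumed to match the old-API name):
+
+ Configuration conf = job.getConfiguration();
+ MultithreadedMapper.setMapperClass(conf, WordCountMapper.class);
+ conf.setInt("mapred.map.multithreadedrunner.threads", 8);
+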
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in a work directory during execution
+ of the task, i.e. via
+ {@link #getWorkOutputPath(TaskInputOutputContext)}, and
+ the framework will move them out similarly - thus the application doesn't
+ have to pick unique paths per task-attempt.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
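+
+ A hedged sketch of creating such a side-file in the work directory so the
+ framework promotes it on success (the file name is illustrative):
+
+ Path workDir = FileOutputFormat.getWorkOutputPath(context);
+ Path sideFile = new Path(workDir, "side-data.txt");
+ FSDataOutputStream out = sideFile.getFileSystem(context.getConfiguration())
+     .create(sideFile);
+ out.writeBytes("auxiliary output\n");
+ out.close();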
+
+
+
+
+
+
+
+
+
+ The path can be used to create custom files from within the map and
+ reduce tasks. The path name will be unique for each task. The path parent
+ will be the job output directory.
+
+
This method uses the {@link #getUniqueFile} method to make the file name
+ unique for the task.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/core/lib/hadoop-0.20.0/lib/jets3t-0.6.1.jar b/core/lib/hadoop-0.20.0/lib/jets3t-0.6.1.jar
new file mode 100644
index 0000000000..e4048dd685
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/jets3t-0.6.1.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/jetty-6.1.14.jar b/core/lib/hadoop-0.20.0/lib/jetty-6.1.14.jar
new file mode 100644
index 0000000000..8c503bea21
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/jetty-6.1.14.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/jetty-util-6.1.14.jar b/core/lib/hadoop-0.20.0/lib/jetty-util-6.1.14.jar
new file mode 100644
index 0000000000..8f924bb147
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/jetty-util-6.1.14.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/jsp-2.1/jsp-2.1.jar b/core/lib/hadoop-0.20.0/lib/jsp-2.1/jsp-2.1.jar
new file mode 100644
index 0000000000..bfdb566c13
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/jsp-2.1/jsp-2.1.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/jsp-2.1/jsp-api-2.1.jar b/core/lib/hadoop-0.20.0/lib/jsp-2.1/jsp-api-2.1.jar
new file mode 100644
index 0000000000..ac3a7a8f7e
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/jsp-2.1/jsp-api-2.1.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/junit-3.8.1.jar b/core/lib/hadoop-0.20.0/lib/junit-3.8.1.jar
new file mode 100644
index 0000000000..674d71e89e
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/junit-3.8.1.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/kfs-0.2.2.jar b/core/lib/hadoop-0.20.0/lib/kfs-0.2.2.jar
new file mode 100644
index 0000000000..aa32e74baf
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/kfs-0.2.2.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/kfs-0.2.LICENSE.txt b/core/lib/hadoop-0.20.0/lib/kfs-0.2.LICENSE.txt
new file mode 100644
index 0000000000..d645695673
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/kfs-0.2.LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/core/lib/hadoop-0.20.0/lib/log4j-1.2.15.jar b/core/lib/hadoop-0.20.0/lib/log4j-1.2.15.jar
new file mode 100644
index 0000000000..c930a6ab4d
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/log4j-1.2.15.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/native/.DS_Store b/core/lib/hadoop-0.20.0/lib/native/.DS_Store
new file mode 100644
index 0000000000..ff86c6f7f2
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/native/.DS_Store differ
diff --git a/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.a b/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.a
new file mode 100644
index 0000000000..d8d90cf067
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.a differ
diff --git a/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.la b/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.la
new file mode 100644
index 0000000000..2e772fb4ae
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.la
@@ -0,0 +1,35 @@
+# libhadoop.la - a libtool library file
+# Generated by ltmain.sh - GNU libtool 1.5.22 (1.1220.2.365 2005/12/18 22:14:06)
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# The name that we can dlopen(3).
+dlname='libhadoop.so.1'
+
+# Names of this library.
+library_names='libhadoop.so.1.0.0 libhadoop.so.1 libhadoop.so'
+
+# The name of the static archive.
+old_library='libhadoop.a'
+
+# Libraries that this one depends upon.
+dependency_libs=' -L/home/hadoopqa/tools/java/latest1.6-64/jre/lib/amd64/server -ljvm -ldl'
+
+# Version information for libhadoop.
+current=1
+age=0
+revision=0
+
+# Is this an already installed library?
+installed=yes
+
+# Should we warn about portability when linking against -modules?
+shouldnotlink=no
+
+# Files to dlopen/dlpreopen
+dlopen=''
+dlpreopen=''
+
+# Directory that this library needs to be installed in:
+libdir='/usr/local/lib'
diff --git a/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so b/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so
new file mode 100644
index 0000000000..fb2cbad0b5
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so differ
diff --git a/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so.1 b/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so.1
new file mode 100644
index 0000000000..fb2cbad0b5
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so.1 differ
diff --git a/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so.1.0.0 b/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so.1.0.0
new file mode 100644
index 0000000000..fb2cbad0b5
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/native/Linux-amd64-64/libhadoop.so.1.0.0 differ
diff --git a/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.a b/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.a
new file mode 100644
index 0000000000..068d2d6bf6
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.a differ
diff --git a/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.la b/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.la
new file mode 100644
index 0000000000..14941670b4
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.la
@@ -0,0 +1,35 @@
+# libhadoop.la - a libtool library file
+# Generated by ltmain.sh - GNU libtool 1.5.22 (1.1220.2.365 2005/12/18 22:14:06)
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# The name that we can dlopen(3).
+dlname='libhadoop.so.1'
+
+# Names of this library.
+library_names='libhadoop.so.1.0.0 libhadoop.so.1 libhadoop.so'
+
+# The name of the static archive.
+old_library='libhadoop.a'
+
+# Libraries that this one depends upon.
+dependency_libs=' -L/home/hadoopqa/tools/java/latest1.6-32/jre/lib/i386/server -ljvm -ldl'
+
+# Version information for libhadoop.
+current=1
+age=0
+revision=0
+
+# Is this an already installed library?
+installed=yes
+
+# Should we warn about portability when linking against -modules?
+shouldnotlink=no
+
+# Files to dlopen/dlpreopen
+dlopen=''
+dlpreopen=''
+
+# Directory that this library needs to be installed in:
+libdir='/usr/local/lib'
diff --git a/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so b/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so
new file mode 100644
index 0000000000..e3acc2b220
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so differ
diff --git a/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so.1 b/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so.1
new file mode 100644
index 0000000000..e3acc2b220
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so.1 differ
diff --git a/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so.1.0.0 b/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so.1.0.0
new file mode 100644
index 0000000000..e3acc2b220
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/native/Linux-i386-32/libhadoop.so.1.0.0 differ
diff --git a/core/lib/hadoop-0.20.0/lib/oro-2.0.8.jar b/core/lib/hadoop-0.20.0/lib/oro-2.0.8.jar
new file mode 100644
index 0000000000..23488d2600
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/oro-2.0.8.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/servlet-api-2.5-6.1.14.jar b/core/lib/hadoop-0.20.0/lib/servlet-api-2.5-6.1.14.jar
new file mode 100644
index 0000000000..6d7404fb72
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/servlet-api-2.5-6.1.14.jar differ
diff --git a/core/lib/hadoop-0.20.0/lib/xmlenc-0.52.jar b/core/lib/hadoop-0.20.0/lib/xmlenc-0.52.jar
new file mode 100644
index 0000000000..ec568b4c9e
Binary files /dev/null and b/core/lib/hadoop-0.20.0/lib/xmlenc-0.52.jar differ
diff --git a/core/lib/hadoop-0.20.0/librecordio/librecordio.a b/core/lib/hadoop-0.20.0/librecordio/librecordio.a
new file mode 100644
index 0000000000..49f7c22d1e
Binary files /dev/null and b/core/lib/hadoop-0.20.0/librecordio/librecordio.a differ
diff --git a/core/lib/hadoop-0.20.0/webapps/datanode/WEB-INF/web.xml b/core/lib/hadoop-0.20.0/webapps/datanode/WEB-INF/web.xml
new file mode 100644
index 0000000000..c271b62815
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/webapps/datanode/WEB-INF/web.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!DOCTYPE web-app
+    PUBLIC "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
+    "http://java.sun.com/dtd/web-app_2_3.dtd">
+
+<web-app>
+
+<servlet>
+    <servlet-name>org.apache.hadoop.hdfs.server.datanode.browseDirectory_jsp</servlet-name>
+    <servlet-class>org.apache.hadoop.hdfs.server.datanode.browseDirectory_jsp</servlet-class>
+</servlet>
+
+<servlet>
+    <servlet-name>org.apache.hadoop.hdfs.server.datanode.tail_jsp</servlet-name>
+    <servlet-class>org.apache.hadoop.hdfs.server.datanode.tail_jsp</servlet-class>
+</servlet>
+
+<servlet>
+    <servlet-name>org.apache.hadoop.hdfs.server.datanode.browseBlock_jsp</servlet-name>
+    <servlet-class>org.apache.hadoop.hdfs.server.datanode.browseBlock_jsp</servlet-class>
+</servlet>
+
+<servlet-mapping>
+    <servlet-name>org.apache.hadoop.hdfs.server.datanode.browseDirectory_jsp</servlet-name>
+    <url-pattern>/browseDirectory.jsp</url-pattern>
+</servlet-mapping>
+
+<servlet-mapping>
+    <servlet-name>org.apache.hadoop.hdfs.server.datanode.tail_jsp</servlet-name>
+    <url-pattern>/tail.jsp</url-pattern>
+</servlet-mapping>
+
+<servlet-mapping>
+    <servlet-name>org.apache.hadoop.hdfs.server.datanode.browseBlock_jsp</servlet-name>
+    <url-pattern>/browseBlock.jsp</url-pattern>
+</servlet-mapping>
+
+</web-app>
diff --git a/core/lib/hadoop-0.20.0/webapps/hdfs/WEB-INF/web.xml b/core/lib/hadoop-0.20.0/webapps/hdfs/WEB-INF/web.xml
new file mode 100644
index 0000000000..40a73d5293
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/webapps/hdfs/WEB-INF/web.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!DOCTYPE web-app
+    PUBLIC "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
+    "http://java.sun.com/dtd/web-app_2_3.dtd">
+
+<web-app>
+
+<servlet>
+    <servlet-name>org.apache.hadoop.hdfs.server.namenode.nn_005fbrowsedfscontent_jsp</servlet-name>
+    <servlet-class>org.apache.hadoop.hdfs.server.namenode.nn_005fbrowsedfscontent_jsp</servlet-class>
+</servlet>
+
+<servlet>
+    <servlet-name>org.apache.hadoop.hdfs.server.namenode.dfsnodelist_jsp</servlet-name>
+    <servlet-class>org.apache.hadoop.hdfs.server.namenode.dfsnodelist_jsp</servlet-class>
+</servlet>
+
+<servlet>
+    <servlet-name>org.apache.hadoop.hdfs.server.namenode.dfshealth_jsp</servlet-name>
+    <servlet-class>org.apache.hadoop.hdfs.server.namenode.dfshealth_jsp</servlet-class>
+</servlet>
+
+<servlet-mapping>
+    <servlet-name>org.apache.hadoop.hdfs.server.namenode.nn_005fbrowsedfscontent_jsp</servlet-name>
+    <url-pattern>/nn_browsedfscontent.jsp</url-pattern>
+</servlet-mapping>
+
+<servlet-mapping>
+    <servlet-name>org.apache.hadoop.hdfs.server.namenode.dfsnodelist_jsp</servlet-name>
+    <url-pattern>/dfsnodelist.jsp</url-pattern>
+</servlet-mapping>
+
+<servlet-mapping>
+    <servlet-name>org.apache.hadoop.hdfs.server.namenode.dfshealth_jsp</servlet-name>
+    <url-pattern>/dfshealth.jsp</url-pattern>
+</servlet-mapping>
+
+</web-app>
diff --git a/core/lib/hadoop-0.20.0/webapps/hdfs/index.html b/core/lib/hadoop-0.20.0/webapps/hdfs/index.html
new file mode 100644
index 0000000000..b9ad74218a
--- /dev/null
+++ b/core/lib/hadoop-0.20.0/webapps/hdfs/index.html
@@ -0,0 +1,20 @@
+<meta HTTP-EQUIV="REFRESH" content="0;url=dfshealth.jsp"/>
+<html>
+
+<head>
+<title>Hadoop Administration</title>
+</head>
+
+<body>
+
+<h1>Hadoop Administration</h1>
+
+<ul>
+
+<li><a href="dfshealth.jsp">DFS Health/Status</a></li>
+
+</ul>
+
+</body>
+
+</html>
+
+ * The main public entry points are compile(),
+ * interpret(), and bind().
+ * The compile() method loads a
+ * complete Scala file. The interpret() method executes one
+ * line of Scala code at the request of the user. The bind()
+ * method binds an object to a variable that can then be used by later
+ * interpreted code.
+ *
+ *
+ * The overall approach is based on compiling the requested code and then
+ * using a Java classloader and Java reflection to run the code
+ * and access its results.
+ *
+ *
+ * In more detail, a single compiler instance is used
+ * to accumulate all successfully compiled or interpreted Scala code. To
+ * "interpret" a line of code, the compiler generates a fresh object that
+ * includes the line of code and which has public member(s) to export
+ * all variables defined by that code. To extract the result of an
+ * interpreted line to show the user, a second "result object" is created
+ * which imports the variables exported by the above object and then
+ * exports a single member named "scala_repl_result". To accommodate user expressions
+ * that read from variables or methods defined in previous statements, "import"
+ * statements are used.
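+ *
+ * For illustration only (a schematic sketch with made-up names, not the
+ * exact code this class generates), interpreting `val x = 5` produces
+ * something shaped like:
+ * {{{
+ *   object line1 { val x = 5 }            // wrapper exporting the user's x
+ *   object line1$result {                  // the "result object"
+ *     import line1.x
+ *     val scala_repl_result = x.toString
+ *   }
+ * }}}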
+ *
+ *
+ * This interpreter shares the strengths and weaknesses of using the
+ * full compiler-to-Java. The main strength is that interpreted code
+ * behaves exactly as does compiled code, including running at full speed.
+ * The main weakness is that redefining classes and methods is not handled
+ * properly, because rebinding at the Java level is technically difficult.
+ *
+ *
+ * @author Moez A. Abdel-Gawad
+ * @author Lex Spoon
+ */
+class SparkInterpreter(val settings: Settings, out: PrintWriter) {
+ repl =>
+
+ def println(x: Any) = {
+ out.println(x)
+ out.flush()
+ }
+
+ /** construct an interpreter that reports to Console */
+ def this(settings: Settings) = this(settings, new NewLinePrintWriter(new ConsoleWriter, true))
+ def this() = this(new Settings())
+
+ val SPARK_DEBUG_REPL: Boolean = (System.getenv("SPARK_DEBUG_REPL") == "1")
+
+ /** Local directory to save .class files to */
+ val outputDir = {
+ val tmp = System.getProperty("java.io.tmpdir")
+ val rootDir = System.getProperty("spark.repl.classdir", tmp)
+ Utils.createTempDir(rootDir)
+ }
+ if (SPARK_DEBUG_REPL) {
+ println("Output directory: " + outputDir)
+ }
+
+ /** Scala compiler virtual directory for outputDir */
+ //val virtualDirectory = new VirtualDirectory("(memory)", None)
+ val virtualDirectory = new PlainFile(outputDir)
+
+ /** Jetty server that will serve our classes to worker nodes */
+ val classServer = new HttpServer(outputDir)
+
+ // Start the classServer and store its URI in a spark system property
+ // (which will be passed to executors so that they can connect to it)
+ classServer.start()
+ System.setProperty("spark.repl.class.uri", classServer.uri)
+ if (SPARK_DEBUG_REPL) {
+ println("Class server started, URI = " + classServer.uri)
+ }
+
+ /** reporter */
+ object reporter extends ConsoleReporter(settings, null, out) {
+ override def printMessage(msg: String) {
+ out println clean(msg)
+ out.flush()
+ }
+ }
+
+ /** We're going to go to some trouble to initialize the compiler asynchronously.
+ * It's critical that nothing call into it until it's been initialized or we will
+ * run into unrecoverable issues, but the perceived repl startup time goes
+ * through the roof if we wait for it. So we initialize it with a future and
+ * use a lazy val to ensure that any attempt to use the compiler object waits
+ * on the future.
+ */
+ private val _compiler: Global = newCompiler(settings, reporter)
+ private def _initialize(): Boolean = {
+ val source = """
+ |// this is assembled to force the loading of approximately the
+ |// classes which will be loaded on the first expression anyway.
+ |class $repl_$init {
+ | val x = "abc".reverse.length + (5 max 5)
+ | scala.runtime.ScalaRunTime.stringOf(x)
+ |}
+ |""".stripMargin
+
+ try {
+ new _compiler.Run() compileSources List(new BatchSourceFile("<init>", source))
+ if (isReplDebug || settings.debug.value)
+ println("Repl compiler initialized.")
+ true
+ }
+ catch {
+ case MissingRequirementError(msg) => println("""
+ |Failed to initialize compiler: %s not found.
+ |** Note that as of 2.8 scala does not assume use of the java classpath.
+ |** For the old behavior pass -usejavacp to scala, or if using a Settings
+ |** object programmatically, settings.usejavacp.value = true.""".stripMargin.format(msg)
+ )
+ false
+ }
+ }
+
+ // set up initialization future
+ private var _isInitialized: () => Boolean = null
+ def initialize() = synchronized {
+ if (_isInitialized == null)
+ _isInitialized = scala.concurrent.ops future _initialize()
+ }
+
+ /** the public compiler: goes through the initialization future */
+ lazy val compiler: Global = {
+ initialize()
+
+ // blocks until initialization is complete; false means catastrophic failure
+ if (_isInitialized()) _compiler
+ else null
+ }
+
+ import compiler.{ Traverser, CompilationUnit, Symbol, Name, Type }
+ import compiler.{
+ Tree, TermTree, ValOrDefDef, ValDef, DefDef, Assign, ClassDef,
+ ModuleDef, Ident, Select, TypeDef, Import, MemberDef, DocDef,
+ ImportSelector, EmptyTree, NoType }
+ import compiler.{ nme, newTermName, newTypeName }
+ import nme.{
+ INTERPRETER_VAR_PREFIX, INTERPRETER_SYNTHVAR_PREFIX, INTERPRETER_LINE_PREFIX,
+ INTERPRETER_IMPORT_WRAPPER, INTERPRETER_WRAPPER_SUFFIX, USCOREkw
+ }
+
+ import compiler.definitions
+ import definitions.{ EmptyPackage, getMember }
+
+ /** whether to print out result lines */
+ private[repl] var printResults: Boolean = true
+
+ /** Temporarily be quiet */
+ def beQuietDuring[T](operation: => T): T = {
+ val wasPrinting = printResults
+ ultimately(printResults = wasPrinting) {
+ printResults = false
+ operation
+ }
+ }
+
+ /** whether to bind the lastException variable */
+ private var bindLastException = true
+
+ /** Temporarily stop binding lastException */
+ def withoutBindingLastException[T](operation: => T): T = {
+ val wasBinding = bindLastException
+ ultimately(bindLastException = wasBinding) {
+ bindLastException = false
+ operation
+ }
+ }
+
+ /** interpreter settings */
+ lazy val isettings = new SparkInterpreterSettings(this)
+
+ /** Instantiate a compiler. Subclasses can override this to
+ * change the compiler class used by this interpreter. */
+ protected def newCompiler(settings: Settings, reporter: Reporter) = {
+ settings.outputDirs setSingleOutput virtualDirectory
+ new Global(settings, reporter)
+ }
+
+ /** the compiler's classpath, as URL's */
+ lazy val compilerClasspath: List[URL] = new PathResolver(settings) asURLs
+
+ /* A single class loader is used for all commands interpreted by this Interpreter.
+ It would also be possible to create a new class loader for each command
+ to interpret. The advantages of the current approach are:
+
+ - Expressions are only evaluated one time. This is especially
+ significant for I/O, e.g. "val x = Console.readLine"
+
+ The main disadvantage is:
+
+ - Objects, classes, and methods cannot be rebound. Instead, definitions
+ shadow the old ones, and old code objects refer to the old
+ definitions.
+ */
+ private var _classLoader: ClassLoader = null
+ def resetClassLoader() = _classLoader = makeClassLoader()
+ def classLoader: ClassLoader = {
+ if (_classLoader == null)
+ resetClassLoader()
+
+ _classLoader
+ }
+ private def makeClassLoader(): ClassLoader = {
+ /*
+ val parent =
+ if (parentClassLoader == null) ScalaClassLoader fromURLs compilerClasspath
+ else new URLClassLoader(compilerClasspath, parentClassLoader)
+
+ new AbstractFileClassLoader(virtualDirectory, parent)
+ */
+ val parent =
+ if (parentClassLoader == null)
+ new java.net.URLClassLoader(compilerClasspath.toArray)
+ else
+ new java.net.URLClassLoader(compilerClasspath.toArray,
+ parentClassLoader)
+ val virtualDirUrl = new URL("file://" + virtualDirectory.path + "/")
+ new java.net.URLClassLoader(Array(virtualDirUrl), parent)
+ }
+
+ private def loadByName(s: String): Class[_] = // (classLoader tryToInitializeClass s).get
+ Class.forName(s, true, classLoader)
+
+ private def methodByName(c: Class[_], name: String): reflect.Method =
+ c.getMethod(name, classOf[Object])
+
+ protected def parentClassLoader: ClassLoader = this.getClass.getClassLoader()
+ def getInterpreterClassLoader() = classLoader
+
+ // Set the current Java "context" class loader to this interpreter's class loader
+ def setContextClassLoader() = Thread.currentThread.setContextClassLoader(classLoader)
+
+ /** the previous requests this interpreter has processed */
+ private val prevRequests = new ArrayBuffer[Request]()
+ private val usedNameMap = new HashMap[Name, Request]()
+ private val boundNameMap = new HashMap[Name, Request]()
+ private def allHandlers = prevRequests.toList flatMap (_.handlers)
+ private def allReqAndHandlers = prevRequests.toList flatMap (req => req.handlers map (req -> _))
+
+ def printAllTypeOf = {
+ prevRequests foreach { req =>
+ req.typeOf foreach { case (k, v) => Console.println(k + " => " + v) }
+ }
+ }
+
+ /** Most recent tree handled which wasn't wholly synthetic. */
+ private def mostRecentlyHandledTree: Option[Tree] = {
+ for {
+ req <- prevRequests.reverse
+ handler <- req.handlers.reverse
+ name <- handler.generatesValue
+ if !isSynthVarName(name)
+ } return Some(handler.member)
+
+ None
+ }
+
+ def recordRequest(req: Request) {
+ def tripart[T](set1: Set[T], set2: Set[T]) = {
+ val intersect = set1 intersect set2
+ List(set1 -- intersect, intersect, set2 -- intersect)
+ }
+
+ prevRequests += req
+ req.usedNames foreach (x => usedNameMap(x) = req)
+ req.boundNames foreach (x => boundNameMap(x) = req)
+
+ // XXX temporarily putting this here because of tricky initialization order issues
+ // so right now it's not bound until after you issue a command.
+ if (prevRequests.size == 1)
+ quietBind("settings", "spark.repl.SparkInterpreterSettings", isettings)
+
+ // println("\n s1 = %s\n s2 = %s\n s3 = %s".format(
+ // tripart(usedNameMap.keysIterator.toSet, boundNameMap.keysIterator.toSet): _*
+ // ))
+ }
+
+ private def keyList[T](x: collection.Map[T, _]): List[T] = x.keys.toList sortBy (_.toString)
+ def allUsedNames = keyList(usedNameMap)
+ def allBoundNames = keyList(boundNameMap)
+ def allSeenTypes = prevRequests.toList flatMap (_.typeOf.values.toList) distinct
+ def allValueGeneratingNames = allHandlers flatMap (_.generatesValue)
+ def allImplicits = partialFlatMap(allHandlers) {
+ case x: MemberHandler if x.definesImplicit => x.boundNames
+ }
+
+ /** Generates names pre0, pre1, etc. via calls to apply method */
+ class NameCreator(pre: String) {
+ private var x = -1
+ var mostRecent: String = null
+
+ def apply(): String = {
+ x += 1
+ val name = pre + x.toString
+ // make sure we don't overwrite their unwisely named res3 etc.
+ mostRecent =
+ if (allBoundNames exists (_.toString == name)) apply()
+ else name
+
+ mostRecent
+ }
+ def reset(): Unit = x = -1
+ def didGenerate(name: String) =
+ (name startsWith pre) && ((name drop pre.length) forall (_.isDigit))
+ }
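+
+ // Illustrative usage (a sketch, not part of the original source):
+ //   val res = new NameCreator("res")   // hypothetical creator for "res" names
+ //   res()                    // "res0"
+ //   res()                    // "res1" (skips any name the user already bound)
+ //   res didGenerate "res1"   // true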
+
+ /** allocate a fresh line name */
+ private lazy val lineNameCreator = new NameCreator(INTERPRETER_LINE_PREFIX)
+
+ /** allocate a fresh var name */
+ private lazy val varNameCreator = new NameCreator(INTERPRETER_VAR_PREFIX)
+
+ /** allocate a fresh internal variable name */
+ private lazy val synthVarNameCreator = new NameCreator(INTERPRETER_SYNTHVAR_PREFIX)
+
+ /** Check if a name looks like it was generated by varNameCreator */
+ private def isGeneratedVarName(name: String): Boolean = varNameCreator didGenerate name
+ private def isSynthVarName(name: String): Boolean = synthVarNameCreator didGenerate name
+ private def isSynthVarName(name: Name): Boolean = synthVarNameCreator didGenerate name.toString
+
+ def getVarName = varNameCreator()
+ def getSynthVarName = synthVarNameCreator()
+
+ /** Truncate a string if it is longer than isettings.maxPrintString */
+ private def truncPrintString(str: String): String = {
+ val maxpr = isettings.maxPrintString
+ val trailer = "..."
+
+ if (maxpr <= 0 || str.length <= maxpr) str
+ else str.substring(0, maxpr-3) + trailer
+ }
+
+ /** Clean up a string for output */
+ private def clean(str: String) = truncPrintString(
+ if (isettings.unwrapStrings && !SPARK_DEBUG_REPL) stripWrapperGunk(str)
+ else str
+ )
+
+ /** Heuristically strip interpreter wrapper prefixes
+ * from an interpreter output string.
+ * MATEI: Copied from interpreter package object
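+ * For instance (hypothetical REPL output), "line2$object.$iwC.$iwC.x"
+ * strips down to just "x".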
+ */
+ def stripWrapperGunk(str: String): String = {
+ val wrapregex = """(line[0-9]+\$object[$.])?(\$?VAL.?)*(\$iwC?(.this)?[$.])*"""
+ str.replaceAll(wrapregex, "")
+ }
+
+ /** Indent some code by the width of the scala> prompt.
+ * This way, compiler error messages read better.
+ */
+ private final val spaces = List.fill(7)(" ").mkString
+ def indentCode(code: String) = {
+ /** Heuristic to avoid indenting and thereby corrupting """-strings and XML literals. */
+ val noIndent = (code contains "\n") && (List("\"\"\"", "<code>", "/>") exists (code contains _))
+ stringFromWriter(str =>
+ for (line <- code.lines) {
+ if (!noIndent)
+ str.print(spaces)
+
+ str.print(line + "\n")
+ str.flush()
+ })
+ }
+ def indentString(s: String) = s split "\n" map (spaces + _ + "\n") mkString
+
+ implicit def name2string(name: Name) = name.toString
+
+ /** Compute imports that allow definitions from previous
+ * requests to be visible in a new request. Returns
+ * three pieces of related code:
+ *
+ * 1. An initial code fragment that should go before
+ * the code of the new request.
+ *
+ * 2. A code fragment that should go after the code
+ * of the new request.
+ *
+ * 3. An access path which can be traversed to access
+ * any bindings inside code wrapped by #1 and #2.
+ *
+ * The argument is a set of Names that need to be imported.
+ *
+ * Limitations: This method is not as precise as it could be.
+ * (1) It does not process wildcard imports to see what exactly
+ * they import.
+ * (2) If it imports any names from a request, it imports all
+ * of them, which is not really necessary.
+ * (3) It imports multiple same-named implicits, but only the
+ * last one imported is actually usable.
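+ *
+ * For illustration (a schematic sketch: assumes the wrapper name is "$iw"
+ * and that an earlier request object `line1` bound a value `x`), the three
+ * pieces could look roughly like:
+ * {{{
+ *   prepend: @serializable class $iwC {
+ *            val line1$VAL = line1.INSTANCE; import line1$VAL.`x`;
+ *   append:  }
+ *            val $iw = new $iwC;
+ *   access:  .$iw
+ * }}}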
+ */
+ private case class ComputedImports(prepend: String, append: String, access: String)
+ private def importsCode(wanted: Set[Name]): ComputedImports = {
+ /** Narrow down the list of requests from which imports
+ * should be taken. Removes requests which cannot contribute
+ * useful imports for the specified set of wanted names.
+ */
+ case class ReqAndHandler(req: Request, handler: MemberHandler) { }
+
+ def reqsToUse: List[ReqAndHandler] = {
+ /** Loop through a list of MemberHandlers and select which ones to keep.
+ * 'wanted' is the set of names that need to be imported.
+ */
+ def select(reqs: List[ReqAndHandler], wanted: Set[Name]): List[ReqAndHandler] = {
+ val isWanted = wanted contains _
+ // Single symbol imports might be implicits! See bug #1752. Rather than
+ // try to finesse this, we will mimic all imports for now.
+ def keepHandler(handler: MemberHandler) = handler match {
+ case _: ImportHandler => true
+ case x => x.definesImplicit || (x.boundNames exists isWanted)
+ }
+
+ reqs match {
+ case Nil => Nil
+ case rh :: rest if !keepHandler(rh.handler) => select(rest, wanted)
+ case rh :: rest =>
+ val importedNames = rh.handler match { case x: ImportHandler => x.importedNames ; case _ => Nil }
+ import rh.handler._
+ val newWanted = wanted ++ usedNames -- boundNames -- importedNames
+ rh :: select(rest, newWanted)
+ }
+ }
+
+ /** Flatten the handlers out and pair each with the original request */
+ select(allReqAndHandlers reverseMap { case (r, h) => ReqAndHandler(r, h) }, wanted).reverse
+ }
+
+ val code, trailingLines, accessPath = new StringBuffer
+ val currentImps = HashSet[Name]()
+
+ // add code for a new object to hold some imports
+ def addWrapper() {
+ /*
+ val impname = INTERPRETER_IMPORT_WRAPPER
+ code append "object %s {\n".format(impname)
+ trailingLines append "}\n"
+ accessPath append ("." + impname)
+ currentImps.clear
+ */
+ val impname = INTERPRETER_IMPORT_WRAPPER
+ code.append("@serializable class " + impname + "C {\n")
+ trailingLines.append("}\nval " + impname + " = new " + impname + "C;\n")
+ accessPath.append("." + impname)
+ currentImps.clear
+ }
+
+ addWrapper()
+
+ // loop through previous requests, adding imports for each one
+ for (ReqAndHandler(req, handler) <- reqsToUse) {
+ handler match {
+ // If the user entered an import, then just use it; add an import wrapping
+ // level if the import might conflict with some other import
+ case x: ImportHandler =>
+ if (x.importsWildcard || (currentImps exists (x.importedNames contains _)))
+ addWrapper()
+
+ code append (x.member.toString + "\n")
+
+ // give wildcard imports an import wrapper all their own
+ if (x.importsWildcard) addWrapper()
+ else currentImps ++= x.importedNames
+
+ // For other requests, import each bound variable.
+ // import them explicitly instead of with _, so that
+ // ambiguity errors will not be generated. Also, quote
+ // the name of the variable, so that we don't need to
+ // handle quoting keywords separately.
+ case x =>
+ for (imv <- x.boundNames) {
+ // MATEI: Commented this check out because it was messing up for case classes
+ // (trying to import them twice within the same wrapper), but that is more likely
+ // due to a miscomputation of names that makes the code think they're unique.
+ // Need to evaluate whether having so many wrappers is a bad thing.
+ /*if (currentImps contains imv) */ addWrapper()
+
+ code.append("val " + req.objectName + "$VAL = " + req.objectName + ".INSTANCE;\n")
+ code.append("import " + req.objectName + "$VAL" + req.accessPath + ".`" + imv + "`;\n")
+
+ //code append ("import %s\n" format (req fullPath imv))
+ currentImps += imv
+ }
+ }
+ }
+ // add one extra wrapper, to prevent warnings in the common case of
+ // redefining the value bound in the last interpreter request.
+ addWrapper()
+ ComputedImports(code.toString, trailingLines.toString, accessPath.toString)
+ }
+
+ /** Parse a line into a sequence of trees. Returns None if the input is incomplete. */
+ private def parse(line: String): Option[List[Tree]] = {
+ var justNeedsMore = false
+ reporter.withIncompleteHandler((pos,msg) => {justNeedsMore = true}) {
+ // simple parse: just parse it, nothing else
+ def simpleParse(code: String): List[Tree] = {
+ reporter.reset
+ val unit = new CompilationUnit(new BatchSourceFile("<console>", code))
+ val scanner = new compiler.syntaxAnalyzer.UnitParser(unit)
+
+ scanner.templateStatSeq(false)._2
+ }
+ val trees = simpleParse(line)
+
+ if (reporter.hasErrors) Some(Nil) // the result did not parse, so stop
+ else if (justNeedsMore) None
+ else Some(trees)
+ }
+ }
+
+ /** Compile an nsc SourceFile. Returns true if there are
+ * no compilation errors, or false otherwise.
+ */
+ def compileSources(sources: SourceFile*): Boolean = {
+ reporter.reset
+ new compiler.Run() compileSources sources.toList
+ !reporter.hasErrors
+ }
+
+ /** Compile a string. Returns true if there are no
+ * compilation errors, or false otherwise.
+ */
+ def compileString(code: String): Boolean =
+ compileSources(new BatchSourceFile("