Revert "[SPARK-13019][DOCS] Replace example code in mllib-statistics.md using include_example"

This reverts commit 1af8de200c4d3357bcb09e7bbc6deece00e885f2.
author: Xiangrui Meng <meng@databricks.com> 2016-03-21 17:42:30 -0700
committer: Xiangrui Meng <meng@databricks.com> 2016-03-21 17:42:30 -0700
commit: 43ef1e52bfe359f0f051a607a8dc77cc3b269508 (patch)
tree: 8b03ce50a036b684c8cb5fe0c92dc2dfa350ab90 /examples/src/main/python
parent: 3f49e0766f3a369a44e14632de68c657773b7a27 (diff)
download: spark-43ef1e52bfe359f0f051a607a8dc77cc3b269508.tar.gz
spark-43ef1e52bfe359f0f051a607a8dc77cc3b269508.tar.bz2
spark-43ef1e52bfe359f0f051a607a8dc77cc3b269508.zip
6 files changed, 0 insertions, 277 deletions
diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py
deleted file mode 100644
index 66d18f6e5d..0000000000
--- a/examples/src/main/python/mllib/correlations_example.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-import numpy as np
-
-from pyspark import SparkContext
-# $example on$
-from pyspark.mllib.stat import Statistics
-# $example off$
-
-if __name__ == "__main__":
-    sc = SparkContext(appName="CorrelationsExample")  # SparkContext
-
-    # $example on$
-    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
-    # seriesY must have the same number of partitions and cardinality as seriesX
-    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])
-
-    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
-    # If a method is not specified, Pearson's method will be used by default.
-    print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))
-
-    data = sc.parallelize(
-        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
-    )  # an RDD of Vectors
-
-    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
-    # If a method is not specified, Pearson's method will be used by default.
-    print(Statistics.corr(data, method="pearson"))
-    # $example off$
-
-    sc.stop()
diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py
deleted file mode 100644
index e566ead0d3..0000000000
--- a/examples/src/main/python/mllib/hypothesis_testing_example.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-from pyspark import SparkContext
-# $example on$
-from pyspark.mllib.linalg import Matrices, Vectors
-from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.stat import Statistics
-# $example off$
-
-if __name__ == "__main__":
-    sc = SparkContext(appName="HypothesisTestingExample")
-
-    # $example on$
-    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events
-
-    # compute the goodness of fit. If a second vector to test against
-    # is not supplied as a parameter, the test runs against a uniform distribution.
-    goodnessOfFitTestResult = Statistics.chiSqTest(vec)
-
-    # summary of the test including the p-value, degrees of freedom,
-    # test statistic, the method used, and the null hypothesis.
-    print("%s\n" % goodnessOfFitTestResult)
-
-    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix
-
-    # conduct Pearson's independence test on the input contingency matrix
-    independenceTestResult = Statistics.chiSqTest(mat)
-
-    # summary of the test including the p-value, degrees of freedom,
-    # test statistic, the method used, and the null hypothesis.
-    print("%s\n" % independenceTestResult)
-
-    obs = sc.parallelize(
-        [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
-         LabeledPoint(1.0, [1.0, 2.0, 0.0]),
-         LabeledPoint(1.0, [-1.0, 0.0, -0.5])]
-    )  # LabeledPoint(feature, label)
-
-    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
-    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
-    # against the label.
-    featureTestResults = Statistics.chiSqTest(obs)
-
-    for i, result in enumerate(featureTestResults):
-        print("Column %d:\n%s" % (i + 1, result))
-    # $example off$
-
-    sc.stop()
diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
deleted file mode 100644
index ef380dee79..0000000000
--- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-from pyspark import SparkContext
-# $example on$
-from pyspark.mllib.stat import Statistics
-# $example off$
-
-if __name__ == "__main__":
-    sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample")
-
-    # $example on$
-    parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25])
-
-    # run a KS test for the sample versus a standard normal distribution
-    testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
-    # summary of the test including the p-value, test statistic, and null hypothesis
-    # if our p-value indicates significance, we can reject the null hypothesis
-    # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
-    # a lambda to calculate the CDF is not made available in the Python API
-    print(testResult)
-    # $example off$
-
-    sc.stop()
diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py
deleted file mode 100644
index 3e8f7241a4..0000000000
--- a/examples/src/main/python/mllib/kernel_density_estimation_example.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-from pyspark import SparkContext
-# $example on$
-from pyspark.mllib.stat import KernelDensity
-# $example off$
-
-if __name__ == "__main__":
-    sc = SparkContext(appName="KernelDensityEstimationExample")  # SparkContext
-
-    # $example on$
-    # an RDD of sample data
-    data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0])
-
-    # Construct the density estimator with the sample data and a standard deviation for the Gaussian
-    # kernels
-    kd = KernelDensity()
-    kd.setSample(data)
-    kd.setBandwidth(3.0)
-
-    # Find density estimates for the given values
-    densities = kd.estimate([-1.0, 2.0, 5.0])
-    # $example off$
-
-    print(densities)
-
-    sc.stop()
diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py
deleted file mode 100644
index a13f8f08dd..0000000000
--- a/examples/src/main/python/mllib/stratified_sampling_example.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-from pyspark import SparkContext
-
-if __name__ == "__main__":
-    sc = SparkContext(appName="StratifiedSamplingExample")  # SparkContext
-
-    # $example on$
-    # an RDD of any key value pairs
-    data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')])
-
-    # specify the exact fraction desired from each key as a dictionary
-    fractions = {1: 0.1, 2: 0.6, 3: 0.3}
-
-    approxSample = data.sampleByKey(False, fractions)
-    # $example off$
-
-    for each in approxSample.collect():
-        print(each)
-
-    sc.stop()
diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py
deleted file mode 100644
index d55d1a2c2d..0000000000
--- a/examples/src/main/python/mllib/summary_statistics_example.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-from pyspark import SparkContext
-# $example on$
-import numpy as np
-
-from pyspark.mllib.stat import Statistics
-# $example off$
-
-if __name__ == "__main__":
-    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext
-
-    # $example on$
-    mat = sc.parallelize(
-        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])]
-    )  # an RDD of Vectors
-
-    # Compute column summary statistics.
-    summary = Statistics.colStats(mat)
-    print(summary.mean())  # a dense vector containing the mean value for each column
-    print(summary.variance())  # column-wise variance
-    print(summary.numNonzeros())  # number of nonzeros in each column
-    # $example off$
-
-    sc.stop()
author	Xiangrui Meng <meng@databricks.com>	2016-03-21 17:42:30 -0700
committer	Xiangrui Meng <meng@databricks.com>	2016-03-21 17:42:30 -0700
commit	43ef1e52bfe359f0f051a607a8dc77cc3b269508 (patch)
tree	8b03ce50a036b684c8cb5fe0c92dc2dfa350ab90 /examples/src/main/python
parent	3f49e0766f3a369a44e14632de68c657773b7a27 (diff)
download	spark-43ef1e52bfe359f0f051a607a8dc77cc3b269508.tar.gz spark-43ef1e52bfe359f0f051a607a8dc77cc3b269508.tar.bz2 spark-43ef1e52bfe359f0f051a607a8dc77cc3b269508.zip