author    Yanbo Liang <ybliang8@gmail.com>    2015-09-11 08:52:28 -0700
committer Xiangrui Meng <meng@databricks.com> 2015-09-11 08:52:28 -0700
commit    b01b26260625f0ba14e5f3010207666d62d93864 (patch)
tree      b3e891231cf80f750ff4016dd738f7c3266b8288 /python
parent    b656e6134fc5cd27e1fe6b6ab30fd7633cab0b14 (diff)
[SPARK-9773] [ML] [PySpark] Add Python API for MultilayerPerceptronClassifier
Add Python API for ```MultilayerPerceptronClassifier```.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #8067 from yanboliang/SPARK-9773.
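As a quick orientation before the diff, here is a minimal end-to-end sketch of the API this patch adds, mirroring the doctest below; the SparkContext/SQLContext setup lines are illustrative assumptions, not part of the patch:

```python
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import MultilayerPerceptronClassifier

sc = SparkContext("local[2]", "mlp-example")  # assumed local setup
sqlContext = SQLContext(sc)

# XOR-style training data: a label column and a 2-dimensional feature vector.
df = sqlContext.createDataFrame([
    (0.0, Vectors.dense([0.0, 0.0])),
    (1.0, Vectors.dense([0.0, 1.0])),
    (1.0, Vectors.dense([1.0, 0.0])),
    (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"])

# layers = [inputs, hidden..., outputs]: the first entry must equal the
# feature vector size, the last must equal the number of labels.
mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 5, 2], blockSize=1, seed=11)
model = mlp.fit(df)
model.transform(df).select("features", "prediction").show()
sc.stop()
```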
Diffstat (limited to 'python')
 python/pyspark/ml/classification.py | 132 +++++++++++++++++++++++++++++++++-
 1 file changed, 131 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 22bdd1b322..88815e561f 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -26,7 +26,8 @@ from pyspark.mllib.common import inherit_doc
__all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassifier',
'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel',
'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes',
- 'NaiveBayesModel']
+ 'NaiveBayesModel', 'MultilayerPerceptronClassifier',
+ 'MultilayerPerceptronClassificationModel']
@inherit_doc
@@ -755,6 +756,135 @@ class NaiveBayesModel(JavaModel):
return self._call_java("theta")
+@inherit_doc
+class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
+ HasMaxIter, HasTol, HasSeed):
+ """
+ Classifier trainer based on the Multilayer Perceptron.
+ Each layer has a sigmoid activation function; the output layer uses softmax.
+ The number of inputs has to be equal to the size of the feature vectors.
+ The number of outputs has to be equal to the total number of labels.
+
+ >>> from pyspark.mllib.linalg import Vectors
+ >>> df = sqlContext.createDataFrame([
+ ... (0.0, Vectors.dense([0.0, 0.0])),
+ ... (1.0, Vectors.dense([0.0, 1.0])),
+ ... (1.0, Vectors.dense([1.0, 0.0])),
+ ... (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"])
+ >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 5, 2], blockSize=1, seed=11)
+ >>> model = mlp.fit(df)
+ >>> model.layers
+ [2, 5, 2]
+ >>> model.weights.size
+ 27
+ >>> testDF = sqlContext.createDataFrame([
+ ... (Vectors.dense([1.0, 0.0]),),
+ ... (Vectors.dense([0.0, 0.0]),)], ["features"])
+ >>> model.transform(testDF).show()
+ +---------+----------+
+ | features|prediction|
+ +---------+----------+
+ |[1.0,0.0]| 1.0|
+ |[0.0,0.0]| 0.0|
+ +---------+----------+
+ ...
+ """
+
+ # a placeholder to make it appear in the generated doc
+ layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output " +
+ "layer. E.g., [780, 100, 10] means 780 inputs, one hidden layer with 100 " +
+ "neurons and an output layer of 10 neurons, default is [1, 1].")
+ blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " +
+ "matrices. Data is stacked within partitions. If block size is more than " +
+ "remaining data in a partition then it is adjusted to the size of this " +
+ "data. Recommended size is between 10 and 1000, default is 128.")
+
+ @keyword_only
+ def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+ maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
+ """
+ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
+ maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128)
+ """
+ super(MultilayerPerceptronClassifier, self).__init__()
+ self._java_obj = self._new_java_obj(
+ "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
+ self.layers = Param(self, "layers", "Sizes of layers from input layer to output " +
+ "layer. E.g., [780, 100, 10] means 780 inputs, one hidden layer with " +
+ "100 neurons and an output layer of 10 neurons, default is [1, 1].")
+ self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " +
+ "matrices. Data is stacked within partitions. If block size is " +
+ "more than remaining data in a partition then it is adjusted to " +
+ "the size of this data. Recommended size is between 10 and 1000, " +
+ "default is 128.")
+ self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128)
+ kwargs = self.__init__._input_kwargs
+ self.setParams(**kwargs)
+
+ @keyword_only
+ def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+ maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
+ """
+ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
+ maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128)
+ Sets params for MultilayerPerceptronClassifier.
+ """
+ kwargs = self.setParams._input_kwargs
+ if layers is None:
+ return self._set(**kwargs).setLayers([1, 1])
+ else:
+ return self._set(**kwargs)
+
+ def _create_model(self, java_model):
+ return MultilayerPerceptronClassificationModel(java_model)
+
+ def setLayers(self, value):
+ """
+ Sets the value of :py:attr:`layers`.
+ """
+ self._paramMap[self.layers] = value
+ return self
+
+ def getLayers(self):
+ """
+ Gets the value of layers or its default value.
+ """
+ return self.getOrDefault(self.layers)
+
+ def setBlockSize(self, value):
+ """
+ Sets the value of :py:attr:`blockSize`.
+ """
+ self._paramMap[self.blockSize] = value
+ return self
+
+ def getBlockSize(self):
+ """
+ Gets the value of blockSize or its default value.
+ """
+ return self.getOrDefault(self.blockSize)
+
+
+class MultilayerPerceptronClassificationModel(JavaModel):
+ """
+ Model fitted by MultilayerPerceptronClassifier.
+ """
+
+ @property
+ def layers(self):
+ """
+ Array of layer sizes, including the input and output layers.
+ """
+ return self._call_java("javaLayers")
+
+ @property
+ def weights(self):
+ """
+ Vector of the model's weights, consisting of the weights of all layers.
+ """
+ return self._call_java("weights")
+
+
if __name__ == "__main__":
import doctest
from pyspark.context import SparkContext
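For completeness, a hedged sketch of the explicit setter/getter pattern the patch adds (`setLayers`/`getLayers` and `setBlockSize`/`getBlockSize` come from the diff above; `setMaxIter` is inherited from the `HasMaxIter` mixin):

```python
# Assumes the MultilayerPerceptronClassifier from this patch is importable.
mlp = MultilayerPerceptronClassifier()
mlp.setLayers([4, 8, 3]).setBlockSize(64)  # both setters return self, so they chain
mlp.setMaxIter(200)                        # provided by the HasMaxIter mixin
assert mlp.getLayers() == [4, 8, 3]
assert mlp.getBlockSize() == 64
```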