From 7f16affa262b059580ed2775a7b05a767aa72315 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Tue, 27 Sep 2016 00:00:21 -0700 Subject: [SPARK-17138][ML][MLIB] Add Python API for multinomial logistic regression ## What changes were proposed in this pull request? Add Python API for multinomial logistic regression. - add `family` param in python api. - expose `coefficientMatrix` and `interceptVector` for `LogisticRegressionModel` - add python-side testcase for multinomial logistic regression - update python doc. ## How was this patch tested? existing and added doc tests. Author: WeichenXu Closes #14852 from WeichenXu123/add_MLOR_python. --- python/pyspark/ml/classification.py | 90 ++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 20 deletions(-) (limited to 'python') diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index b4c01fd5c4..505e7bffd1 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -67,21 +67,34 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti HasWeightCol, HasAggregationDepth, JavaMLWritable, JavaMLReadable): """ Logistic regression. - Currently, this class only supports binary classification. + This class supports multinomial logistic (softmax) and binomial logistic regression. >>> from pyspark.sql import Row >>> from pyspark.ml.linalg import Vectors - >>> df = sc.parallelize([ + >>> bdf = sc.parallelize([ ... Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)), ... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], []))]).toDF() - >>> lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight") - >>> model = lr.fit(df) - >>> model.coefficients + >>> blor = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight") + >>> blorModel = blor.fit(bdf) + >>> blorModel.coefficients DenseVector([5.5...]) - >>> model.intercept + >>> blorModel.intercept -2.68... + >>> mdf = sc.parallelize([ + ... Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)), + ... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], [])), + ... Row(label=2.0, weight=2.0, features=Vectors.dense(3.0))]).toDF() + >>> mlor = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", + ... family="multinomial") + >>> mlorModel = mlor.fit(mdf) + >>> print(mlorModel.coefficientMatrix) + DenseMatrix([[-2.3...], + [ 0.2...], + [ 2.1... ]]) + >>> mlorModel.interceptVector + DenseVector([2.0..., 0.8..., -2.8...]) >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0))]).toDF() - >>> result = model.transform(test0).head() + >>> result = blorModel.transform(test0).head() >>> result.prediction 0.0 >>> result.probability @@ -89,23 +102,23 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti >>> result.rawPrediction DenseVector([8.22..., -8.22...]) >>> test1 = sc.parallelize([Row(features=Vectors.sparse(1, [0], [1.0]))]).toDF() - >>> model.transform(test1).head().prediction + >>> blorModel.transform(test1).head().prediction 1.0 - >>> lr.setParams("vector") + >>> blor.setParams("vector") Traceback (most recent call last): ... TypeError: Method setParams forces keyword arguments. >>> lr_path = temp_path + "/lr" - >>> lr.save(lr_path) + >>> blor.save(lr_path) >>> lr2 = LogisticRegression.load(lr_path) >>> lr2.getMaxIter() 5 >>> model_path = temp_path + "/lr_model" - >>> model.save(model_path) + >>> blorModel.save(model_path) >>> model2 = LogisticRegressionModel.load(model_path) - >>> model.coefficients[0] == model2.coefficients[0] + >>> blorModel.coefficients[0] == model2.coefficients[0] True - >>> model.intercept == model2.intercept + >>> blorModel.intercept == model2.intercept True .. versionadded:: 1.3.0 @@ -117,24 +130,29 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti "e.g. if threshold is p, then thresholds must be equal to [1-p, p].", typeConverter=TypeConverters.toFloat) + family = Param(Params._dummy(), "family", + "The name of family which is a description of the label distribution to " + + "be used in the model. Supported options: auto, binomial, multinomial", + typeConverter=TypeConverters.toString) + @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction", standardization=True, weightCol=None, - aggregationDepth=2): + aggregationDepth=2, family="auto"): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ threshold=0.5, thresholds=None, probabilityCol="probability", \ rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \ - aggregationDepth=2) + aggregationDepth=2, family="auto") If the threshold and thresholds Params are both set, they must be equivalent. """ super(LogisticRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.LogisticRegression", self.uid) - self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5) + self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5, family="auto") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) self._checkThresholdConsistency() @@ -145,13 +163,13 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction", standardization=True, weightCol=None, - aggregationDepth=2): + aggregationDepth=2, family="auto"): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ threshold=0.5, thresholds=None, probabilityCol="probability", \ rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \ - aggregationDepth=2) + aggregationDepth=2, family="auto") Sets params for logistic regression. If the threshold and thresholds Params are both set, they must be equivalent. """ @@ -232,6 +250,20 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti raise ValueError("Logistic Regression getThreshold found inconsistent values for" + " threshold (%g) and thresholds (equivalent to %g)" % (t2, t)) + @since("2.1.0") + def setFamily(self, value): + """ + Sets the value of :py:attr:`family`. + """ + return self._set(family=value) + + @since("2.1.0") + def getFamily(self): + """ + Gets the value of :py:attr:`family` or its default value. + """ + return self.getOrDefault(self.family) + class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): """ @@ -244,7 +276,8 @@ class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable @since("2.0.0") def coefficients(self): """ - Model coefficients. + Model coefficients of binomial logistic regression. + An exception is thrown in the case of multinomial logistic regression. """ return self._call_java("coefficients") @@ -252,10 +285,27 @@ class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable @since("1.4.0") def intercept(self): """ - Model intercept. + Model intercept of binomial logistic regression. + An exception is thrown in the case of multinomial logistic regression. """ return self._call_java("intercept") + @property + @since("2.1.0") + def coefficientMatrix(self): + """ + Model coefficients. + """ + return self._call_java("coefficientMatrix") + + @property + @since("2.1.0") + def interceptVector(self): + """ + Model intercept. + """ + return self._call_java("interceptVector") + @property @since("2.0.0") def summary(self): -- cgit v1.2.3