/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.classification

import scala.collection.JavaConverters._
import scala.language.existentials
import scala.util.Random
import scala.util.control.Breaks._

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, Vector, Vectors}
import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions.{col, lit, rand}
import org.apache.spark.sql.types.LongType

class LogisticRegressionSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  private val seed = 42
  @transient var smallBinaryDataset: Dataset[_] = _
  @transient var smallMultinomialDataset: Dataset[_] = _
  @transient var binaryDataset: Dataset[_] = _
  @transient var multinomialDataset: Dataset[_] = _
  private val eps: Double = 1e-5

  override def beforeAll(): Unit = {
    super.beforeAll()

    smallBinaryDataset = generateLogisticInput(1.0, 1.0, nPoints = 100, seed = seed).toDF()

    smallMultinomialDataset = {
      val nPoints = 100
      val coefficients = Array(
        -0.57997, 0.912083, -0.371077,
        -0.16624, -0.84355, -0.048509)

      val xMean = Array(5.843, 3.057)
      val xVariance = Array(0.6856, 0.1899)

      val testData = generateMultinomialLogisticInput(
        coefficients, xMean, xVariance, addIntercept = true, nPoints, seed)

      val df = sc.parallelize(testData, 4).toDF()
      df.cache()
      df
    }

    binaryDataset = {
      val nPoints = 10000
      val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
      val xMean = Array(5.843, 3.057, 3.758, 1.199)
      val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)

      val testData =
        generateMultinomialLogisticInput(coefficients, xMean, xVariance,
          addIntercept = true, nPoints, seed)

      sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed))
    }

    multinomialDataset = {
      val nPoints = 10000
      val coefficients = Array(
        -0.57997, 0.912083, -0.371077, -0.819866, 2.688191,
        -0.16624, -0.84355, -0.048509, -0.301789, 4.170682)

      val xMean = Array(5.843, 3.057, 3.758, 1.199)
      val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)

      val testData = generateMultinomialLogisticInput(
        coefficients, xMean, xVariance, addIntercept = true, nPoints, seed)

      val df = sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed))
      df.cache()
      df
    }
  }

  /**
   * Enable the ignored test to export the dataset into CSV format,
   * so we can validate the training accuracy compared with R's glmnet package.
   */
  ignore("export test data into CSV format") {
    binaryDataset.rdd.map { case Row(label: Double, features: Vector, weight: Double) =>
      label + "," + weight + "," + features.toArray.mkString(",")
    }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDataset")
    multinomialDataset.rdd.map { case Row(label: Double, features: Vector, weight: Double) =>
      label + "," + weight + "," + features.toArray.mkString(",")
    }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset")
  }

  test("params") {
    ParamsSuite.checkParams(new LogisticRegression)
    val model = new LogisticRegressionModel("logReg", Vectors.dense(0.0), 0.0)
    ParamsSuite.checkParams(model)
  }

  test("logistic regression: default params") {
    val lr = new LogisticRegression
    assert(lr.getLabelCol === "label")
    assert(lr.getFeaturesCol === "features")
    assert(lr.getPredictionCol === "prediction")
    assert(lr.getRawPredictionCol === "rawPrediction")
    assert(lr.getProbabilityCol === "probability")
    assert(lr.getFamily === "auto")
    assert(!lr.isDefined(lr.weightCol))
    assert(lr.getFitIntercept)
    assert(lr.getStandardization)
    val model = lr.fit(smallBinaryDataset)
    model.transform(smallBinaryDataset)
      .select("label", "probability", "prediction", "rawPrediction")
      .collect()
    assert(model.getThreshold === 0.5)
    assert(model.getFeaturesCol === "features")
    assert(model.getPredictionCol === "prediction")
    assert(model.getRawPredictionCol === "rawPrediction")
    assert(model.getProbabilityCol === "probability")
    assert(model.intercept !== 0.0)
    assert(model.hasParent)

    MLTestingUtils.checkCopyAndUids(lr, model)
    assert(model.hasSummary)
    val copiedModel = model.copy(ParamMap.empty)
    assert(copiedModel.hasSummary)
    model.setSummary(None)
    assert(!model.hasSummary)
  }

  test("empty probabilityCol") {
    val lr = new LogisticRegression().setProbabilityCol("")
    val model = lr.fit(smallBinaryDataset)
    assert(model.hasSummary)
    // Validate that we re-insert a probability column for evaluation
    val fieldNames = model.summary.predictions.schema.fieldNames
    assert(smallBinaryDataset.schema.fieldNames.toSet.subsetOf(
      fieldNames.toSet))
    assert(fieldNames.exists(s => s.startsWith("probability_")))
  }

  test("setThreshold, getThreshold") {
    val lr = new LogisticRegression().setFamily("binomial")
    // default
    assert(lr.getThreshold === 0.5, "LogisticRegression.threshold should default to 0.5")
    withClue("LogisticRegression should not have thresholds set by default.") {
      intercept[java.util.NoSuchElementException] { // Note: The exception type may change in future
        lr.getThresholds
      }
    }
    // Set via threshold.
    // Intuition: Large threshold or large thresholds(1) makes class 0 more likely.
    lr.setThreshold(1.0)
    assert(lr.getThresholds === Array(0.0, 1.0))
    lr.setThreshold(0.0)
    assert(lr.getThresholds === Array(1.0, 0.0))
    lr.setThreshold(0.5)
    assert(lr.getThresholds === Array(0.5, 0.5))
    // Set via thresholds
    val lr2 = new LogisticRegression().setFamily("binomial")
    lr2.setThresholds(Array(0.3, 0.7))
    val expectedThreshold = 1.0 / (1.0 + 0.3 / 0.7)
    assert(lr2.getThreshold ~== expectedThreshold relTol 1E-7)
    // thresholds and threshold must be consistent
    lr2.setThresholds(Array(0.1, 0.2, 0.3))
    withClue("getThreshold should throw error if thresholds has length != 2.") {
      intercept[IllegalArgumentException] {
        lr2.getThreshold
      }
    }
    // thresholds and threshold must be consistent: values
    withClue("fit with ParamMap should throw error if threshold, thresholds do not match.") {
      intercept[IllegalArgumentException] {
        lr2.fit(smallBinaryDataset,
          lr2.thresholds -> Array(0.3, 0.7), lr2.threshold -> (expectedThreshold / 2.0))
      }
    }
    withClue("fit with ParamMap should throw error if threshold, thresholds do not match.") {
      intercept[IllegalArgumentException] {
        val lr2model = lr2.fit(smallBinaryDataset,
          lr2.thresholds -> Array(0.3, 0.7), lr2.threshold -> (expectedThreshold / 2.0))
        lr2model.getThreshold
      }
    }
  }

  test("thresholds prediction") {
    val blr = new LogisticRegression().setFamily("binomial")
    val binaryModel = blr.fit(smallBinaryDataset)

    binaryModel.setThreshold(1.0)
    val binaryZeroPredictions =
      binaryModel.transform(smallBinaryDataset).select("prediction").collect()
    assert(binaryZeroPredictions.forall(_.getDouble(0) === 0.0))

    binaryModel.setThreshold(0.0)
    val binaryOnePredictions =
      binaryModel.transform(smallBinaryDataset).select("prediction").collect()
    assert(binaryOnePredictions.forall(_.getDouble(0) === 1.0))


    val mlr = new LogisticRegression().setFamily("multinomial")
    val model = mlr.fit(smallMultinomialDataset)
    val basePredictions = model.transform(smallMultinomialDataset).select("prediction").collect()

    // should predict all zeros
    model.setThresholds(Array(1, 1000, 1000))
    val zeroPredictions = model.transform(smallMultinomialDataset).select("prediction").collect()
    assert(zeroPredictions.forall(_.getDouble(0) === 0.0))

    // should predict all ones
    model.setThresholds(Array(1000, 1, 1000))
    val onePredictions = model.transform(smallMultinomialDataset).select("prediction").collect()
    assert(onePredictions.forall(_.getDouble(0) === 1.0))

    // should predict all twos
    model.setThresholds(Array(1000, 1000, 1))
    val twoPredictions = model.transform(smallMultinomialDataset).select("prediction").collect()
    assert(twoPredictions.forall(_.getDouble(0) === 2.0))

    // constant threshold scaling is the same as no thresholds
    model.setThresholds(Array(1000, 1000, 1000))
    val scaledPredictions = model.transform(smallMultinomialDataset).select("prediction").collect()
    assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) =>
      scaled.getDouble(0) === base.getDouble(0)
    })

    // force it to use the predict method
    model.setRawPredictionCol("").setProbabilityCol("").setThresholds(Array(0, 1, 1))
    val predictionsWithPredict =
      model.transform(smallMultinomialDataset).select("prediction").collect()
    assert(predictionsWithPredict.forall(_.getDouble(0) === 0.0))
  }

  test("logistic regression doesn't fit intercept when fitIntercept is off") {
    val lr = new LogisticRegression().setFamily("binomial")
    lr.setFitIntercept(false)
    val model = lr.fit(smallBinaryDataset)
    assert(model.intercept === 0.0)

    val mlr = new LogisticRegression().setFamily("multinomial")
    mlr.setFitIntercept(false)
    val mlrModel = mlr.fit(smallMultinomialDataset)
    assert(mlrModel.interceptVector === Vectors.sparse(3, Seq()))
  }

  test("logistic regression with setters") {
    // Set params, train, and check as many params as we can.
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(1.0)
      .setThreshold(0.6)
      .setProbabilityCol("myProbability")
    val model = lr.fit(smallBinaryDataset)
    val parent = model.parent.asInstanceOf[LogisticRegression]
    assert(parent.getMaxIter === 10)
    assert(parent.getRegParam === 1.0)
    assert(parent.getThreshold === 0.6)
    assert(model.getThreshold === 0.6)

    // Modify model params, and check that the params worked.
    model.setThreshold(1.0)
    val predAllZero = model.transform(smallBinaryDataset)
      .select("prediction", "myProbability")
      .collect()
      .map { case Row(pred: Double, prob: Vector) => pred }
    assert(predAllZero.forall(_ === 0),
      s"With threshold=1.0, expected predictions to be all 0, but only" +
      s" ${predAllZero.count(_ === 0)} of ${smallBinaryDataset.count()} were 0.")
    // Call transform with params, and check that the params worked.
    val predNotAllZero =
      model.transform(smallBinaryDataset, model.threshold -> 0.0,
        model.probabilityCol -> "myProb")
        .select("prediction", "myProb")
        .collect()
        .map { case Row(pred: Double, prob: Vector) => pred }
    assert(predNotAllZero.exists(_ !== 0.0))

    // Call fit() with new params, and check as many params as we can.
    lr.setThresholds(Array(0.6, 0.4))
    val model2 = lr.fit(smallBinaryDataset, lr.maxIter -> 5, lr.regParam -> 0.1,
      lr.probabilityCol -> "theProb")
    val parent2 = model2.parent.asInstanceOf[LogisticRegression]
    assert(parent2.getMaxIter === 5)
    assert(parent2.getRegParam === 0.1)
    assert(parent2.getThreshold === 0.4)
    assert(model2.getThreshold === 0.4)
    assert(model2.getProbabilityCol === "theProb")
  }

  test("multinomial logistic regression: Predictor, Classifier methods") {
    val sqlContext = smallMultinomialDataset.sqlContext
    import sqlContext.implicits._
    val mlr = new LogisticRegression().setFamily("multinomial")

    val model = mlr.fit(smallMultinomialDataset)
    assert(model.numClasses === 3)
    val numFeatures = smallMultinomialDataset.select("features").first().getAs[Vector](0).size
    assert(model.numFeatures === numFeatures)

    val results = model.transform(smallMultinomialDataset)
    // check that raw prediction is coefficients dot features + intercept
    results.select("rawPrediction", "features").collect().foreach {
      case Row(raw: Vector, features: Vector) =>
        assert(raw.size === 3)
        val margins = Array.tabulate(3) { k =>
          var margin = 0.0
          features.foreachActive { (index, value) =>
            margin += value * model.coefficientMatrix(k, index)
          }
          margin += model.interceptVector(k)
          margin
        }
        assert(raw ~== Vectors.dense(margins) relTol eps)
    }

    // Compare rawPrediction with probability
    results.select("rawPrediction", "probability").collect().foreach {
      case Row(raw: Vector, prob: Vector) =>
        assert(raw.size === 3)
        assert(prob.size === 3)
        val max = raw.toArray.max
        val subtract = if (max > 0) max else 0.0
        val sum = raw.toArray.map(x => math.exp(x - subtract)).sum
        val probFromRaw0 = math.exp(raw(0) - subtract) / sum
        val probFromRaw1 = math.exp(raw(1) - subtract) / sum
        assert(prob(0) ~== probFromRaw0 relTol eps)
        assert(prob(1) ~== probFromRaw1 relTol eps)
        assert(prob(2) ~== 1.0 - probFromRaw1 - probFromRaw0 relTol eps)
    }

    // Compare prediction with probability
    results.select("prediction", "probability").collect().foreach {
      case Row(pred: Double, prob: Vector) =>
        val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2
        assert(pred == predFromProb)
    }

    // force it to use raw2prediction
    model.setRawPredictionCol("rawPrediction").setProbabilityCol("")
    val resultsUsingRaw2Predict =
      model.transform(smallMultinomialDataset).select("prediction").as[Double].collect()
    resultsUsingRaw2Predict.zip(results.select("prediction").as[Double].collect()).foreach {
      case (pred1, pred2) => assert(pred1 === pred2)
    }

    // force it to use probability2prediction
    model.setRawPredictionCol("").setProbabilityCol("probability")
    val resultsUsingProb2Predict =
      model.transform(smallMultinomialDataset).select("prediction").as[Double].collect()
    resultsUsingProb2Predict.zip(results.select("prediction").as[Double].collect()).foreach {
      case (pred1, pred2) => assert(pred1 === pred2)
    }

    // force it to use predict
    model.setRawPredictionCol("").setProbabilityCol("")
    val resultsUsingPredict =
      model.transform(smallMultinomialDataset).select("prediction").as[Double].collect()
    resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach {
      case (pred1, pred2) => assert(pred1 === pred2)
    }
  }

  test("binary logistic regression: Predictor, Classifier methods") {
    val sqlContext = smallBinaryDataset.sqlContext
    import sqlContext.implicits._
    val lr = new LogisticRegression().setFamily("binomial")

    val model = lr.fit(smallBinaryDataset)
    assert(model.numClasses === 2)
    val numFeatures = smallBinaryDataset.select("features").first().getAs[Vector](0).size
    assert(model.numFeatures === numFeatures)

    val results = model.transform(smallBinaryDataset)

    // Compare rawPrediction with probability
    results.select("rawPrediction", "probability").collect().foreach {
      case Row(raw: Vector, prob: Vector) =>
        assert(raw.size === 2)
        assert(prob.size === 2)
        val probFromRaw1 = 1.0 / (1.0 + math.exp(-raw(1)))
        assert(prob(1) ~== probFromRaw1 relTol eps)
        assert(prob(0) ~== 1.0 - probFromRaw1 relTol eps)
    }

    // Compare prediction with probability
    results.select("prediction", "probability").collect().foreach {
      case Row(pred: Double, prob: Vector) =>
        val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2
        assert(pred == predFromProb)
    }

    // force it to use raw2prediction
    model.setRawPredictionCol("rawPrediction").setProbabilityCol("")
    val resultsUsingRaw2Predict =
      model.transform(smallBinaryDataset).select("prediction").as[Double].collect()
    resultsUsingRaw2Predict.zip(results.select("prediction").as[Double].collect()).foreach {
      case (pred1, pred2) => assert(pred1 === pred2)
    }

    // force it to use probability2prediction
    model.setRawPredictionCol("").setProbabilityCol("probability")
    val resultsUsingProb2Predict =
      model.transform(smallBinaryDataset).select("prediction").as[Double].collect()
    resultsUsingProb2Predict.zip(results.select("prediction").as[Double].collect()).foreach {
      case (pred1, pred2) => assert(pred1 === pred2)
    }

    // force it to use predict
    model.setRawPredictionCol("").setProbabilityCol("")
    val resultsUsingPredict =
      model.transform(smallBinaryDataset).select("prediction").as[Double].collect()
    resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach {
      case (pred1, pred2) => assert(pred1 === pred2)
    }
  }

  test("coefficients and intercept methods") {
    val mlr = new LogisticRegression().setMaxIter(1).setFamily("multinomial")
    val mlrModel = mlr.fit(smallMultinomialDataset)
    val thrownCoef = intercept[SparkException] {
      mlrModel.coefficients
    }
    val thrownIntercept = intercept[SparkException] {
      mlrModel.intercept
    }
    assert(thrownCoef.getMessage().contains("use coefficientMatrix instead"))
    assert(thrownIntercept.getMessage().contains("use interceptVector instead"))

    val blr = new LogisticRegression().setMaxIter(1).setFamily("binomial")
    val blrModel = blr.fit(smallBinaryDataset)
    assert(blrModel.coefficients.size === 1)
    assert(blrModel.intercept !== 0.0)
  }

  test("sparse coefficients in LogisticAggregator") {
    val bcCoefficientsBinary = spark.sparkContext.broadcast(Vectors.sparse(2, Array(0), Array(1.0)))
    val bcFeaturesStd = spark.sparkContext.broadcast(Array(1.0))
    val binaryAgg = new LogisticAggregator(bcCoefficientsBinary, bcFeaturesStd, 2,
      fitIntercept = true, multinomial = false)
    val thrownBinary = withClue("binary logistic aggregator cannot handle sparse coefficients") {
      intercept[IllegalArgumentException] {
        binaryAgg.add(Instance(1.0, 1.0, Vectors.dense(1.0)))
      }
    }
    assert(thrownBinary.getMessage.contains("coefficients only supports dense"))

    val bcCoefficientsMulti = spark.sparkContext.broadcast(Vectors.sparse(6, Array(0), Array(1.0)))
    val multinomialAgg = new LogisticAggregator(bcCoefficientsMulti, bcFeaturesStd, 3,
      fitIntercept = true, multinomial = true)
    val thrown = withClue("multinomial logistic aggregator cannot handle sparse coefficients") {
      intercept[IllegalArgumentException] {
        multinomialAgg.add(Instance(1.0, 1.0, Vectors.dense(1.0)))
      }
    }
    assert(thrown.getMessage.contains("coefficients only supports dense"))
    bcCoefficientsBinary.destroy(blocking = false)
    bcFeaturesStd.destroy(blocking = false)
    bcCoefficientsMulti.destroy(blocking = false)
  }

  test("overflow prediction for multiclass") {
    val model = new LogisticRegressionModel("mLogReg",
      Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)),
      Vectors.dense(0.0, 0.0, 0.0), 3, true)
    val overFlowData = Seq(
      LabeledPoint(1.0, Vectors.dense(0.0, 1000.0)),
      LabeledPoint(1.0, Vectors.dense(0.0, -1.0))
    ).toDF()
    val results = model.transform(overFlowData).select("rawPrediction", "probability").collect()

    // probabilities are correct when margins have to be adjusted
    val raw1 = results(0).getAs[Vector](0)
    val prob1 = results(0).getAs[Vector](1)
    assert(raw1 === Vectors.dense(1000.0, 2000.0, 3000.0))
    assert(prob1 ~== Vectors.dense(0.0, 0.0, 1.0) absTol eps)

    // probabilities are correct when margins don't have to be adjusted
    val raw2 = results(1).getAs[Vector](0)
    val prob2 = results(1).getAs[Vector](1)
    assert(raw2 === Vectors.dense(-1.0, -2.0, -3.0))
    assert(prob2 ~== Vectors.dense(0.66524096, 0.24472847, 0.09003057) relTol eps)
  }

  test("MultiClassSummarizer") {
    val summarizer1 = (new MultiClassSummarizer)
      .add(0.0).add(3.0).add(4.0).add(3.0).add(6.0)
    assert(summarizer1.histogram === Array[Double](1, 0, 0, 2, 1, 0, 1))
    assert(summarizer1.countInvalid === 0)
    assert(summarizer1.numClasses === 7)

    val summarizer2 = (new MultiClassSummarizer)
      .add(1.0).add(5.0).add(3.0).add(0.0).add(4.0).add(1.0)
    assert(summarizer2.histogram === Array[Double](1, 2, 0, 1, 1, 1))
    assert(summarizer2.countInvalid === 0)
    assert(summarizer2.numClasses === 6)

    val summarizer3 = (new MultiClassSummarizer)
      .add(0.0).add(1.3).add(5.2).add(2.5).add(2.0).add(4.0).add(4.0).add(4.0).add(1.0)
    assert(summarizer3.histogram === Array[Double](1, 1, 1, 0, 3))
    assert(summarizer3.countInvalid === 3)
    assert(summarizer3.numClasses === 5)

    val summarizer4 = (new MultiClassSummarizer)
      .add(3.1).add(4.3).add(2.0).add(1.0).add(3.0)
    assert(summarizer4.histogram === Array[Double](0, 1, 1, 1))
    assert(summarizer4.countInvalid === 2)
    assert(summarizer4.numClasses === 4)

    val summarizer5 = new MultiClassSummarizer
    assert(summarizer5.histogram.isEmpty)
    assert(summarizer5.numClasses === 0)

    // small map merges large one
    val summarizerA = summarizer1.merge(summarizer2)
    assert(summarizerA.hashCode() === summarizer2.hashCode())
    assert(summarizerA.histogram === Array[Double](2, 2, 0, 3, 2, 1, 1))
    assert(summarizerA.countInvalid === 0)
    assert(summarizerA.numClasses === 7)

    // large map merges small one
    val summarizerB = summarizer3.merge(summarizer4)
    assert(summarizerB.hashCode() === summarizer3.hashCode())
    assert(summarizerB.histogram === Array[Double](1, 2, 2, 1, 3))
    assert(summarizerB.countInvalid === 5)
    assert(summarizerB.numClasses === 5)
  }

  test("MultiClassSummarizer with weighted samples") {
    val summarizer1 = (new MultiClassSummarizer)
      .add(label = 0.0, weight = 0.2).add(3.0, 0.8).add(4.0, 3.2).add(3.0, 1.3).add(6.0, 3.1)
    assert(Vectors.dense(summarizer1.histogram) ~==
      Vectors.dense(Array(0.2, 0, 0, 2.1, 3.2, 0, 3.1)) absTol 1E-10)
    assert(summarizer1.countInvalid === 0)
    assert(summarizer1.numClasses === 7)

    val summarizer2 = (new MultiClassSummarizer)
      .add(1.0, 1.1).add(5.0, 2.3).add(3.0).add(0.0).add(4.0).add(1.0).add(2, 0.0)
    assert(Vectors.dense(summarizer2.histogram) ~==
      Vectors.dense(Array[Double](1.0, 2.1, 0.0, 1, 1, 2.3)) absTol 1E-10)
    assert(summarizer2.countInvalid === 0)
    assert(summarizer2.numClasses === 6)

    val summarizer = summarizer1.merge(summarizer2)
    assert(Vectors.dense(summarizer.histogram) ~==
      Vectors.dense(Array(1.2, 2.1, 0.0, 3.1, 4.2, 2.3, 3.1)) absTol 1E-10)
    assert(summarizer.countInvalid === 0)
    assert(summarizer.numClasses === 7)
  }

  test("binary logistic regression with intercept without regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(true).setStandardization(true)
      .setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(true).setStandardization(false)
      .setWeightCol("weight")

    val model1 = trainer1.fit(binaryDataset)
    val model2 = trainer2.fit(binaryDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.
      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0,
      lambda = 0))
      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
      (Intercept)  2.7355261
      data.V3     -0.5734389
      data.V4      0.8911736
      data.V5     -0.3878645
      data.V6     -0.8060570

     */
    val coefficientsR = Vectors.dense(-0.5734389, 0.8911736, -0.3878645, -0.8060570)
    val interceptR = 2.7355261

    assert(model1.intercept ~== interceptR relTol 1E-3)
    assert(model1.coefficients ~= coefficientsR relTol 1E-3)

    // Without regularization, with or without standardization will converge to the same solution.
    assert(model2.intercept ~== interceptR relTol 1E-3)
    assert(model2.coefficients ~= coefficientsR relTol 1E-3)
  }

  test("binary logistic regression without intercept without regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(false).setStandardization(true)
      .setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(false).setStandardization(false)
      .setWeightCol("weight")

    val model1 = trainer1.fit(binaryDataset)
    val model2 = trainer2.fit(binaryDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0,
      lambda = 0, intercept=FALSE))
      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
      (Intercept)  .
      data.V3     -0.3448461
      data.V4      1.2776453
      data.V5     -0.3539178
      data.V6     -0.7469384

     */
    val coefficientsR = Vectors.dense(-0.3448461, 1.2776453, -0.3539178, -0.7469384)

    assert(model1.intercept ~== 0.0 relTol 1E-3)
    assert(model1.coefficients ~= coefficientsR relTol 1E-2)

    // Without regularization, with or without standardization should converge to the same solution.
    assert(model2.intercept ~== 0.0 relTol 1E-3)
    assert(model2.coefficients ~= coefficientsR relTol 1E-2)
  }

  test("binary logistic regression with intercept with L1 regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(binaryDataset)
    val model2 = trainer2.fit(binaryDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 1,
      lambda = 0.12, standardize=T))
      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
      (Intercept) -0.06775980
      data.V3      .
      data.V4      .
      data.V5     -0.03933146
      data.V6     -0.03047580

     */
    val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.03933146, -0.03047580)
    val interceptRStd = -0.06775980

    assert(model1.intercept ~== interceptRStd relTol 1E-2)
    assert(model1.coefficients ~= coefficientsRStd absTol 2E-2)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 1,
      lambda = 0.12, standardize=F))
      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
      (Intercept)  0.3544768
      data.V3      .
      data.V4      .
      data.V5     -0.1626191
      data.V6      .

     */
    val coefficientsR = Vectors.dense(0.0, 0.0, -0.1626191, 0.0)
    val interceptR = 0.3544768

    assert(model2.intercept ~== interceptR relTol 1E-2)
    assert(model2.coefficients ~== coefficientsR absTol 1E-3)
  }

  test("binary logistic regression without intercept with L1 regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(binaryDataset)
    val model2 = trainer2.fit(binaryDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="binomial", alpha = 1,
      lambda = 0.12, intercept=F, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 1,
      lambda = 0.12, intercept=F, standardize=F))
      coefficientsStd
      5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
      (Intercept)  .
      data.V3      .
      data.V4      .
      data.V5     -0.04967635
      data.V6     -0.04757757

      coefficients
      5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
      (Intercept)  .
      data.V3      .
      data.V4      .
      data.V5     -0.08433195
      data.V6      .

     */
    val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.04967635, -0.04757757)

    val coefficientsR = Vectors.dense(0.0, 0.0, -0.08433195, 0.0)

    assert(model1.intercept ~== 0.0 absTol 1E-3)
    assert(model1.coefficients ~= coefficientsRStd absTol 1E-3)
    assert(model2.intercept ~== 0.0 absTol 1E-3)
    assert(model2.coefficients ~= coefficientsR absTol 1E-3)
  }

  test("binary logistic regression with intercept with L2 regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(binaryDataset)
    val model2 = trainer2.fit(binaryDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0,
      lambda = 1.37, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0,
      lambda = 1.37, standardize=F))
      coefficientsStd
      5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
      (Intercept)  0.12707703
      data.V3     -0.06980967
      data.V4      0.10803933
      data.V5     -0.04800404
      data.V6     -0.10165096

      coefficients
      5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
      (Intercept)  0.46613016
      data.V3     -0.04944529
      data.V4      0.02326772
      data.V5     -0.11362772
      data.V6     -0.06312848

     */
    val coefficientsRStd = Vectors.dense(-0.06980967, 0.10803933, -0.04800404, -0.10165096)
    val interceptRStd = 0.12707703
    val coefficientsR = Vectors.dense(-0.04944529, 0.02326772, -0.11362772, -0.06312848)
    val interceptR = 0.46613016

    assert(model1.intercept ~== interceptRStd relTol 1E-3)
    assert(model1.coefficients ~= coefficientsRStd relTol 1E-3)
    assert(model2.intercept ~== interceptR relTol 1E-3)
    assert(model2.coefficients ~= coefficientsR relTol 1E-3)
  }

  test("binary logistic regression without intercept with L2 regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(binaryDataset)
    val model2 = trainer2.fit(binaryDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0,
      lambda = 1.37, intercept=F, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0,
      lambda = 1.37, intercept=F, standardize=F))
      coefficientsStd
      5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
      (Intercept)  .
      data.V3     -0.06000152
      data.V4      0.12598737
      data.V5     -0.04669009
      data.V6     -0.09941025

      coefficients
      5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
      (Intercept)  .
      data.V3     -0.005482255
      data.V4      0.048106338
      data.V5     -0.093411640
      data.V6     -0.054149798

     */
    val coefficientsRStd = Vectors.dense(-0.06000152, 0.12598737, -0.04669009, -0.09941025)
    val coefficientsR = Vectors.dense(-0.005482255, 0.048106338, -0.093411640, -0.054149798)

    assert(model1.intercept ~== 0.0 absTol 1E-3)
    assert(model1.coefficients ~= coefficientsRStd relTol 1E-2)
    assert(model2.intercept ~== 0.0 absTol 1E-3)
    assert(model2.coefficients ~= coefficientsR relTol 1E-2)
  }

  test("binary logistic regression with intercept with ElasticNet regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(true).setMaxIter(200)
      .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(binaryDataset)
    val model2 = trainer2.fit(binaryDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0.38,
      lambda = 0.21, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0.38,
      lambda = 0.21, standardize=F))
      coefficientsStd
      5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
      (Intercept)  0.49991996
      data.V3     -0.04131110
      data.V4      .
      data.V5     -0.08585233
      data.V6     -0.15875400

      coefficients
      5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
      (Intercept)  0.5024256
      data.V3      .
      data.V4      .
      data.V5     -0.1846038
      data.V6     -0.0559614

     */
    val coefficientsRStd = Vectors.dense(-0.04131110, 0.0, -0.08585233, -0.15875400)
    val interceptRStd = 0.49991996
    val coefficientsR = Vectors.dense(0.0, 0.0, -0.1846038, -0.0559614)
    val interceptR = 0.5024256

    assert(model1.intercept ~== interceptRStd relTol 6E-3)
    assert(model1.coefficients ~== coefficientsRStd absTol 5E-3)
    assert(model2.intercept ~== interceptR relTol 6E-3)
    assert(model2.coefficients ~= coefficientsR absTol 1E-3)
  }

  test("binary logistic regression without intercept with ElasticNet regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(binaryDataset)
    val model2 = trainer2.fit(binaryDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0.38,
      lambda = 0.21, intercept=FALSE, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0.38,
      lambda = 0.21, intercept=FALSE, standardize=F))
      coefficientsStd
      5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
      (Intercept)  .
      data.V3      .
      data.V4      0.06859390
      data.V5     -0.07900058
      data.V6     -0.14684320

      coefficients
      5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
      (Intercept)  .
      data.V3      .
      data.V4      0.03060637
      data.V5     -0.11126742
      data.V6      .

     */
    val coefficientsRStd = Vectors.dense(0.0, 0.06859390, -0.07900058, -0.14684320)
    val coefficientsR = Vectors.dense(0.0, 0.03060637, -0.11126742, 0.0)

    assert(model1.intercept ~== 0.0 relTol 1E-3)
    assert(model1.coefficients ~= coefficientsRStd absTol 1E-2)
    assert(model2.intercept ~== 0.0 absTol 1E-3)
    assert(model2.coefficients ~= coefficientsR absTol 1E-3)
  }

  test("binary logistic regression with intercept with strong L1 regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight")
      .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(true)
    val trainer2 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight")
      .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(false)

    val model1 = trainer1.fit(binaryDataset)
    val model2 = trainer2.fit(binaryDataset)

    val histogram = binaryDataset.as[Instance].rdd.map { i => (i.label, i.weight)}
      .treeAggregate(new MultiClassSummarizer)(
        seqOp = (c, v) => (c, v) match {
          case (classSummarizer: MultiClassSummarizer, (label: Double, weight: Double)) =>
            classSummarizer.add(label, weight)
        },
        combOp = (c1, c2) => (c1, c2) match {
          case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) =>
            classSummarizer1.merge(classSummarizer2)
        }).histogram

    /*
       For binary logistic regression with strong L1 regularization, all the coefficients
       will be zeros. As a result,
       {{{
       P(0) = 1 / (1 + \exp(b)), and
       P(1) = \exp(b) / (1 + \exp(b))
       }}}, hence
       {{{
       b = \log{P(1) / P(0)} = \log{count_1 / count_0}
       }}}
     */
    val interceptTheory = math.log(histogram(1) / histogram(0))
    val coefficientsTheory = Vectors.dense(0.0, 0.0, 0.0, 0.0)

    assert(model1.intercept ~== interceptTheory relTol 1E-5)
    assert(model1.coefficients ~= coefficientsTheory absTol 1E-6)

    assert(model2.intercept ~== interceptTheory relTol 1E-5)
    assert(model2.coefficients ~= coefficientsTheory absTol 1E-6)

    /*
       Using the following R code to load the data and train the model using glmnet package.

       library("glmnet")
       data <- read.csv("path", header=FALSE)
       label = factor(data$V1)
       w = data$V2
       features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
       coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 1.0,
       lambda = 6.0))
       coefficients

       5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
       (Intercept) -0.2516986
       data.V3      0.0000000
       data.V4      .
       data.V5      .
       data.V6      .
     */
    val interceptR = -0.2516986
    val coefficientsR = Vectors.dense(0.0, 0.0, 0.0, 0.0)

    assert(model1.intercept ~== interceptR relTol 1E-5)
    assert(model1.coefficients ~== coefficientsR absTol 1E-6)
  }

  test("multinomial logistic regression with intercept with strong L1 regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight")
      .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(true)
    val trainer2 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight")
      .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(false)

    val sqlContext = multinomialDataset.sqlContext
    import sqlContext.implicits._
    val model1 = trainer1.fit(multinomialDataset)
    val model2 = trainer2.fit(multinomialDataset)

    val histogram = multinomialDataset.as[Instance].rdd.map(i => (i.label, i.weight))
      .treeAggregate(new MultiClassSummarizer)(
        seqOp = (c, v) => (c, v) match {
          case (classSummarizer: MultiClassSummarizer, (label: Double, weight: Double)) =>
            classSummarizer.add(label, weight)
        },
        combOp = (c1, c2) => (c1, c2) match {
          case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) =>
            classSummarizer1.merge(classSummarizer2)
        }).histogram
    val numFeatures = multinomialDataset.as[Instance].first().features.size
    val numClasses = histogram.length

    /*
       For multinomial logistic regression with strong L1 regularization, all the coefficients
       will be zeros. As a result, the intercepts will be proportional to the log counts in the
       histogram.
       {{{
         \exp(b_k) = count_k * \exp(\lambda)
         b_k = \log(count_k) * \lambda
       }}}
       \lambda is a free parameter, so choose the phase \lambda such that the
       mean is centered. This yields
       {{{
         b_k = \log(count_k)
         b_k' = b_k - \mean(b_k)
       }}}
     */
    val rawInterceptsTheory = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
    val rawMean = rawInterceptsTheory.sum / rawInterceptsTheory.length
    val interceptsTheory = Vectors.dense(rawInterceptsTheory.map(_ - rawMean))
    val coefficientsTheory = new DenseMatrix(numClasses, numFeatures,
      Array.fill[Double](numClasses * numFeatures)(0.0), isTransposed = true)

    assert(model1.interceptVector ~== interceptsTheory relTol 1E-3)
    assert(model1.coefficientMatrix ~= coefficientsTheory absTol 1E-6)

    assert(model2.interceptVector ~== interceptsTheory relTol 1E-3)
    assert(model2.coefficientMatrix ~= coefficientsTheory absTol 1E-6)
  }

  test("multinomial logistic regression with intercept without regularization") {

    val trainer1 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(multinomialDataset)
    val model2 = trainer2.fit(multinomialDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = as.factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficients = coef(glmnet(features, label, weights=w, family="multinomial",
      alpha = 0, lambda = 0))
      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
              -2.10320093
      data.V3  0.24337896
      data.V4 -0.05916156
      data.V5  0.14446790
      data.V6  0.35976165

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               0.3394473
      data.V3 -0.3443375
      data.V4  0.9181331
      data.V5 -0.2283959
      data.V6 -0.4388066

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               1.76375361
      data.V3  0.10095851
      data.V4 -0.85897154
      data.V5  0.08392798
      data.V6  0.07904499


     */
    val coefficientsR = new DenseMatrix(3, 4, Array(
      0.24337896, -0.05916156, 0.14446790, 0.35976165,
      -0.3443375, 0.9181331, -0.2283959, -0.4388066,
      0.10095851, -0.85897154, 0.08392798, 0.07904499), isTransposed = true)
    val interceptsR = Vectors.dense(-2.10320093, 0.3394473, 1.76375361)

    assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05)
    assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
    assert(model1.interceptVector ~== interceptsR relTol 0.05)
    assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
    assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
    assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
    assert(model2.interceptVector ~== interceptsR relTol 0.05)
    assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
  }

  test("multinomial logistic regression without intercept without regularization") {

    val trainer1 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(multinomialDataset)
    val model2 = trainer2.fit(multinomialDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = as.factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0,
      lambda = 0, intercept=F))
      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               .
      data.V3  0.07276291
      data.V4 -0.36325496
      data.V5  0.12015088
      data.V6  0.31397340

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               .
      data.V3 -0.3180040
      data.V4  0.9679074
      data.V5 -0.2252219
      data.V6 -0.4319914

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               .
      data.V3  0.2452411
      data.V4 -0.6046524
      data.V5  0.1050710
      data.V6  0.1180180


     */
    val coefficientsR = new DenseMatrix(3, 4, Array(
      0.07276291, -0.36325496, 0.12015088, 0.31397340,
      -0.3180040, 0.9679074, -0.2252219, -0.4319914,
      0.2452411, -0.6046524, 0.1050710, 0.1180180), isTransposed = true)

    assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05)
    assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
    assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
    assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
    assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
    assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
    assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
    assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
  }

  test("multinomial logistic regression with intercept with L1 regularization") {

    // use tighter constraints because OWL-QN solver takes longer to converge
    val trainer1 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true)
      .setMaxIter(300).setTol(1e-10).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false)
      .setMaxIter(300).setTol(1e-10).setWeightCol("weight")

    val model1 = trainer1.fit(multinomialDataset)
    val model2 = trainer2.fit(multinomialDataset)

    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = as.factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial",
      alpha = 1, lambda = 0.05, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 1,
      lambda = 0.05, standardize=F))
      coefficientsStd
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
              -0.62244703
      data.V3  .
      data.V4  .
      data.V5  .
      data.V6  0.08419825

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
              -0.2804845
      data.V3 -0.1336960
      data.V4  0.3717091
      data.V5 -0.1530363
      data.V6 -0.2035286

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               0.9029315
      data.V3  .
      data.V4 -0.4629737
      data.V5  .
      data.V6  .


      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
              -0.44215290
      data.V3  .
      data.V4  .
      data.V5  0.01767089
      data.V6  0.02542866

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               0.76308326
      data.V3 -0.06818576
      data.V4  .
      data.V5 -0.20446351
      data.V6 -0.13017924

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
              -0.3209304
      data.V3  .
      data.V4  .
      data.V5  .
      data.V6  .


     */
    val coefficientsRStd = new DenseMatrix(3, 4, Array(
      0.0, 0.0, 0.0, 0.08419825,
      -0.1336960, 0.3717091, -0.1530363, -0.2035286,
      0.0, -0.4629737, 0.0, 0.0), isTransposed = true)
    val interceptsRStd = Vectors.dense(-0.62244703, -0.2804845, 0.9029315)
    val coefficientsR = new DenseMatrix(3, 4, Array(
      0.0, 0.0, 0.01767089, 0.02542866,
      -0.06818576, 0.0, -0.20446351, -0.13017924,
      0.0, 0.0, 0.0, 0.0), isTransposed = true)
    val interceptsR = Vectors.dense(-0.44215290, 0.76308326, -0.3209304)

    assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.02)
    assert(model1.interceptVector ~== interceptsRStd relTol 0.1)
    assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
    assert(model2.coefficientMatrix ~== coefficientsR absTol 0.02)
    assert(model2.interceptVector ~== interceptsR relTol 0.1)
    assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
  }

  test("multinomial logistic regression without intercept with L1 regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(multinomialDataset)
    val model2 = trainer2.fit(multinomialDataset)
    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = as.factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 1,
      lambda = 0.05, intercept=F, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 1,
      lambda = 0.05, intercept=F, standardize=F))
      coefficientsStd
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
              .
      data.V3 .
      data.V4 .
      data.V5 .
      data.V6 0.01144225

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               .
      data.V3 -0.1678787
      data.V4  0.5385351
      data.V5 -0.1573039
      data.V6 -0.2471624

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
              s0
               .
      data.V3  .
      data.V4  .
      data.V5  .
      data.V6  .


      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
              s0
               .
      data.V3  .
      data.V4  .
      data.V5  .
      data.V6  .

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               .
      data.V3  .
      data.V4  0.1929409
      data.V5 -0.1889121
      data.V6 -0.1010413

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
              s0
               .
      data.V3  .
      data.V4  .
      data.V5  .
      data.V6  .


     */
    val coefficientsRStd = new DenseMatrix(3, 4, Array(
      0.0, 0.0, 0.0, 0.01144225,
      -0.1678787, 0.5385351, -0.1573039, -0.2471624,
      0.0, 0.0, 0.0, 0.0), isTransposed = true)

    val coefficientsR = new DenseMatrix(3, 4, Array(
      0.0, 0.0, 0.0, 0.0,
      0.0, 0.1929409, -0.1889121, -0.1010413,
      0.0, 0.0, 0.0, 0.0), isTransposed = true)

    assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
    assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
    assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
    assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
    assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
    assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
  }

  test("multinomial logistic regression with intercept with L2 regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(true)
      .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(multinomialDataset)
    val model2 = trainer2.fit(multinomialDataset)
    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = as.factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame( data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial",
      alpha = 0, lambda = 0.1, intercept=T, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0,
      lambda = 0.1, intercept=T, standardize=F))
      coefficientsStd
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                         s0
              -1.5898288335
      data.V3  0.1691226336
      data.V4  0.0002983651
      data.V5  0.1001732896
      data.V6  0.2554575585

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               0.2125746
      data.V3 -0.2304586
      data.V4  0.6153492
      data.V5 -0.1537017
      data.V6 -0.2975443

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               1.37725427
      data.V3  0.06133600
      data.V4 -0.61564761
      data.V5  0.05352840
      data.V6  0.04208671


      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
              -1.5681088
      data.V3  0.1508182
      data.V4  0.0121955
      data.V5  0.1217930
      data.V6  0.2162850

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               1.1217130
      data.V3 -0.2028984
      data.V4  0.2862431
      data.V5 -0.1843559
      data.V6 -0.2481218

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               0.44639579
      data.V3  0.05208012
      data.V4 -0.29843864
      data.V5  0.06256289
      data.V6  0.03183676


     */
    val coefficientsRStd = new DenseMatrix(3, 4, Array(
      0.1691226336, 0.0002983651, 0.1001732896, 0.2554575585,
      -0.2304586, 0.6153492, -0.1537017, -0.2975443,
      0.06133600, -0.61564761, 0.05352840, 0.04208671), isTransposed = true)
    val interceptsRStd = Vectors.dense(-1.5898288335, 0.2125746, 1.37725427)
    val coefficientsR = new DenseMatrix(3, 4, Array(
      0.1508182, 0.0121955, 0.1217930, 0.2162850,
      -0.2028984, 0.2862431, -0.1843559, -0.2481218,
      0.05208012, -0.29843864, 0.06256289, 0.03183676), isTransposed = true)
    val interceptsR = Vectors.dense(-1.5681088, 1.1217130, 0.44639579)

    assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.001)
    assert(model1.interceptVector ~== interceptsRStd relTol 0.05)
    assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
    assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
    assert(model2.interceptVector ~== interceptsR relTol 0.05)
    assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
  }

  test("multinomial logistic regression without intercept with L2 regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true).setWeightCol("weight")
    val trainer2 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false).setWeightCol("weight")

    val model1 = trainer1.fit(multinomialDataset)
    val model2 = trainer2.fit(multinomialDataset)
    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = as.factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0,
      lambda = 0.1, intercept=F, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0,
      lambda = 0.1, intercept=F, standardize=F))
      coefficientsStd
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               .
      data.V3  0.04048126
      data.V4 -0.23075758
      data.V5  0.08228864
      data.V6  0.22277648

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               .
      data.V3 -0.2149745
      data.V4  0.6478666
      data.V5 -0.1515158
      data.V6 -0.2930498

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               .
      data.V3  0.17449321
      data.V4 -0.41710901
      data.V5  0.06922716
      data.V6  0.07027332


      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                        s0
               .
      data.V3 -0.003949652
      data.V4 -0.142982415
      data.V5  0.091439598
      data.V6  0.179286241

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               .
      data.V3 -0.09071124
      data.V4  0.39752531
      data.V5 -0.16233832
      data.V6 -0.22206059

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               .
      data.V3  0.09466090
      data.V4 -0.25454290
      data.V5  0.07089872
      data.V6  0.04277435


     */
    val coefficientsRStd = new DenseMatrix(3, 4, Array(
      0.04048126, -0.23075758, 0.08228864, 0.22277648,
      -0.2149745, 0.6478666, -0.1515158, -0.2930498,
      0.17449321, -0.41710901, 0.06922716, 0.07027332), isTransposed = true)

    val coefficientsR = new DenseMatrix(3, 4, Array(
      -0.003949652, -0.142982415, 0.091439598, 0.179286241,
      -0.09071124, 0.39752531, -0.16233832, -0.22206059,
      0.09466090, -0.25454290, 0.07089872, 0.04277435), isTransposed = true)

    assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
    assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
    assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
    assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
    assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
    assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
  }

  test("multinomial logistic regression with intercept with elasticnet regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight")
      .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true)
      .setMaxIter(300).setTol(1e-10)
    val trainer2 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight")
      .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false)
      .setMaxIter(300).setTol(1e-10)

    val model1 = trainer1.fit(multinomialDataset)
    val model2 = trainer2.fit(multinomialDataset)
    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = as.factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0.5,
      lambda = 0.1, intercept=T, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0.5,
      lambda = 0.1, intercept=T, standardize=F))
      coefficientsStd
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
              -0.50133383
      data.V3  .
      data.V4  .
      data.V5  .
      data.V6  0.08351653

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
              -0.3151913
      data.V3 -0.1058702
      data.V4  0.3183251
      data.V5 -0.1212969
      data.V6 -0.1629778

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               0.8165252
      data.V3  .
      data.V4 -0.3943069
      data.V5  .
      data.V6  .


      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
              -0.38857157
      data.V3  .
      data.V4  .
      data.V5  0.02384198
      data.V6  0.03127749

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               0.62492165
      data.V3 -0.04949061
      data.V4  .
      data.V5 -0.18584462
      data.V6 -0.08952455

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
              -0.2363501
      data.V3  .
      data.V4  .
      data.V5  .
      data.V6  .


     */
    val coefficientsRStd = new DenseMatrix(3, 4, Array(
      0.0, 0.0, 0.0, 0.08351653,
      -0.1058702, 0.3183251, -0.1212969, -0.1629778,
      0.0, -0.3943069, 0.0, 0.0), isTransposed = true)
    val interceptsRStd = Vectors.dense(-0.50133383, -0.3151913, 0.8165252)
    val coefficientsR = new DenseMatrix(3, 4, Array(
      0.0, 0.0, 0.02384198, 0.03127749,
      -0.04949061, 0.0, -0.18584462, -0.08952455,
      0.0, 0.0, 0.0, 0.0), isTransposed = true)
    val interceptsR = Vectors.dense(-0.38857157, 0.62492165, -0.2363501)

    assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
    assert(model1.interceptVector ~== interceptsRStd absTol 0.01)
    assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
    assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
    assert(model2.interceptVector ~== interceptsR absTol 0.01)
    assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
  }

  test("multinomial logistic regression without intercept with elasticnet regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(false).setWeightCol("weight")
      .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true)
      .setMaxIter(300).setTol(1e-10)
    val trainer2 = (new LogisticRegression).setFitIntercept(false).setWeightCol("weight")
      .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false)
      .setMaxIter(300).setTol(1e-10)

    val model1 = trainer1.fit(multinomialDataset)
    val model2 = trainer2.fit(multinomialDataset)
    /*
      Use the following R code to load the data and train the model using glmnet package.

      library("glmnet")
      data <- read.csv("path", header=FALSE)
      label = as.factor(data$V1)
      w = data$V2
      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
      coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0.5,
      lambda = 0.1, intercept=F, standardize=T))
      coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0.5,
      lambda = 0.1, intercept=F, standardize=F))
      coefficientsStd
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
              .
      data.V3 .
      data.V4 .
      data.V5 .
      data.V6 0.03238285

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                      s0
               .
      data.V3 -0.1328284
      data.V4  0.4219321
      data.V5 -0.1247544
      data.V6 -0.1893318

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
              .
      data.V3 0.004572312
      data.V4 .
      data.V5 .
      data.V6 .


      coefficients
      $`0`
      5 x 1 sparse Matrix of class "dgCMatrix"
              s0
               .
      data.V3  .
      data.V4  .
      data.V5  .
      data.V6  .

      $`1`
      5 x 1 sparse Matrix of class "dgCMatrix"
                       s0
               .
      data.V3  .
      data.V4  0.14571623
      data.V5 -0.16456351
      data.V6 -0.05866264

      $`2`
      5 x 1 sparse Matrix of class "dgCMatrix"
              s0
               .
      data.V3  .
      data.V4  .
      data.V5  .
      data.V6  .


     */
    val coefficientsRStd = new DenseMatrix(3, 4, Array(
      0.0, 0.0, 0.0, 0.03238285,
      -0.1328284, 0.4219321, -0.1247544, -0.1893318,
      0.004572312, 0.0, 0.0, 0.0), isTransposed = true)

    val coefficientsR = new DenseMatrix(3, 4, Array(
      0.0, 0.0, 0.0, 0.0,
      0.0, 0.14571623, -0.16456351, -0.05866264,
      0.0, 0.0, 0.0, 0.0), isTransposed = true)

    assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
    assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
    assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
    assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
    assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
    assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
  }

  test("evaluate on test set") {
    // TODO: add for multiclass when model summary becomes available
    // Evaluate on test set should be same as that of the transformed training data.
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(1.0)
      .setThreshold(0.6)
    val model = lr.fit(smallBinaryDataset)
    val summary = model.summary.asInstanceOf[BinaryLogisticRegressionSummary]

    val sameSummary =
      model.evaluate(smallBinaryDataset).asInstanceOf[BinaryLogisticRegressionSummary]
    assert(summary.areaUnderROC === sameSummary.areaUnderROC)
    assert(summary.roc.collect() === sameSummary.roc.collect())
    assert(summary.pr.collect === sameSummary.pr.collect())
    assert(
      summary.fMeasureByThreshold.collect() === sameSummary.fMeasureByThreshold.collect())
    assert(summary.recallByThreshold.collect() === sameSummary.recallByThreshold.collect())
    assert(
      summary.precisionByThreshold.collect() === sameSummary.precisionByThreshold.collect())
  }

  test("evaluate with labels that are not doubles") {
    // Evaluate a test set with Label that is a numeric type other than Double
    val lr = new LogisticRegression()
      .setMaxIter(1)
      .setRegParam(1.0)
    val model = lr.fit(smallBinaryDataset)
    val summary = model.evaluate(smallBinaryDataset).asInstanceOf[BinaryLogisticRegressionSummary]

    val longLabelData = smallBinaryDataset.select(col(model.getLabelCol).cast(LongType),
      col(model.getFeaturesCol))
    val longSummary = model.evaluate(longLabelData).asInstanceOf[BinaryLogisticRegressionSummary]

    assert(summary.areaUnderROC === longSummary.areaUnderROC)
  }

  test("statistics on training data") {
    // Test that loss is monotonically decreasing.
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(1.0)
      .setThreshold(0.6)
    val model = lr.fit(smallBinaryDataset)
    assert(
      model.summary
        .objectiveHistory
        .sliding(2)
        .forall(x => x(0) >= x(1)))
  }

  test("logistic regression with sample weights") {
    def modelEquals(m1: LogisticRegressionModel, m2: LogisticRegressionModel): Unit = {
      assert(m1.coefficientMatrix ~== m2.coefficientMatrix absTol 0.05)
      assert(m1.interceptVector ~== m2.interceptVector absTol 0.05)
    }
    val testParams = Seq(
      ("binomial", smallBinaryDataset, 2),
      ("multinomial", smallMultinomialDataset, 3)
    )
    testParams.foreach { case (family, dataset, numClasses) =>
      val estimator = new LogisticRegression().setFamily(family)
      MLTestingUtils.testArbitrarilyScaledWeights[LogisticRegressionModel, LogisticRegression](
        dataset.as[LabeledPoint], estimator, modelEquals)
      MLTestingUtils.testOutliersWithSmallWeights[LogisticRegressionModel, LogisticRegression](
        dataset.as[LabeledPoint], estimator, numClasses, modelEquals, outlierRatio = 3)
      MLTestingUtils.testOversamplingVsWeighting[LogisticRegressionModel, LogisticRegression](
        dataset.as[LabeledPoint], estimator, modelEquals, seed)
    }
  }

  test("set family") {
    val lr = new LogisticRegression().setMaxIter(1)
    // don't set anything for binary classification
    val model1 = lr.fit(binaryDataset)
    assert(model1.coefficientMatrix.numRows === 1 && model1.coefficientMatrix.numCols === 4)
    assert(model1.interceptVector.size === 1)

    // set to multinomial for binary classification
    val model2 = lr.setFamily("multinomial").fit(binaryDataset)
    assert(model2.coefficientMatrix.numRows === 2 && model2.coefficientMatrix.numCols === 4)
    assert(model2.interceptVector.size === 2)

    // set to binary for binary classification
    val model3 = lr.setFamily("binomial").fit(binaryDataset)
    assert(model3.coefficientMatrix.numRows === 1 && model3.coefficientMatrix.numCols === 4)
    assert(model3.interceptVector.size === 1)

    // don't set anything for multiclass classification
    val mlr = new LogisticRegression().setMaxIter(1)
    val model4 = mlr.fit(multinomialDataset)
    assert(model4.coefficientMatrix.numRows === 3 && model4.coefficientMatrix.numCols === 4)
    assert(model4.interceptVector.size === 3)

    // set to binary for multiclass classification
    mlr.setFamily("binomial")
    val thrown = intercept[IllegalArgumentException] {
      mlr.fit(multinomialDataset)
    }
    assert(thrown.getMessage.contains("Binomial family only supports 1 or 2 outcome classes"))

    // set to multinomial for multiclass
    mlr.setFamily("multinomial")
    val model5 = mlr.fit(multinomialDataset)
    assert(model5.coefficientMatrix.numRows === 3 && model5.coefficientMatrix.numCols === 4)
    assert(model5.interceptVector.size === 3)
  }

  test("set initial model") {
    val lr = new LogisticRegression().setFamily("binomial")
    val model1 = lr.fit(smallBinaryDataset)
    val lr2 = new LogisticRegression().setInitialModel(model1).setMaxIter(5).setFamily("binomial")
    val model2 = lr2.fit(smallBinaryDataset)
    val predictions1 = model1.transform(smallBinaryDataset).select("prediction").collect()
    val predictions2 = model2.transform(smallBinaryDataset).select("prediction").collect()
    predictions1.zip(predictions2).foreach { case (Row(p1: Double), Row(p2: Double)) =>
      assert(p1 === p2)
    }
    assert(model2.summary.totalIterations === 1)

    val lr3 = new LogisticRegression().setFamily("multinomial")
    val model3 = lr3.fit(smallMultinomialDataset)
    val lr4 = new LogisticRegression()
      .setInitialModel(model3).setMaxIter(5).setFamily("multinomial")
    val model4 = lr4.fit(smallMultinomialDataset)
    val predictions3 = model3.transform(smallMultinomialDataset).select("prediction").collect()
    val predictions4 = model4.transform(smallMultinomialDataset).select("prediction").collect()
    predictions3.zip(predictions4).foreach { case (Row(p1: Double), Row(p2: Double)) =>
      assert(p1 === p2)
    }
    // TODO: check that it converges in a single iteration when model summary is available
  }

  test("binary logistic regression with all labels the same") {
    val sameLabels = smallBinaryDataset
      .withColumn("zeroLabel", lit(0.0))
      .withColumn("oneLabel", lit(1.0))

    // fitIntercept=true
    val lrIntercept = new LogisticRegression()
      .setFitIntercept(true)
      .setMaxIter(3)
      .setFamily("binomial")

    val allZeroInterceptModel = lrIntercept
      .setLabelCol("zeroLabel")
      .fit(sameLabels)
    assert(allZeroInterceptModel.coefficients ~== Vectors.dense(0.0) absTol 1E-3)
    assert(allZeroInterceptModel.intercept === Double.NegativeInfinity)
    assert(allZeroInterceptModel.summary.totalIterations === 0)

    val allOneInterceptModel = lrIntercept
      .setLabelCol("oneLabel")
      .fit(sameLabels)
    assert(allOneInterceptModel.coefficients ~== Vectors.dense(0.0) absTol 1E-3)
    assert(allOneInterceptModel.intercept === Double.PositiveInfinity)
    assert(allOneInterceptModel.summary.totalIterations === 0)

    // fitIntercept=false
    val lrNoIntercept = new LogisticRegression()
      .setFitIntercept(false)
      .setMaxIter(3)
      .setFamily("binomial")

    val allZeroNoInterceptModel = lrNoIntercept
      .setLabelCol("zeroLabel")
      .fit(sameLabels)
    assert(allZeroNoInterceptModel.intercept === 0.0)
    assert(allZeroNoInterceptModel.summary.totalIterations > 0)

    val allOneNoInterceptModel = lrNoIntercept
      .setLabelCol("oneLabel")
      .fit(sameLabels)
    assert(allOneNoInterceptModel.intercept === 0.0)
    assert(allOneNoInterceptModel.summary.totalIterations > 0)
  }

  test("multiclass logistic regression with all labels the same") {
    val constantData = Seq(
      LabeledPoint(4.0, Vectors.dense(0.0)),
      LabeledPoint(4.0, Vectors.dense(1.0)),
      LabeledPoint(4.0, Vectors.dense(2.0))).toDF()
    val mlr = new LogisticRegression().setFamily("multinomial")
    val model = mlr.fit(constantData)
    val results = model.transform(constantData)
    results.select("rawPrediction", "probability", "prediction").collect().foreach {
      case Row(raw: Vector, prob: Vector, pred: Double) =>
        assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity)))
        assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0)))
        assert(pred === 4.0)
    }

    // force the model to be trained with only one class
    val constantZeroData = Seq(
      LabeledPoint(0.0, Vectors.dense(0.0)),
      LabeledPoint(0.0, Vectors.dense(1.0)),
      LabeledPoint(0.0, Vectors.dense(2.0))).toDF()
    val modelZeroLabel = mlr.setFitIntercept(false).fit(constantZeroData)
    val resultsZero = modelZeroLabel.transform(constantZeroData)
    resultsZero.select("rawPrediction", "probability", "prediction").collect().foreach {
      case Row(raw: Vector, prob: Vector, pred: Double) =>
        assert(prob === Vectors.dense(Array(1.0)))
        assert(pred === 0.0)
    }

    // ensure that the correct value is predicted when numClasses passed through metadata
    val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata()
    val constantDataWithMetadata = constantData
      .select(constantData("label").as("label", labelMeta), constantData("features"))
    val modelWithMetadata = mlr.setFitIntercept(true).fit(constantDataWithMetadata)
    val resultsWithMetadata = modelWithMetadata.transform(constantDataWithMetadata)
    resultsWithMetadata.select("rawPrediction", "probability", "prediction").collect().foreach {
      case Row(raw: Vector, prob: Vector, pred: Double) =>
        assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity, 0.0)))
        assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0, 0.0)))
        assert(pred === 4.0)
    }
    // TODO: check num iters is zero when it become available in the model
  }

  test("compressed storage for constant label") {
    /*
      When the label is constant and fit intercept is true, all the coefficients will be
      zeros, and so the model coefficients should be stored as sparse data structures, except
      when the matrix dimensions are very small.
     */
    val moreClassesThanFeatures = Seq(
      LabeledPoint(4.0, Vectors.dense(Array.fill(5)(0.0))),
      LabeledPoint(4.0, Vectors.dense(Array.fill(5)(1.0))),
      LabeledPoint(4.0, Vectors.dense(Array.fill(5)(2.0)))).toDF()
    val mlr = new LogisticRegression().setFamily("multinomial").setFitIntercept(true)
    val model = mlr.fit(moreClassesThanFeatures)
    assert(model.coefficientMatrix.isInstanceOf[SparseMatrix])
    assert(model.coefficientMatrix.isColMajor)

    // in this case, it should be stored as row major
    val moreFeaturesThanClasses = Seq(
      LabeledPoint(1.0, Vectors.dense(Array.fill(5)(0.0))),
      LabeledPoint(1.0, Vectors.dense(Array.fill(5)(1.0))),
      LabeledPoint(1.0, Vectors.dense(Array.fill(5)(2.0)))).toDF()
    val model2 = mlr.fit(moreFeaturesThanClasses)
    assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
    assert(model2.coefficientMatrix.isRowMajor)

    val blr = new LogisticRegression().setFamily("binomial").setFitIntercept(true)
    val blrModel = blr.fit(moreFeaturesThanClasses)
    assert(blrModel.coefficientMatrix.isInstanceOf[SparseMatrix])
    assert(blrModel.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 2)
  }

  test("compressed coefficients") {

    val trainer1 = new LogisticRegression()
      .setRegParam(0.1)
      .setElasticNetParam(1.0)

    // compressed row major is optimal
    val model1 = trainer1.fit(multinomialDataset.limit(100))
    assert(model1.coefficientMatrix.isInstanceOf[SparseMatrix])
    assert(model1.coefficientMatrix.isRowMajor)

    // compressed column major is optimal since there are more classes than features
    val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata()
    val model2 = trainer1.fit(multinomialDataset
      .withColumn("label", col("label").as("label", labelMeta)).limit(100))
    assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
    assert(model2.coefficientMatrix.isColMajor)

    // coefficients are dense without L1 regularization
    val trainer2 = new LogisticRegression()
      .setElasticNetParam(0.0)
    val model3 = trainer2.fit(multinomialDataset.limit(100))
    assert(model3.coefficientMatrix.isInstanceOf[DenseMatrix])
  }

  test("numClasses specified in metadata/inferred") {
    val lr = new LogisticRegression().setMaxIter(1).setFamily("multinomial")

    // specify more classes than unique label values
    val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata()
    val df = smallMultinomialDataset.select(smallMultinomialDataset("label").as("label", labelMeta),
      smallMultinomialDataset("features"))
    val model1 = lr.fit(df)
    assert(model1.numClasses === 4)
    assert(model1.interceptVector.size === 4)

    // specify two classes when there are really three
    val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata()
    val df1 = smallMultinomialDataset
      .select(smallMultinomialDataset("label").as("label", labelMeta1),
        smallMultinomialDataset("features"))
    val thrown = intercept[IllegalArgumentException] {
      lr.fit(df1)
    }
    assert(thrown.getMessage.contains("less than the number of unique labels"))

    // lr should infer the number of classes if not specified
    val model3 = lr.fit(smallMultinomialDataset)
    assert(model3.numClasses === 3)
  }

  test("read/write") {
    def checkModelData(model: LogisticRegressionModel, model2: LogisticRegressionModel): Unit = {
      assert(model.intercept === model2.intercept)
      assert(model.coefficients.toArray === model2.coefficients.toArray)
      assert(model.numClasses === model2.numClasses)
      assert(model.numFeatures === model2.numFeatures)
    }
    val lr = new LogisticRegression()
    testEstimatorAndModelReadWrite(lr, smallBinaryDataset, LogisticRegressionSuite.allParamSettings,
      LogisticRegressionSuite.allParamSettings, checkModelData)
  }

  test("should support all NumericType labels and weights, and not support other types") {
    val lr = new LogisticRegression().setMaxIter(1)
    MLTestingUtils.checkNumericTypes[LogisticRegressionModel, LogisticRegression](
      lr, spark) { (expected, actual) =>
        assert(expected.intercept === actual.intercept)
        assert(expected.coefficients.toArray === actual.coefficients.toArray)
      }
  }
}

object LogisticRegressionSuite {

  /**
   * Mapping from all Params to valid settings which differ from the defaults.
   * This is useful for tests which need to exercise all Params, such as save/load.
   * This excludes input columns to simplify some tests.
   */
  val allParamSettings: Map[String, Any] = ProbabilisticClassifierSuite.allParamSettings ++ Map(
    "probabilityCol" -> "myProbability",
    "thresholds" -> Array(0.4, 0.6),
    "regParam" -> 0.01,
    "elasticNetParam" -> 0.1,
    "maxIter" -> 2,  // intentionally small
    "fitIntercept" -> true,
    "tol" -> 0.8,
    "standardization" -> false,
    "threshold" -> 0.6
  )

  def generateLogisticInputAsList(
    offset: Double,
    scale: Double,
    nPoints: Int,
    seed: Int): java.util.List[LabeledPoint] = {
    generateLogisticInput(offset, scale, nPoints, seed).asJava
  }

  // Generate input of the form Y = logistic(offset + scale*X)
  def generateLogisticInput(
      offset: Double,
      scale: Double,
      nPoints: Int,
      seed: Int): Seq[LabeledPoint] = {
    val rnd = new Random(seed)
    val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian())

    val y = (0 until nPoints).map { i =>
      val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i))))
      if (rnd.nextDouble() < p) 1.0 else 0.0
    }

    val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i)))))
    testData
  }

  /**
   * Generates `k` classes multinomial synthetic logistic input in `n` dimensional space given the
   * model weights and mean/variance of the features. The synthetic data will be drawn from
   * the probability distribution constructed by weights using the following formula.
   *
   * P(y = 0 | x) = 1 / norm
   * P(y = 1 | x) = exp(x * w_1) / norm
   * P(y = 2 | x) = exp(x * w_2) / norm
   * ...
   * P(y = k-1 | x) = exp(x * w_{k-1}) / norm
   * where norm = 1 + exp(x * w_1) + exp(x * w_2) + ... + exp(x * w_{k-1})
   *
   * @param weights matrix is flatten into a vector; as a result, the dimension of weights vector
   *                will be (k - 1) * (n + 1) if `addIntercept == true`, and
   *                if `addIntercept != true`, the dimension will be (k - 1) * n.
   * @param xMean the mean of the generated features. Lots of time, if the features are not properly
   *              standardized, the algorithm with poor implementation will have difficulty
   *              to converge.
   * @param xVariance the variance of the generated features.
   * @param addIntercept whether to add intercept.
   * @param nPoints the number of instance of generated data.
   * @param seed the seed for random generator. For consistent testing result, it will be fixed.
   */
  def generateMultinomialLogisticInput(
      weights: Array[Double],
      xMean: Array[Double],
      xVariance: Array[Double],
      addIntercept: Boolean,
      nPoints: Int,
      seed: Int): Seq[LabeledPoint] = {
    val rnd = new Random(seed)

    val xDim = xMean.length
    val xWithInterceptsDim = if (addIntercept) xDim + 1 else xDim
    val nClasses = weights.length / xWithInterceptsDim + 1

    val x = Array.fill[Vector](nPoints)(Vectors.dense(Array.fill[Double](xDim)(rnd.nextGaussian())))

    x.foreach { vector =>
      // This doesn't work if `vector` is a sparse vector.
      val vectorArray = vector.toArray
      var i = 0
      val len = vectorArray.length
      while (i < len) {
        vectorArray(i) = vectorArray(i) * math.sqrt(xVariance(i)) + xMean(i)
        i += 1
      }
    }

    val y = (0 until nPoints).map { idx =>
      val xArray = x(idx).toArray
      val margins = Array.ofDim[Double](nClasses)
      val probs = Array.ofDim[Double](nClasses)

      for (i <- 0 until nClasses - 1) {
        for (j <- 0 until xDim) margins(i + 1) += weights(i * xWithInterceptsDim + j) * xArray(j)
        if (addIntercept) margins(i + 1) += weights((i + 1) * xWithInterceptsDim - 1)
      }
      // Preventing the overflow when we compute the probability
      val maxMargin = margins.max
      if (maxMargin > 0) for (i <- 0 until nClasses) margins(i) -= maxMargin

      // Computing the probabilities for each class from the margins.
      val norm = {
        var temp = 0.0
        for (i <- 0 until nClasses) {
          probs(i) = math.exp(margins(i))
          temp += probs(i)
        }
        temp
      }
      for (i <- 0 until nClasses) probs(i) /= norm

      // Compute the cumulative probability so we can generate a random number and assign a label.
      for (i <- 1 until nClasses) probs(i) += probs(i - 1)
      val p = rnd.nextDouble()
      var y = 0
      breakable {
        for (i <- 0 until nClasses) {
          if (p < probs(i)) {
            y = i
            break
          }
        }
      }
      y
    }

    val testData = (0 until nPoints).map(i => LabeledPoint(y(i), x(i)))
    testData
  }
}