From 54d13bed87fcf2968f77e1f1153e85184ec91d78 Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley"
Date: Fri, 25 Mar 2016 16:00:09 -0700
Subject: [SPARK-14159][ML] Fixed bug in StringIndexer + related issue in RFormula

## What changes were proposed in this pull request?

StringIndexerModel.transform sets the output column metadata to use name inputCol. It should not. Fixing this causes a problem with the metadata produced by RFormula.

Fix in RFormula: I added the StringIndexer columns to prefixesToRewrite, and I modified VectorAttributeRewriter to find and replace all "prefixes" since attributes collect multiple prefixes from StringIndexer + Interaction.

Note that "prefixes" is no longer accurate since internal strings may be replaced.

## How was this patch tested?

Unit test which failed before this fix.

Author: Joseph K. Bradley

Closes #11965 from jkbradley/StringIndexer-fix.
---
 .../org/apache/spark/ml/feature/StringIndexerSuite.scala | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'mllib/src/test/scala/org/apache')

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
index d40e69dced..2c3255ef33 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
@@ -210,4 +210,17 @@ class StringIndexerSuite
       .setLabels(Array("a", "b", "c"))
     testDefaultReadWrite(t)
   }
+
+  test("StringIndexer metadata") {
+    val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2)
+    val df = sqlContext.createDataFrame(data).toDF("id", "label")
+    val indexer = new StringIndexer()
+      .setInputCol("label")
+      .setOutputCol("labelIndex")
+      .fit(df)
+    val transformed = indexer.transform(df)
+    val attrs =
+      NominalAttribute.decodeStructField(transformed.schema("labelIndex"), preserveName = true)
+    assert(attrs.name.nonEmpty && attrs.name.get === "labelIndex")
+  }
 }
-- 
cgit v1.2.3