From 98534b2cf67efc3930de427bfd1916fcf315fa59 Mon Sep 17 00:00:00 2001
From: Paul Phillips
Date: Mon, 29 Oct 2012 17:17:43 -0700
Subject: SI-6584, Stream#distinct uses too much memory.

[backport]
Nesting recursive calls in Stream is always a dicey business.
---
 src/library/scala/collection/immutable/Stream.scala | 13 ++++++++++---
 test/files/run/t6584.check | 8 ++++++++
 test/files/run/t6584.scala | 16 ++++++++++++++++
 3 files changed, 34 insertions(+), 3 deletions(-)
 create mode 100644 test/files/run/t6584.check
 create mode 100644 test/files/run/t6584.scala

diff --git a/src/library/scala/collection/immutable/Stream.scala b/src/library/scala/collection/immutable/Stream.scala
index 1c461973e4..5bb4ef5f21 100644
--- a/src/library/scala/collection/immutable/Stream.scala
+++ b/src/library/scala/collection/immutable/Stream.scala
@@ -841,9 +841,16 @@ self =>
    * // produces: "1, 2, 3, 4, 5, 6"
    * }}}
    */
-  override def distinct: Stream[A] =
-    if (isEmpty) this
-    else cons(head, tail.filter(head != _).distinct)
+  override def distinct: Stream[A] = {
+    // This should use max memory proportional to N, whereas
+    // recursively calling distinct on the tail is N^2.
+    def loop(seen: Set[A], rest: Stream[A]): Stream[A] = {
+      if (rest.isEmpty) rest
+      else if (seen(rest.head)) loop(seen, rest.tail)
+      else cons(rest.head, loop(seen + rest.head, rest.tail))
+    }
+    loop(Set(), this)
+  }
 
   /** Returns a new sequence of given length containing the elements of this
    * sequence followed by zero or more occurrences of given elements.
diff --git a/test/files/run/t6584.check b/test/files/run/t6584.check
new file mode 100644
index 0000000000..35c8688751
--- /dev/null
+++ b/test/files/run/t6584.check
@@ -0,0 +1,8 @@
+Array: 102400
+Vector: 102400
+List: 102400
+Stream: 102400
+Array: 102400
+Vector: 102400
+List: 102400
+Stream: 102400
diff --git a/test/files/run/t6584.scala b/test/files/run/t6584.scala
new file mode 100644
index 0000000000..24c236ef35
--- /dev/null
+++ b/test/files/run/t6584.scala
@@ -0,0 +1,16 @@
+object Test {
+  def main(args: Array[String]): Unit = {
+    val size = 100 * 1024
+    val doubled = (1 to size) ++ (1 to size)
+
+    println("Array: " + Array.tabulate(size)(x => x).distinct.size)
+    println("Vector: " + Vector.tabulate(size)(x => x).distinct.size)
+    println("List: " + List.tabulate(size)(x => x).distinct.size)
+    println("Stream: " + Stream.tabulate(size)(x => x).distinct.size)
+
+    println("Array: " + doubled.toArray.distinct.size)
+    println("Vector: " + doubled.toVector.distinct.size)
+    println("List: " + doubled.toList.distinct.size)
+    println("Stream: " + doubled.toStream.distinct.size)
+  }
+}
--
cgit v1.2.3