From 285a7798e267311730b0163d37d726a81465468a Mon Sep 17 00:00:00 2001 From: Vinayak Date: Fri, 13 Jan 2017 18:35:12 +0800 Subject: [SPARK-18687][PYSPARK][SQL] Backward compatibility - creating a DataFrame on a new SQLContext object fails with a Derby error Change is for SQLContext to reuse the active SparkSession during construction if the sparkContext supplied is the same as the currently active SparkContext. Without this change, a new SparkSession is instantiated that results in a Derby error when attempting to create a DataFrame using a new SQLContext object even though the SparkContext supplied to the new SQLContext is the same as the currently active one. Refer to https://issues.apache.org/jira/browse/SPARK-18687 for details on the error and a repro. Existing unit tests and a new unit test added to pyspark-sql: /python/run-tests --python-executables=python --modules=pyspark-sql Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Vinayak Author: Vinayak Joshi Closes #16119 from vijoshi/SPARK-18687_master. 
--- python/pyspark/sql/context.py | 2 +- python/pyspark/sql/tests.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'python') diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index de4c335ad2..c22f4b87e1 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -73,7 +73,7 @@ class SQLContext(object): self._jsc = self._sc._jsc self._jvm = self._sc._jvm if sparkSession is None: - sparkSession = SparkSession(sparkContext) + sparkSession = SparkSession.builder.getOrCreate() if jsqlContext is None: jsqlContext = sparkSession._jwrapped self.sparkSession = sparkSession diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index d1782857e6..a8250281da 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -47,7 +47,7 @@ else: import unittest from pyspark import SparkContext -from pyspark.sql import SparkSession, HiveContext, Column, Row +from pyspark.sql import SparkSession, SQLContext, HiveContext, Column, Row from pyspark.sql.types import * from pyspark.sql.types import UserDefinedType, _infer_type from pyspark.tests import ReusedPySparkTestCase, SparkSubmitTests @@ -206,6 +206,11 @@ class SQLTests(ReusedPySparkTestCase): cls.spark.stop() shutil.rmtree(cls.tempdir.name, ignore_errors=True) + def test_sqlcontext_reuses_sparksession(self): + sqlContext1 = SQLContext(self.sc) + sqlContext2 = SQLContext(self.sc) + self.assertTrue(sqlContext1.sparkSession is sqlContext2.sparkSession) + def test_row_should_be_read_only(self): row = Row(a=1, b=2) self.assertEqual(1, row.a) -- cgit v1.2.3