From ea6fb996f89e95eb889629d68184e84ea9ac8b74 Mon Sep 17 00:00:00 2001
From: Herve Yviquel
Date: Thu, 15 Sep 2016 13:12:57 -0300
Subject: [PATCH 01/14] Have Spark worker advertise public IP for web app
 (connected to #11)

---
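Notes:

    Spark uses SPARK_PUBLIC_DNS as the address it advertises in the links
    of the standalone web UI, so pointing it at the instance's public
    hostname makes the UI reachable from outside EC2. The value is a shell
    command substitution, re-evaluated whenever spark-env.sh is sourced,
    that queries the EC2 instance metadata service (only reachable from
    the instance itself). For reference, a minimal sketch of the same
    lookup from Python 2 (illustrative only, not part of this patch):

        import urllib2

        # Ask the instance metadata service for this node's public DNS name.
        public_hostname = urllib2.urlopen(
            'http://169.254.169.254/latest/meta-data/public-hostname',
            timeout=2 ).read( )
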
 spark/src/cgcloud/spark/spark_box.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spark/src/cgcloud/spark/spark_box.py b/spark/src/cgcloud/spark/spark_box.py
index 5360fc5..dc19e98 100644
--- a/spark/src/cgcloud/spark/spark_box.py
+++ b/spark/src/cgcloud/spark/spark_box.py
@@ -272,7 +272,8 @@ def __install_spark( self ):
             SPARK_LOCAL_DIRS=self._lazy_mkdir( spark_dir, "local" ),
             JAVA_HOME='/usr/lib/jvm/java-7-oracle',
             SPARK_MASTER_IP='spark-master',
-            HADOOP_CONF_DIR=fmt( "{install_dir}/hadoop/etc/hadoop" ) )
+            HADOOP_CONF_DIR=fmt( "{install_dir}/hadoop/etc/hadoop" ),
+            SPARK_PUBLIC_DNS="$(curl -s http://169.254.169.254/latest/meta-data/public-hostname)" )
         with remote_open( spark_env_sh_path, use_sudo=True ) as spark_env_sh:
             spark_env_sh.write( '\n' )
             for name, value in spark_env.iteritems( ):

From ace57662df041f9d20f6159851ef9d98c22152d4 Mon Sep 17 00:00:00 2001
From: Herve Yviquel
Date: Thu, 15 Sep 2016 13:05:26 -0300
Subject: [PATCH 02/14] update hadoop/spark version

---
 spark/src/cgcloud/spark/spark_box.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spark/src/cgcloud/spark/spark_box.py b/spark/src/cgcloud/spark/spark_box.py
index dc19e98..94bf000 100644
--- a/spark/src/cgcloud/spark/spark_box.py
+++ b/spark/src/cgcloud/spark/spark_box.py
@@ -25,10 +25,10 @@
 persistent_dir = '/mnt/persistent'
 var_dir = '/var/lib/sparkbox'
 hdfs_replication = 1
-hadoop_version = '2.6.0'
-spark_version = '1.5.2'
+hadoop_version = '2.7.3'
+spark_version = '2.0.0'
 # The major version of Hadoop that the Spark binaries were built against
-spark_hadoop_version = '2.6'
+spark_hadoop_version = '2.7'
 
 Service = namedtuple( 'Service', [
     'init_name',

From dae176049b5873bcf7791eac1740e5b1dec6a0b8 Mon Sep 17 00:00:00 2001
From: Herve Yviquel
Date: Fri, 14 Oct 2016 14:55:37 -0300
Subject: [PATCH 03/14] update spark version

---
 spark/src/cgcloud/spark/spark_box.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark/src/cgcloud/spark/spark_box.py b/spark/src/cgcloud/spark/spark_box.py
index 94bf000..680bc1e 100644
--- a/spark/src/cgcloud/spark/spark_box.py
+++ b/spark/src/cgcloud/spark/spark_box.py
@@ -26,7 +26,7 @@
 var_dir = '/var/lib/sparkbox'
 hdfs_replication = 1
 hadoop_version = '2.7.3'
-spark_version = '2.0.0'
+spark_version = '2.0.1'
 # The major version of Hadoop that the Spark binaries were built against
 spark_hadoop_version = '2.7'
 

From 86bf9a433d971530ce1224bc4af33280d85e372f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20Yviquel?=
Date: Mon, 23 Jan 2017 16:10:32 -0200
Subject: [PATCH 04/14] Add new Spark2Box class

---
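Notes:

    As written, the assignment in __init__ only binds a function-local
    name: it does not change the module-level spark_version that
    __install_spark reads, so this commit by itself does not select the
    Spark 2 tarball. The following commit reworks the versions as class
    attributes. A minimal sketch of the scoping pitfall (names are
    illustrative):

        version = '1.6.2'

        def switch( ):
            version = '2.1.0'  # binds a new local; the module-level name is untouched

        switch( )
        assert version == '1.6.2'
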
""" + hadoop_version = '2.6.0' + spark_version = '1.6.2' + # The major version of Hadoop that the Spark binaries were built against + spark_hadoop_version = '2.6' + @classmethod def get_role_options( cls ): return super( SparkBox, cls ).get_role_options( ) + [ @@ -190,7 +190,7 @@ def __ec2_keypair_name( self, ctx ): @fabric_task def __install_hadoop( self ): # Download and extract Hadoop - path = fmt( 'hadoop/common/hadoop-{hadoop_version}/hadoop-{hadoop_version}.tar.gz' ) + path = fmt( 'hadoop/common/hadoop-{self.hadoop_version}/hadoop-{self.hadoop_version}.tar.gz' ) self._install_apache_package( path, install_dir ) # Add environment variables to hadoop_env.sh @@ -259,7 +259,7 @@ def __to_hadoop_xml_config( properties ): @fabric_task def __install_spark( self ): # Download and extract Spark - path = fmt( 'spark/spark-{spark_version}/spark-{spark_version}-bin-hadoop{spark_hadoop_version}.tgz' ) + path = fmt( 'spark/spark-{self.spark_version}/spark-{self.spark_version}-bin-hadoop{self.spark_hadoop_version}.tgz' ) self._install_apache_package( path, install_dir ) spark_dir = var_dir + "/spark" @@ -436,9 +436,13 @@ class Spark2Box( SparkBox ): A node in a Spark v2.x cluster; used only to create an image for master and worker boxes """ + hadoop_version = '2.7.3' + spark_version = '2.1.0' + # The major version of Hadoop that the Spark binaries were built against + spark_hadoop_version = '2.7' + def __init__( self, ctx ): super( Spark2Box, self ).__init__( ctx ) - spark_version = spark2_version class SparkMaster( SparkBox, ClusterLeader ): """ From 2a6cb3b2475648ce60c63886083d23cf79f70d35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Yviquel?= Date: Mon, 23 Jan 2017 18:05:09 -0200 Subject: [PATCH 06/14] Add test for both spark versions --- spark/src/cgcloud/spark/test/test_spark.py | 28 ++++++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/spark/src/cgcloud/spark/test/test_spark.py b/spark/src/cgcloud/spark/test/test_spark.py index e292c51..aab0e89 100644 --- a/spark/src/cgcloud/spark/test/test_spark.py +++ b/spark/src/cgcloud/spark/test/test_spark.py @@ -4,24 +4,27 @@ import time import logging import unittest +from abc import ABCMeta from cgcloud.core.test import CoreTestCase -from cgcloud.spark.spark_box import install_dir, SparkBox, SparkMaster, SparkSlave +from cgcloud.spark.spark_box import install_dir, SparkBox, Spark2Box, SparkMaster, SparkSlave log = logging.getLogger( __name__ ) master = SparkMaster.role( ) slave = SparkSlave.role( ) -node = SparkBox.role( ) num_slaves = 2 -class SparkClusterTests( CoreTestCase ): +class BaseSparkClusterTests( CoreTestCase ): """ Covers the creation of a Spark cluster from scratch and running a simple Spark job on it. Also covers persistant HDFS between two cluster incarnations. 
""" + __metaclass__ = ABCMeta + + node = NotImplemented cleanup = True create_image = True @@ -30,12 +33,12 @@ def setUpClass( cls ): os.environ[ 'CGCLOUD_PLUGINS' ] = 'cgcloud.spark' super( SparkClusterTests, cls ).setUpClass( ) if cls.create_image: - cls._cgcloud( 'create', node, '-IT' ) + cls._cgcloud( 'create', self.node, '-IT' ) @classmethod def tearDownClass( cls ): if cls.cleanup and cls.create_image: - cls._cgcloud( 'delete-image', node ) + cls._cgcloud( 'delete-image', self.node ) super( SparkClusterTests, cls ).tearDownClass( ) def test_wordcount( self ): @@ -131,3 +134,18 @@ def word_count( ): def _delete_volumes( self ): pass + +class SparkClusterTests( BaseSparkClusterTests ): + """ + Covers the creation of a Spark v1.x cluster from scratch and running a simple Spark job on it. + Also covers persistant HDFS between two cluster incarnations. + """ + node = SparkBox.role( ) + + +class Spark2ClusterTests( BaseSparkClusterTests ): + """ + Covers the creation of a Spark v2.x cluster from scratch and running a simple Spark job on it. + Also covers persistant HDFS between two cluster incarnations. + """ + node = Spark2Box.role( ) From 016c7b291e485f62b69fdb650f5cd04992f081c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Yviquel?= Date: Mon, 23 Jan 2017 18:26:15 -0200 Subject: [PATCH 07/14] fix changed class name --- spark/src/cgcloud/spark/test/test_spark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark/src/cgcloud/spark/test/test_spark.py b/spark/src/cgcloud/spark/test/test_spark.py index aab0e89..41decca 100644 --- a/spark/src/cgcloud/spark/test/test_spark.py +++ b/spark/src/cgcloud/spark/test/test_spark.py @@ -31,7 +31,7 @@ class BaseSparkClusterTests( CoreTestCase ): @classmethod def setUpClass( cls ): os.environ[ 'CGCLOUD_PLUGINS' ] = 'cgcloud.spark' - super( SparkClusterTests, cls ).setUpClass( ) + super( BaseSparkClusterTests, cls ).setUpClass( ) if cls.create_image: cls._cgcloud( 'create', self.node, '-IT' ) @@ -39,7 +39,7 @@ def setUpClass( cls ): def tearDownClass( cls ): if cls.cleanup and cls.create_image: cls._cgcloud( 'delete-image', self.node ) - super( SparkClusterTests, cls ).tearDownClass( ) + super( BaseSparkClusterTests, cls ).tearDownClass( ) def test_wordcount( self ): self._create_cluster( ) From 3a0610066411345833e1e84fd3f26ab7bf1afbc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Yviquel?= Date: Mon, 23 Jan 2017 18:33:15 -0200 Subject: [PATCH 08/14] fix self/cls mistakes --- spark/src/cgcloud/spark/test/test_spark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark/src/cgcloud/spark/test/test_spark.py b/spark/src/cgcloud/spark/test/test_spark.py index 41decca..a657d33 100644 --- a/spark/src/cgcloud/spark/test/test_spark.py +++ b/spark/src/cgcloud/spark/test/test_spark.py @@ -33,12 +33,12 @@ def setUpClass( cls ): os.environ[ 'CGCLOUD_PLUGINS' ] = 'cgcloud.spark' super( BaseSparkClusterTests, cls ).setUpClass( ) if cls.create_image: - cls._cgcloud( 'create', self.node, '-IT' ) + cls._cgcloud( 'create', cls.node, '-IT' ) @classmethod def tearDownClass( cls ): if cls.cleanup and cls.create_image: - cls._cgcloud( 'delete-image', self.node ) + cls._cgcloud( 'delete-image', cls.node ) super( BaseSparkClusterTests, cls ).tearDownClass( ) def test_wordcount( self ): From 39627d97f9a474c4cc6772f77e223269299bf267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Yviquel?= Date: Tue, 24 Jan 2017 14:35:38 -0200 Subject: [PATCH 09/14] do not test abstract class --- 
 spark/src/cgcloud/spark/test/test_spark.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spark/src/cgcloud/spark/test/test_spark.py b/spark/src/cgcloud/spark/test/test_spark.py
index a657d33..ff630b6 100644
--- a/spark/src/cgcloud/spark/test/test_spark.py
+++ b/spark/src/cgcloud/spark/test/test_spark.py
@@ -23,6 +23,7 @@ class BaseSparkClusterTests( CoreTestCase ):
     Also covers persistant HDFS between two cluster incarnations.
     """
     __metaclass__ = ABCMeta
+    __test__ = False
 
     node = NotImplemented
     cleanup = True

From 1641103cdb7348a2c1183a16ad29091c947a2c47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20Yviquel?=
Date: Tue, 24 Jan 2017 17:56:20 -0200
Subject: [PATCH 10/14] Switch to SPARK_MASTER_HOST env variable since
 SPARK_MASTER_IP is deprecated (fix webgui problems)

---
 spark/src/cgcloud/spark/spark_box.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark/src/cgcloud/spark/spark_box.py b/spark/src/cgcloud/spark/spark_box.py
index 6ba2a2f..61acb38 100644
--- a/spark/src/cgcloud/spark/spark_box.py
+++ b/spark/src/cgcloud/spark/spark_box.py
@@ -272,7 +272,7 @@ def __install_spark( self ):
             SPARK_WORKER_DIR=self._lazy_mkdir( spark_dir, "work" ),
             SPARK_LOCAL_DIRS=self._lazy_mkdir( spark_dir, "local" ),
             JAVA_HOME='/usr/lib/jvm/java-8-oracle',
-            SPARK_MASTER_IP='spark-master',
+            SPARK_MASTER_HOST='spark-master',
             HADOOP_CONF_DIR=fmt( "{install_dir}/hadoop/etc/hadoop" ),
             SPARK_PUBLIC_DNS="$(curl -s http://169.254.169.254/latest/meta-data/public-hostname)" )
         with remote_open( spark_env_sh_path, use_sudo=True ) as spark_env_sh:

From a37cdb4b8134da05d1b5c8e7596f14143d5b3e34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20Yviquel?=
Date: Tue, 24 Jan 2017 18:55:41 -0200
Subject: [PATCH 11/14] Open webui ports for all ip (fixes #11)

---
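Notes:

    Port 8080 serves the standalone master web UI and 8081 the worker UI;
    4040 is the per-application UI, and Spark falls back to 4041, 4042 and
    so on when 4040 is taken, hence the 4040-4045 range. Note that
    cidr_ip='0.0.0.0/0' exposes these UIs to the whole internet; a more
    restrictive variant would substitute a trusted range, e.g. (the CIDR
    below is a hypothetical placeholder):

        webui_cidr = '203.0.113.0/24'  # a trusted network, not 0.0.0.0/0
        dict( ip_protocol='tcp', from_port=8080, to_port=8081,
              cidr_ip=webui_cidr )
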
""" + __test__ = True node = SparkBox.role( ) @@ -149,4 +150,5 @@ class Spark2ClusterTests( BaseSparkClusterTests ): Covers the creation of a Spark v2.x cluster from scratch and running a simple Spark job on it. Also covers persistant HDFS between two cluster incarnations. """ + __test__ = True node = Spark2Box.role( ) From 55555a2c901b5aef4147d840b0a86cb1d4726f4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Yviquel?= Date: Thu, 26 Jan 2017 13:45:43 -0200 Subject: [PATCH 13/14] try to fix abstract class architecture of Sparkbox test --- spark/src/cgcloud/spark/test/test_spark.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/spark/src/cgcloud/spark/test/test_spark.py b/spark/src/cgcloud/spark/test/test_spark.py index e835ddf..58b256a 100644 --- a/spark/src/cgcloud/spark/test/test_spark.py +++ b/spark/src/cgcloud/spark/test/test_spark.py @@ -23,14 +23,19 @@ class BaseSparkClusterTests( CoreTestCase ): Also covers persistant HDFS between two cluster incarnations. """ __metaclass__ = ABCMeta - __test__ = False node = NotImplemented cleanup = True create_image = True + @classmethod + @abc.abstractmethod + def initNode( cls ): + raise NotImplementedError("Abstract method") + @classmethod def setUpClass( cls ): + cls.initNode os.environ[ 'CGCLOUD_PLUGINS' ] = 'cgcloud.spark' super( BaseSparkClusterTests, cls ).setUpClass( ) if cls.create_image: @@ -42,6 +47,7 @@ def tearDownClass( cls ): cls._cgcloud( 'delete-image', cls.node ) super( BaseSparkClusterTests, cls ).tearDownClass( ) + def test_wordcount( self ): self._create_cluster( ) try: @@ -141,8 +147,10 @@ class SparkClusterTests( BaseSparkClusterTests ): Covers the creation of a Spark v1.x cluster from scratch and running a simple Spark job on it. Also covers persistant HDFS between two cluster incarnations. """ - __test__ = True - node = SparkBox.role( ) + + @classmethod + def initNode( cls ): + cls.node = SparkBox.role( ) class Spark2ClusterTests( BaseSparkClusterTests ): @@ -150,5 +158,7 @@ class Spark2ClusterTests( BaseSparkClusterTests ): Covers the creation of a Spark v2.x cluster from scratch and running a simple Spark job on it. Also covers persistant HDFS between two cluster incarnations. """ - __test__ = True - node = Spark2Box.role( ) + + @classmethod + def initNode( cls ): + cls.node = Spark2Box.role( ) From 1c3fe35f5e92ebe1d927a5ea372bef22cd4e6776 Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Thu, 27 Apr 2017 22:40:51 -0700 Subject: [PATCH 14/14] @abc.abstractmethod -> @abstractmethod --- spark/src/cgcloud/spark/test/test_spark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark/src/cgcloud/spark/test/test_spark.py b/spark/src/cgcloud/spark/test/test_spark.py index 58b256a..96adc48 100644 --- a/spark/src/cgcloud/spark/test/test_spark.py +++ b/spark/src/cgcloud/spark/test/test_spark.py @@ -4,7 +4,7 @@ import time import logging import unittest -from abc import ABCMeta +from abc import ABCMeta, abstractmethod from cgcloud.core.test import CoreTestCase from cgcloud.spark.spark_box import install_dir, SparkBox, Spark2Box, SparkMaster, SparkSlave @@ -29,7 +29,7 @@ class BaseSparkClusterTests( CoreTestCase ): create_image = True @classmethod - @abc.abstractmethod + @abstractmethod def initNode( cls ): raise NotImplementedError("Abstract method")