diff --git a/clipper_admin/clipper_admin/nomad/mgmt_deployment.py b/clipper_admin/clipper_admin/nomad/mgmt_deployment.py
index 5f3ca45a1..3ddc05dc0 100644
--- a/clipper_admin/clipper_admin/nomad/mgmt_deployment.py
+++ b/clipper_admin/clipper_admin/nomad/mgmt_deployment.py
@@ -6,57 +6,59 @@ def mgmt_job_prefix(cluster_name):
 
 """ Nomad payload to deploy a new mgmt """
 def mgmt_deployment(job_id, datacenters, cluster_name, image, redis_ip, redis_port, num_replicas):
-    job = { 'Job': {
-        'ID': job_id,
-        'Datacenters': datacenters,
-        'Type': 'service',
-        'TaskGroups': [
+    job = {
+        'Job':
             {
-                'Name': nomad_job_prefix(cluster_name),
-                'Count': num_replicas,
-                'Tasks': [
+                'ID': job_id,
+                'Datacenters': datacenters,
+                'Type': 'service',
+                'TaskGroups': [
                     {
-                        'Name': mgmt_job_prefix(cluster_name),
-                        'Driver': 'docker',
-                        'Config': {
-                            'args': [
-                                "--redis_ip={}".format(redis_ip or os.environ('REDIS_SERVICE_IP')), # If redis_service_host == None, default to env var
-                                "--redis_port={}".format(redis_port or os.environ('REDIS_SERVICE_PORT') or True)
-                            ],
-                            'image': image,
-                            'port_map': [
-                                {'http': 1338}
-                            ]
-                        },
-                        'Resources': {
-                            'CPU': 500,
-                            'MemoryMB': 256,
-                            'Networks': [
-                                {
-                                    'DynamicPorts': [{'Label': 'http', 'Value': 1338}]
-                                }
-                            ]
-                        },
-                        'Services': [
+                        'Name': nomad_job_prefix(cluster_name),
+                        'Count': num_replicas,
+                        'Tasks': [
                             {
-                                'Name': '{}-mgmt'.format(nomad_job_prefix(cluster_name)),
-                                'Tags': ['machine-learning', 'model', 'clipper', 'mgmt'],
-                                'PortLabel': 'http',
-                                'Checks': [
+                                'Name': mgmt_job_prefix(cluster_name),
+                                'Driver': 'docker',
+                                'Config': {
+                                    'args': [
+                                        "--redis_ip={}".format(redis_ip or os.environ.get('REDIS_SERVICE_IP')),  # fall back to the REDIS_SERVICE_IP env var when redis_ip is not given
+                                        "--redis_port={}".format(redis_port or os.environ.get('REDIS_SERVICE_PORT') or 6379)  # env var next, then Redis' default port
+                                    ],
+                                    'image': image,
+                                    'port_map': [
+                                        {'http': 1338}
+                                    ]
+                                },
+                                'Resources': {
+                                    'CPU': 500,
+                                    'MemoryMB': 256,
+                                    'Networks': [
+                                        {
+                                            'DynamicPorts': [{'Label': 'http', 'Value': 1338}]
+                                        }
+                                    ]
+                                },
+                                'Services': [
                                     {
-                                        'Name': 'alive',
-                                        'Type': 'tcp',
-                                        'interval': 1000000000000,
-                                        'timeout': 20000000000
+                                        'Name': '{}-mgmt'.format(nomad_job_prefix(cluster_name)),
+                                        'Tags': ['machine-learning', 'model', 'clipper', 'mgmt'],
+                                        'PortLabel': 'http',
+                                        'Checks': [
+                                            {
+                                                'Name': 'alive',
+                                                'Type': 'tcp',
+                                                'interval': 3000000000,  # 3 s, expressed in nanoseconds
+                                                'timeout': 2000000000  # 2 s, expressed in nanoseconds
+                                            }
+                                        ]
                                     }
                                 ]
                             }
                         ]
                     }
                 ]
-            }
-        ]
-    }
+        }
     }
     return job
 
diff --git a/clipper_admin/clipper_admin/nomad/model_deployment.py b/clipper_admin/clipper_admin/nomad/model_deployment.py
index 2bf42ab4d..8e8cb8837 100644
--- a/clipper_admin/clipper_admin/nomad/model_deployment.py
+++ b/clipper_admin/clipper_admin/nomad/model_deployment.py
@@ -55,8 +55,8 @@ def model_deployment(job_id, datacenters, cluster_name, name, version, input_typ
                                     {
                                         'Name': 'alive',
                                         'Type': 'tcp',
-                                        'interval': 1000000000000,
-                                        'timeout': 20000000000
+                                        'interval': 3000000000,  # 3 s, expressed in nanoseconds
+                                        'timeout': 2000000000  # 2 s, expressed in nanoseconds
                                     }
                                 ]
                             }
diff --git a/clipper_admin/clipper_admin/nomad/nomad_container_manager.py b/clipper_admin/clipper_admin/nomad/nomad_container_manager.py
index e3539aa92..7bcab6100 100644
--- a/clipper_admin/clipper_admin/nomad/nomad_container_manager.py
+++ b/clipper_admin/clipper_admin/nomad/nomad_container_manager.py
@@ -241,7 +241,7 @@ def connect(self):
 
     def deploy_model(self, name, version, input_type, image, num_replicas=1):
         check_name = model_check_name(self.cluster_name, name, version)
-        job_id = 'clipper-model-{}-{}'.format(name, version)
+        job_id = '{}-{}-{}'.format(model_job_prefix(self.cluster_name), name, version)
         self.nomad.job.register_job(
             job_id,
             model_deployment(job_id, self.datacenters, self.cluster_name, name, version, input_type, image, num_replicas)
@@ -287,7 +287,9 @@ def stop_models(self, models):
                 raise e
 
     def stop_all_model_containers(self):
+        self.logger.debug('model job prefix %s', model_job_prefix(self.cluster_name))
         jobs = self.nomad.jobs.get_jobs(prefix=model_job_prefix(self.cluster_name))
+        self.logger.debug('jobs: %s', jobs)
         for job in jobs:
             self.logger.warning('nomad job below')
             self.logger.warning(job)
@@ -315,8 +317,7 @@ def get_query_addr(self):
             query_ip, query_port= self.dns.resolveSRV(check_name)
             self.query_ip = query_ip
             self.query_port = query_port
-            #return '{}:{}'.format(query_ip, query_port)
-            return '10.65.30.43:29293'
+            return '{}:{}'.format(query_ip, query_port)
         except NXDOMAIN:
             return ''
 