diff --git a/api/coalition.py b/api/coalition.py index ad28901..735fe45 100644 --- a/api/coalition.py +++ b/api/coalition.py @@ -4,231 +4,231 @@ class CoalitionError(Exception): - pass + pass class Connection(object): - """A connection to the coalition server. - - :param str host: The coalition server hostname. - :param int port: The coalition server port. - """ - - def __init__(self, host='localhost', port=19211): - """Setup http connection.""" - self.IntoWith = False - self._Conn = httplib.HTTPConnection (host, port) - - def _send (self, method, command, params=None): - """Send message to server. - - :param str method: Http request method between "GET", "PUT", "POST" and "DELETE". - :param str command: REST api URL. - :param str params: Optional parameters. - :return: A string or an error. - :rtype: str or :class:CoalitionError - """ - - if params: - params = json.dumps (params) - headers = {'Content-Type': 'application/json'} - self._Conn.request (method, command, params, headers) - res = self._Conn.getresponse() - if res.status == 200: - return res.read () - else: - raise CoalitionError (res.read()) - - def newJob(self, parent=0, title='', command='', dir='', environment='', - state="WAITING", paused=False, priority=1000, timeout=0, - affinity='', user='', progress_pattern='', dependencies=[]): - """Create a :class:`Job`. - - :param int parent: The parent :class:`Job` id. - :param str title: The :class:`Job` title. - :param str command: The :class:`Job` command, or an empty string for a parent node. - :param str dir: The :class:`Job` directory. This is the current directory when the :class:`Job` is run. - :param str environment: The :class:`Job` environment variables. - :param str state: The :class:`Job` initial state. It must be "WAITING" or "PAUSED". If the state is "WAITING", the :class:`Job` will start as soon as possible. If the state is "PAUSED", the :class:`Job` won't start until it is started or reset. - :param int priority: The :class:`Job` priority. 
For a given :class:`Job` hierarchy level, the :class:`Job` with the biggest priority is taken first. - :param int timeout: The maximum duration a :class:`Job` run can take in seconds. If timeout=0, no limit on the :class:`Job` run. - :param str affinity: The :class:`Job` affinity string. Affinities are coma separated keywords. To run a :class:`Job`, the worker affinities must match all the :class:`Job` affinities. - :param str user: The :class:`Job` user name. - :param str progress_pattern: A regexp pattern which filters the logs and return the progression. The pattern must include a '%percent' or a '%one' keyword. - :param list(int) dependencies: The :class:`Job` ids on which the new :class:`Job` has dependencies. The :class:`Job` will run when the dependency jobs have been completed without error. - :return: The :class:`Job` id. - :rtype: int - """ - - params = locals().copy () - del params['self'] - res = self._send ("PUT", '/api/jobs', params) - return int(res) - - def getJob (self, id): - """Get a :class:`Job` instance. - - :param int id: The id of the :class:`Job`. - :return: A :class:`Job` instance. - :rtype: :class:`Job` - """ - - res = self._send('GET', '/api/jobs/' + str(id)) - return Job (json.loads(res), self) - - def getJobChildren (self, id): - """Get :class:`Job` children instances. - - :param int id: The parent :class:`Job` id. - :return: The list of children :class:`Job` instances. - :rtype: list(:class:`Job`) - """ - - res = self._send('GET', '/api/jobs/{id}/children'.format(id=id)) - return [Job(r, self) for r in json.loads(res)] - - def getJobDependencies (self, id): - """Get the :class:`Job` dependencies. - Alternatively, the dependencies attribute of a :class:`Job` contains the list - of dependent jobs ids. - - :param str id: The :class:`Job` id having dependencies. - :return: The :class:`Job` instances on which the :class:`Job` has dependencies. 
- :rtype: list(:class:`Job`) - """ - - res = self._send('GET', '/api/jobs/{}/dependencies'.format(id)) - return [Job(r, self) for r in json.loads(res)] - - def setJobDependencies (self, id, ids): - '''Set the :class:`Job` objects on which a job has a dependency. - Alternatively, one can set the dependencies attribute of a Job. - - :param id int: the id of the job with dependencies - :param ids [int]: the list of job.id (int) on which the job depends - ''' - res = self._send ("POST", '/api/jobs/'+str(id)+'/dependencies', ids) - return res - - def setAffinities( self, data ): - '''Set the affinities. - Affinities need to be set before they can be assigned to :class:`Job` or Worker. - - :param data: a dictionnary of affinities - ''' - res = self._send( "POST", "/api/affinities", data ) - return res - - def getAffinities( self ): - '''Get the affinities. - Affinities need to be set before they can be assigned to :class:`Job` or Worker. - - :param data: a dictionnary of affinities - ''' - - res = self._send( "GET", "/api/affinities" ) - res = json.loads( res ) - return res - - def getWorkers ( self ): - '''Returns the :class:`Worker` objects. - Workers are identified by an index. - - :rtype: the list of :class:`Worker` objects. - ''' - - res = self._send ("GET", '/api/workers') - res = json.loads( res ) - return res - - def editWorkers( self, workers ): - '''Set the :class:`Worker` objects. - All the workers' attributes are updated. - - :param data: a dictionnary of workers. - ''' - - res = self._send( "POST", '/api/workers', workers ) - return res - - def __enter__(self): - self.Jobs = {} - self.Workers = {} - self.IntoWith = True - - def __exit__(self, type, value, traceback): - self.IntoWith = False - - # Convert an object in dict - def convobj (o): - d = o.__dict__.copy() - del d['Conn'] - return d + """A connection to the coalition server. + + :param str host: The coalition server hostname. + :param int port: The coalition server port. 
+ """ + + def __init__(self, host='localhost', port=19211): + """Setup http connection.""" + self.IntoWith = False + self._Conn = httplib.HTTPConnection (host, port) + + def _send (self, method, command, params=None): + """Send message to server. + + :param str method: Http request method between "GET", "PUT", "POST" and "DELETE". + :param str command: REST api URL. + :param str params: Optional parameters. + :return: A string or an error. + :rtype: str or :class:CoalitionError + """ + + if params: + params = json.dumps (params) + headers = {'Content-Type': 'application/json'} + self._Conn.request (method, command, params, headers) + res = self._Conn.getresponse() + if res.status == 200: + return res.read () + else: + raise CoalitionError (res.read()) + + def newJob(self, parent=0, title='', command='', dir='', environment='', + state="WAITING", paused=False, priority=1000, timeout=0, + affinity='', user='', progress_pattern='', dependencies=[]): + """Create a :class:`Job`. + + :param int parent: The parent :class:`Job` id. + :param str title: The :class:`Job` title. + :param str command: The :class:`Job` command, or an empty string for a parent node. + :param str dir: The :class:`Job` directory. This is the current directory when the :class:`Job` is run. + :param str environment: The :class:`Job` environment variables. + :param str state: The :class:`Job` initial state. It must be "WAITING" or "PAUSED". If the state is "WAITING", the :class:`Job` will start as soon as possible. If the state is "PAUSED", the :class:`Job` won't start until it is started or reset. + :param int priority: The :class:`Job` priority. For a given :class:`Job` hierarchy level, the :class:`Job` with the biggest priority is taken first. + :param int timeout: The maximum duration a :class:`Job` run can take in seconds. If timeout=0, no limit on the :class:`Job` run. + :param str affinity: The :class:`Job` affinity string. Affinities are coma separated keywords. 
To run a :class:`Job`, the worker affinities must match all the :class:`Job` affinities. + :param str user: The :class:`Job` user name. + :param str progress_pattern: A regexp pattern which filters the logs and return the progression. The pattern must include a '%percent' or a '%one' keyword. + :param list(int) dependencies: The :class:`Job` ids on which the new :class:`Job` has dependencies. The :class:`Job` will run when the dependency jobs have been completed without error. + :return: The :class:`Job` id. + :rtype: int + """ + + params = locals().copy () + del params['self'] + res = self._send ("PUT", '/api/jobs', params) + return int(res) + + def getJob (self, id): + """Get a :class:`Job` instance. + + :param int id: The id of the :class:`Job`. + :return: A :class:`Job` instance. + :rtype: :class:`Job` + """ + + res = self._send('GET', '/api/jobs/' + str(id)) + return Job (json.loads(res), self) + + def getJobChildren (self, id): + """Get :class:`Job` children instances. + + :param int id: The parent :class:`Job` id. + :return: The list of children :class:`Job` instances. + :rtype: list(:class:`Job`) + """ + + res = self._send('GET', '/api/jobs/{id}/children'.format(id=id)) + return [Job(r, self) for r in json.loads(res)] + + def getJobDependencies (self, id): + """Get the :class:`Job` dependencies. + Alternatively, the dependencies attribute of a :class:`Job` contains the list + of dependent jobs ids. + + :param str id: The :class:`Job` id having dependencies. + :return: The :class:`Job` instances on which the :class:`Job` has dependencies. + :rtype: list(:class:`Job`) + """ + + res = self._send('GET', '/api/jobs/{}/dependencies'.format(id)) + return [Job(r, self) for r in json.loads(res)] + + def setJobDependencies (self, id, ids): + '''Set the :class:`Job` objects on which a job has a dependency. + Alternatively, one can set the dependencies attribute of a Job. 
+ + :param id int: the id of the job with dependencies + :param ids [int]: the list of job.id (int) on which the job depends + ''' + res = self._send ("POST", '/api/jobs/'+str(id)+'/dependencies', ids) + return res + + def setAffinities( self, data ): + '''Set the affinities. + Affinities need to be set before they can be assigned to :class:`Job` or Worker. + + :param data: a dictionnary of affinities + ''' + res = self._send( "POST", "/api/affinities", data ) + return res + + def getAffinities( self ): + '''Get the affinities. + Affinities need to be set before they can be assigned to :class:`Job` or Worker. + + :param data: a dictionnary of affinities + ''' + + res = self._send( "GET", "/api/affinities" ) + res = json.loads( res ) + return res + + def getWorkers ( self ): + '''Returns the :class:`Worker` objects. + Workers are identified by an index. + + :rtype: the list of :class:`Worker` objects. + ''' + + res = self._send ("GET", '/api/workers') + res = json.loads( res ) + return res + + def editWorkers( self, workers ): + '''Set the :class:`Worker` objects. + All the workers' attributes are updated. + + :param data: a dictionnary of workers. + ''' + + res = self._send( "POST", '/api/workers', workers ) + return res + + def __enter__(self): + self.Jobs = {} + self.Workers = {} + self.IntoWith = True - if not isinstance(value, TypeError): - if len(self.Jobs) > 0: - self._send ("POST", '/api/jobs', self.Jobs) - if len(self.Workers) > 0: - self._send ("POST", '/api/workers', self.Workers) + def __exit__(self, type, value, traceback): + self.IntoWith = False + + # Convert an object in dict + def convobj (o): + d = o.__dict__.copy() + del d['Conn'] + return d + + if not isinstance(value, TypeError): + if len(self.Jobs) > 0: + self._send ("POST", '/api/jobs', self.Jobs) + if len(self.Workers) > 0: + self._send ("POST", '/api/workers', self.Workers) class Job(object): - '''A job object returned by the :class:`Connection`. Don't create such objects yourself. 
- Job properties should be modified into a Connection with block. Don't modify the id or the state properties directly. - ''' - - def __init__ (self, d, conn): - assert (conn) - self.Conn = False - self.__dict__.update (d) - self.Conn = conn - """:var int id: the job id - :var int parent: the parent job id - :var str title: the job title - :var str command: the job command to execute, or an empty string if the job is a parent node. - :var str dir: the job working directory - :var str environment: the job environment - :var str state: the job state. It can be "WAITING", "PAUSED", "WORKING", "PENDING", "FINISHED" or "ERROR" - :var str paused: the job is paused, which is an alias for state == "PAUSED". - :var str worker: the last worker name who took the job - :var int start_time: the job start time (in seconds after epoch) - :var int duration: the job duration (in seconds) - :var int ping_time: the last time a worker ping on this job (in seconds after epoch) - :var int run_done: number of run done on this job - :var int timeout: maximum duration a job run can take in seconds. If timeout=0, no limit on the job run. - :var int priority: the job priority. For a given job hierarchy level, the job with the biggest priority is taken first. - :var str affinity: the job affinity string. Affinities are coma separated keywords. To run a job, the worker affinities must match all the job affinities. - :var str user: the job user name. - :var int finished: number of finished children jobs. For parent node only. - :var int errors: number of faulty children jobs. For parent node only. - :var int working: number of working children jobs. For parent node only. - :var int total: number of total (grand)children jobs. For parent node only. - :var int total_finished: number of finished (grand)children jobs. For parent node only. - :var int total_errors: number of faulty (grand)children jobs. For parent node only. - :var int total_working: number of working (grand)children jobs. 
For parent node only. - :var array dependencies: the ids of jobs this job is dependent on. - :var str url: an URL to the job result. If available, a link to this URL will be shown in the interface. - :var float progress: the job progression between 0 and 1. - :var str progress_pattern: a regexp pattern which filters the logs and return the progression. The pattern must include a '%percent' or a '%one' keyword. - """ - - def __setattr__(self, attr, value): - if attr != "Conn" and self.Conn: - if self.Conn.IntoWith: - w = self.Conn.Jobs.get(self.id) - if not w: - w = {} - self.Conn.Jobs[self.id] = w - w[attr] = value - else: - raise CoalitionError("Can't write attributes outside a connection block") - super(Job, self).__setattr__(attr, value) + '''A job object returned by the :class:`Connection`. Don't create such objects yourself. + Job properties should be modified into a Connection with block. Don't modify the id or the state properties directly. + ''' + + def __init__ (self, d, conn): + assert (conn) + self.Conn = False + self.__dict__.update (d) + self.Conn = conn + """:var int id: the job id + :var int parent: the parent job id + :var str title: the job title + :var str command: the job command to execute, or an empty string if the job is a parent node. + :var str dir: the job working directory + :var str environment: the job environment + :var str state: the job state. It can be "WAITING", "PAUSED", "WORKING", "PENDING", "FINISHED" or "ERROR" + :var str paused: the job is paused, which is an alias for state == "PAUSED". + :var str worker: the last worker name who took the job + :var int start_time: the job start time (in seconds after epoch) + :var int duration: the job duration (in seconds) + :var int ping_time: the last time a worker ping on this job (in seconds after epoch) + :var int run_done: number of run done on this job + :var int timeout: maximum duration a job run can take in seconds. If timeout=0, no limit on the job run. 
+ :var int priority: the job priority. For a given job hierarchy level, the job with the biggest priority is taken first. + :var str affinity: the job affinity string. Affinities are coma separated keywords. To run a job, the worker affinities must match all the job affinities. + :var str user: the job user name. + :var int finished: number of finished children jobs. For parent node only. + :var int errors: number of faulty children jobs. For parent node only. + :var int working: number of working children jobs. For parent node only. + :var int total: number of total (grand)children jobs. For parent node only. + :var int total_finished: number of finished (grand)children jobs. For parent node only. + :var int total_errors: number of faulty (grand)children jobs. For parent node only. + :var int total_working: number of working (grand)children jobs. For parent node only. + :var array dependencies: the ids of jobs this job is dependent on. + :var str url: an URL to the job result. If available, a link to this URL will be shown in the interface. + :var float progress: the job progression between 0 and 1. + :var str progress_pattern: a regexp pattern which filters the logs and return the progression. The pattern must include a '%percent' or a '%one' keyword. + """ + + def __setattr__(self, attr, value): + if attr != "Conn" and self.Conn: + if self.Conn.IntoWith: + w = self.Conn.Jobs.get(self.id) + if not w: + w = {} + self.Conn.Jobs[self.id] = w + w[attr] = value + else: + raise CoalitionError("Can't write attributes outside a connection block") + super(Job, self).__setattr__(attr, value) class CoalitionError(Exception): - pass + pass # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/cloud/aws.py b/cloud/aws.py index dbbe4de..a114ba2 100644 --- a/cloud/aws.py +++ b/cloud/aws.py @@ -13,88 +13,88 @@ def startInstance(name, config): - """ - Run the aws command to start a worker instance. 
- Return the created instanceid in case of dedicated ec2 instance or the spotinstancerequestid - in case of a spot instance. - """ - - if config.get("worker", "spot"): - cmd = ["aws", "ec2", "request-spot-instances", - "--spot-price", config.get("spot", "spotprice"), - "--instance-count", config.get("spot", "instancecount"), - "--type", config.get("spot", "type"), - "--launch-specification", _getLaunchSpecification(name, config),] - else: - cmd = ["aws", "ec2", "run-instances", - "--key-name", config.get("authentication", "keyname"), - "--image-id", config.get("worker", "imageid"), - "--instance-type", config.get("worker", "instancetype"), - "--subnet-id", config.get("worker", "subnetid"), - "--security-group-ids", - config.get("worker", "securitygroupid"), - "--iam-instance-profile", - "Arn=%s" % config.get("worker", "iaminstanceprofile"), - "--user-data", _getUserData(name, config),] - common._run_or_none(cmd) + """ + Run the aws command to start a worker instance. + Return the created instanceid in case of dedicated ec2 instance or the spotinstancerequestid + in case of a spot instance. 
+ """ + + if config.get("worker", "spot"): + cmd = ["aws", "ec2", "request-spot-instances", + "--spot-price", config.get("spot", "spotprice"), + "--instance-count", config.get("spot", "instancecount"), + "--type", config.get("spot", "type"), + "--launch-specification", _getLaunchSpecification(name, config),] + else: + cmd = ["aws", "ec2", "run-instances", + "--key-name", config.get("authentication", "keyname"), + "--image-id", config.get("worker", "imageid"), + "--instance-type", config.get("worker", "instancetype"), + "--subnet-id", config.get("worker", "subnetid"), + "--security-group-ids", + config.get("worker", "securitygroupid"), + "--iam-instance-profile", + "Arn=%s" % config.get("worker", "iaminstanceprofile"), + "--user-data", _getUserData(name, config),] + common._run_or_none(cmd) def stopInstance(name, config): - """Run the aws command to terminate the instance.""" - cmd = ["aws", "ec2", "terminate-instances", "--instance-ids", - _getInstanceIdByName(name)] - common._run_or_none(cmd) + """Run the aws command to terminate the instance.""" + cmd = ["aws", "ec2", "terminate-instances", "--instance-ids", + _getInstanceIdByName(name)] + common._run_or_none(cmd) def _getLaunchSpecification(name, config): - with open("cloud/aws_worker_spot_launchspecification.json.template", 'r') as f: - template = Template(f.read()) - values = { - "image_id": config.get("worker", "imageid"), - "keyname": config.get("authentication", "keyname"), - "security_group_id": config.get("worker", "securitygroupid"), - "instance_type": config.get("worker", "instancetype"), - "user_data": encodestring(_getUserData(name, config)), } - return template.substitute(values).replace('\n', '') + with open("cloud/aws_worker_spot_launchspecification.json.template", 'r') as f: + template = Template(f.read()) + values = { + "image_id": config.get("worker", "imageid"), + "keyname": config.get("authentication", "keyname"), + "security_group_id": config.get("worker", "securitygroupid"), + 
"instance_type": config.get("worker", "instancetype"), + "user_data": encodestring(_getUserData(name, config)), } + return template.substitute(values).replace('\n', '') def _getUserData(name, config): - """ - Prepare the user-data script in cloud-init syntax. - Return the script as a string. - """ - - with open("cloud/aws_worker_cloud_init.template", 'r') as f: - template = Template(f.read()) - values = { - "hostname": name, - "region": config.get("authentication", "region"), - "access_key": config.get("authentication", "accesskey"), - "secret_access_key": - config.get("authentication", "secretaccesskey"), - "bucket_name": config.get("storage", "name"), - "mount_point": config.get("storage", "mountpoint"), - "guerilla_render_filename": - config.get("storage", "guerillarenderfilename"), - "coalition_filename": - config.get("storage", "coalitionfilename"), - "coalition_server_ip": config.get("coalition", "ip"), - "coalition_server_port": config.get("coalition", "port"),} - return template.substitute(values) + """ + Prepare the user-data script in cloud-init syntax. + Return the script as a string. 
+ """ + + with open("cloud/aws_worker_cloud_init.template", 'r') as f: + template = Template(f.read()) + values = { + "hostname": name, + "region": config.get("authentication", "region"), + "access_key": config.get("authentication", "accesskey"), + "secret_access_key": + config.get("authentication", "secretaccesskey"), + "bucket_name": config.get("storage", "name"), + "mount_point": config.get("storage", "mountpoint"), + "guerilla_render_filename": + config.get("storage", "guerillarenderfilename"), + "coalition_filename": + config.get("storage", "coalitionfilename"), + "coalition_server_ip": config.get("coalition", "ip"), + "coalition_server_port": config.get("coalition", "port"),} + return template.substitute(values) def _getInstanceIdByName(name): - """Return instanceid from name.""" - cmd = ["aws", "ec2", "describe-instances"] - output = common._check_output_or_none(cmd) - if output: - for resources in json.loads(output)["Reservations"]: - instance = resources["Instances"][0] - if instance.has_key("Tags"): - for tags in instance["Tags"]: - if tags["Key"] == "Name" and tags["Value"] == name: - return instance["InstanceId"] - return None + """Return instanceid from name.""" + cmd = ["aws", "ec2", "describe-instances"] + output = common._check_output_or_none(cmd) + if output: + for resources in json.loads(output)["Reservations"]: + instance = resources["Instances"][0] + if instance.has_key("Tags"): + for tags in instance["Tags"]: + if tags["Key"] == "Name" and tags["Value"] == name: + return instance["InstanceId"] + return None # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/cloud/common.py b/cloud/common.py index 70d4a59..0351291 100644 --- a/cloud/common.py +++ b/cloud/common.py @@ -9,27 +9,27 @@ def createWorkerInstanceName(prefix): - """Return a unique name based on prefix and timestamp.""" - return "%s%s" % (prefix, int(time())) - - -def _run_or_none(cmd): - """Execute command. 
Returns None in case of exception.""" - try: - return subprocess.Popen(cmd, stderr=subprocess.STDOUT, - universal_newlines=True) - except Exception as e: - print(e) - return None - -def _check_output_or_none(cmd): - """Execute command. Returns None in case of exception.""" - try: - return subprocess.check_output(cmd, stderr=subprocess.STDOUT, - universal_newlines=True) - except Exception as e: - print(e) - return None + """Return a unique name based on prefix and timestamp.""" + return "%s%s" % (prefix, int(time())) + + +def _run_or_none(cmd): + """Execute command. Returns None in case of exception.""" + try: + return subprocess.Popen(cmd, stderr=subprocess.STDOUT, + universal_newlines=True) + except Exception as e: + print(e) + return None + +def _check_output_or_none(cmd): + """Execute command. Returns None in case of exception.""" + try: + return subprocess.check_output(cmd, stderr=subprocess.STDOUT, + universal_newlines=True) + except Exception as e: + print(e) + return None # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/cloud/gcloud.py b/cloud/gcloud.py index 092c6c7..ef30842 100644 --- a/cloud/gcloud.py +++ b/cloud/gcloud.py @@ -15,68 +15,68 @@ def startInstance(name, config): - """ - Run the gcloud command to start a worker instance. - Return the created FIXME - """ - - # gcloud command line tool is picky with params escaping (eg. 
key-id in json) - # So we use a real temporary file - startup_script_file = tempfile.NamedTemporaryFile(delete=False) - startup_script_file.write(_getStartupScript(name, config)) - startup_script_file.flush() - os.fsync(startup_script_file.fileno()) - - - cmd = ["gcloud", "compute", "--project", config.get("authentication", "project"), - "instances", "create", name, - "--zone", config.get("worker", "zone"), - "--machine-type", config.get("worker", "machinetype"), - "--subnet", config.get("worker", "subnet"), - "--maintenance-policy", config.get("worker", "maintenancepolicy"), - "--service-account", config.get("authentication", "serviceaccount"), - "--scopes", config.get("authentication", "scopes"), - "--image", config.get("worker", "image"), - "--image-project", config.get("worker", "imageproject"), - "--boot-disk-size", config.get("worker", "bootdisksize"), - "--boot-disk-type", config.get("worker", "bootdisktype"), - "--boot-disk-device-name", name, - "--metadata-from-file", "startup-script={}".format(startup_script_file.name),] - if config.getboolean("worker", "preemptible") == True: - cmd.append("--preemptible") - common._run_or_none(cmd) + """ + Run the gcloud command to start a worker instance. + Return the created FIXME + """ + + # gcloud command line tool is picky with params escaping (eg. 
key-id in json) + # So we use a real temporary file + startup_script_file = tempfile.NamedTemporaryFile(delete=False) + startup_script_file.write(_getStartupScript(name, config)) + startup_script_file.flush() + os.fsync(startup_script_file.fileno()) + + + cmd = ["gcloud", "compute", "--project", config.get("authentication", "project"), + "instances", "create", name, + "--zone", config.get("worker", "zone"), + "--machine-type", config.get("worker", "machinetype"), + "--subnet", config.get("worker", "subnet"), + "--maintenance-policy", config.get("worker", "maintenancepolicy"), + "--service-account", config.get("authentication", "serviceaccount"), + "--scopes", config.get("authentication", "scopes"), + "--image", config.get("worker", "image"), + "--image-project", config.get("worker", "imageproject"), + "--boot-disk-size", config.get("worker", "bootdisksize"), + "--boot-disk-type", config.get("worker", "bootdisktype"), + "--boot-disk-device-name", name, + "--metadata-from-file", "startup-script={}".format(startup_script_file.name),] + if config.getboolean("worker", "preemptible") == True: + cmd.append("--preemptible") + common._run_or_none(cmd) def stopInstance(name, config): - """Run the gcloud command to terminate the instance.""" - zone = config.get("worker", "zone") - cmd = ["gcloud", "compute", "instances", "delete", "--quiet", "--zone", zone, name] - common._run_or_none(cmd) + """Run the gcloud command to terminate the instance.""" + zone = config.get("worker", "zone") + cmd = ["gcloud", "compute", "instances", "delete", "--quiet", "--zone", zone, name] + common._run_or_none(cmd) def _getStartupScript(name, config): - """ - Prepare the startup-script in bash script syntax. - Return the script as a string. 
- """ - - with open(config.get("authentication", "keyfile"), 'r') as f: - key_id_data = f.read() - - with open("cloud/gcloud_worker_startup_script.template", 'r') as f: - template = Template(f.read()) - values = { - "key_id_json": key_id_data, - "hostname": name, - "mount_point": config.get("storage", "mountpoint"), - "bucket_name": config.get("storage", "name"), + """ + Prepare the startup-script in bash script syntax. + Return the script as a string. + """ + + with open(config.get("authentication", "keyfile"), 'r') as f: + key_id_data = f.read() + + with open("cloud/gcloud_worker_startup_script.template", 'r') as f: + template = Template(f.read()) + values = { + "key_id_json": key_id_data, + "hostname": name, + "mount_point": config.get("storage", "mountpoint"), + "bucket_name": config.get("storage", "name"), "install_dir": config.get("worker", "installdir"), - "coalition_package": config.get("storage", "coalitionpackage"), - "main_program_package": config.get("main_program", "package"), - "main_program_environment": config.get("main_program", "environment"), - "coalition_server_ip": config.get("coalition", "ip"), - "coalition_server_port": config.get("coalition", "port"),} - return template.substitute(values) + "coalition_package": config.get("storage", "coalitionpackage"), + "main_program_package": config.get("main_program", "package"), + "main_program_environment": config.get("main_program", "environment"), + "coalition_server_ip": config.get("coalition", "ip"), + "coalition_server_port": config.get("coalition", "port"),} + return template.substitute(values) # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/cloud/qarnot_api.py b/cloud/qarnot_api.py index 4b14bfe..8d3be7c 100644 --- a/cloud/qarnot_api.py +++ b/cloud/qarnot_api.py @@ -14,51 +14,52 @@ def startInstance(name, config): - """Use qarnot API to start a worker instance. 
Instances are started by task creation.""" - startup_script_file = tempfile.NamedTemporaryFile(delete=True) - startup_script_file.write(_getStartupScript(name, config)) - startup_script_file.flush() - os.fsync(startup_script_file.fileno()) - startup_script_file.seek(0) + """Use qarnot API to start a worker instance. Instances are started by task creation.""" + startup_script_file = tempfile.NamedTemporaryFile(delete=True) + startup_script_file.write(_getStartupScript(name, config)) + startup_script_file.flush() + os.fsync(startup_script_file.fileno()) + startup_script_file.seek(0) - connection = qarnot.connection.Connection("cloud_qarnot.ini") - # We need internet access and start one instance at time - task = connection.create_task(name, 'docker-network', 1) - task.constants["DOCKER_REPO"] = config.get("worker", "docker_repo") - task.constants["DOCKER_TAG"] = config.get("worker", "docker_tag") - task.constants["DOCKER_HOST"] = common.createWorkerInstanceName(config.get("worker", "nameprefix")) - task.constants["DOCKER_CMD"] = startup_script_file.read() - p = Process(target=task.run) - p.start() + connection = qarnot.connection.Connection("cloud_qarnot.ini") + # We need internet access and start one instance at time + task = connection.create_task(name, 'docker-network', 1) + task.constants["DOCKER_REPO"] = config.get("worker", "docker_repo") + task.constants["DOCKER_TAG"] = config.get("worker", "docker_tag") + task.constants["DOCKER_HOST"] = common.createWorkerInstanceName(config.get("worker", "nameprefix")) + task.constants["DOCKER_CMD"] = startup_script_file.read() + p = Process(target=task.run) + p.start() def stopInstance(name, config): - """Use qarnot API to terminate the instance.""" + """Use qarnot API to terminate the instance.""" - connection = qarnot.connection.Connection("cloud_qarnot.ini") - tasks = connection.tasks() - task = [t for t in tasks if t.name == name][0] - task.delete() + connection = qarnot.connection.Connection("cloud_qarnot.ini") + tasks 
= connection.tasks() + task = [t for t in tasks if t.name == name][0] + task.delete() def _getStartupScript(name, config): - """Build the workers startup script.""" + """Build the workers startup script.""" - with open(config.get("authentication", "keyfile"), 'r') as f: - key_id_data = f.read() + with open(config.get("authentication", "keyfile"), 'r') as f: + key_id_data = f.read() - with open("cloud/qarnot_worker_startup_script.template", 'r') as f: - template = Template(f.read()) - values = { - "key_id_json": key_id_data, - "hostname": name, - "mount_point": config.get("storage", "mountpoint"), - "bucket_name": config.get("storage", "name"), + with open("cloud/qarnot_worker_startup_script.template", 'r') as f: + template = Template(f.read()) + values = { + "key_id_json": key_id_data, + "hostname": name, + "mount_point": config.get("storage", "mountpoint"), + "bucket_name": config.get("storage", "name"), "install_dir": config.get("worker", "installdir"), - "coalition_package": config.get("storage", "coalitionpackage"), - "main_program_package": config.get("main_program", "package"), - "main_program_environment": config.get("main_program", "environment"), - "coalition_server_ip": config.get("coalition", "ip"), - "coalition_server_port": config.get("coalition", "port"),} - return template.substitute(values) + "coalition_package": config.get("storage", "coalitionpackage"), + "main_program_package": config.get("main_program", "package"), + "main_program_environment": config.get("main_program", "environment"), + "coalition_server_ip": config.get("coalition", "ip"), + "coalition_server_port": config.get("coalition", "port"),} + return template.substitute(values) # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 + diff --git a/control.py b/control.py index d7b4155..eda6bf7 100644 --- a/control.py +++ b/control.py @@ -19,129 +19,129 @@ globalprogress=None def usage(): - print ("Usage: control.py [OPTIONS] SERVER_URL ACTION [COMMAND]") - print ("Control 
the Coalition server located at SERVER_URL.\n") - print("Actions:") - print(" add: add a job, use option -c for command") - print(" list: list the jobs on the server") - print(" remove: remove job designated by id, option -i is necessary") - print ("Options:") - print (" -h, --help\t\tShow this help") - print (" -v, --verbose\t\tIncrease verbosity") - print (" -c, --cmd=COMMAND\t\tIf action is add, add command to server") - print (" -d, --directory=DIR\tWorking directory (default: "+dir+")") - print (" -t, --title=TITLE\tSet the job title (default: "+title+")") - print (" -p, --priority=PRIORITY\tPriority of the job (default: "+str(priority)+")") - print (" -r, --retry=RETRY\tNumber of retry this jobs can do (default: "+str(retry)+")") - print (" -a, --affinity=AFFINITY\tAffinity words to workers, separated by a comma (default: \"\"") - print (" -i, --jobid=JOBID\tID of the Job") - print (" -T, --timeout=TIMEOUT\ttimeout for the job") - print (" -D, --dependencies=DEPS\tIDs of the dependent jobs (exemple : \"21 22 23\"") - print (" -P, --parent=PARENT\tId of of the parent of the job") - print (" --globalprogress=PATTERN\tThe job progression pattern") - print (" --localprogress=PATTERN\tThe second job progression pattern") + print ("Usage: control.py [OPTIONS] SERVER_URL ACTION [COMMAND]") + print ("Control the Coalition server located at SERVER_URL.\n") + print("Actions:") + print(" add: add a job, use option -c for command") + print(" list: list the jobs on the server") + print(" remove: remove job designated by id, option -i is necessary") + print ("Options:") + print (" -h, --help\t\tShow this help") + print (" -v, --verbose\t\tIncrease verbosity") + print (" -c, --cmd=COMMAND\t\tIf action is add, add command to server") + print (" -d, --directory=DIR\tWorking directory (default: "+dir+")") + print (" -t, --title=TITLE\tSet the job title (default: "+title+")") + print (" -p, --priority=PRIORITY\tPriority of the job (default: "+str(priority)+")") + print (" -r, 
--retry=RETRY\tNumber of retry this jobs can do (default: "+str(retry)+")") + print (" -a, --affinity=AFFINITY\tAffinity words to workers, separated by a comma (default: \"\"") + print (" -i, --jobid=JOBID\tID of the Job") + print (" -T, --timeout=TIMEOUT\ttimeout for the job") + print (" -D, --dependencies=DEPS\tIDs of the dependent jobs (exemple : \"21 22 23\"") + print (" -P, --parent=PARENT\tId of of the parent of the job") + print (" --globalprogress=PATTERN\tThe job progression pattern") + print (" --localprogress=PATTERN\tThe second job progression pattern") - print ("\nExample : control -t \"Job\" -a \"Linux\" -c \"echo Hello world!\" http://localhost:8080 add") + print ("\nExample : control -t \"Job\" -a \"Linux\" -c \"echo Hello world!\" http://localhost:8080 add") # Parse the options try: - opts, args = getopt.getopt(sys.argv[1:], "a:d:e:h:r:s:t:v:c:i:D:p:T:P:", ["affinity=", "directory=", "end=", "help", "retry=", "start=", "title=", "verbose=", "command=", "cmd=", "dependencies=", "priority=", "timeout=","parent=","localprogress=","globalprogress="]) - if len(args) != 2 : - usage() - sys.exit(2) - serverUrl = args[0] - while serverUrl[-1] == '/': - serverUrl = serverUrl[:-1] - action = args[1] + opts, args = getopt.getopt(sys.argv[1:], "a:d:e:h:r:s:t:v:c:i:D:p:T:P:", ["affinity=", "directory=", "end=", "help", "retry=", "start=", "title=", "verbose=", "command=", "cmd=", "dependencies=", "priority=", "timeout=","parent=","localprogress=","globalprogress="]) + if len(args) != 2 : + usage() + sys.exit(2) + serverUrl = args[0] + while serverUrl[-1] == '/': + serverUrl = serverUrl[:-1] + action = args[1] except getopt.GetoptError as err: - # print help information and exit: - print str(err) # will print something like "option -a not recognized" - usage() - sys.exit(2) + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) for o, a in opts: - if o in ("-h", "--help"): - usage() - 
sys.exit(2) - elif o in ("-d", "--directory"): - dir = a - elif o in ("-s", "--start"): - startIndex = int(a) - elif o in ("-e", "--end"): - endIndex = int(a) - elif o in ("-v", "--verbose"): - verbose = True - elif o in ("-r", "--retry"): - retry = int(a) - elif o in ("-p", "--priority"): - priority = int(a) - elif o in ("-a", "--affinity"): - affinity = a - elif o in ("-t", "--title"): - title = a - elif o in ("-c", "--command", "--cmd"): - cmd=a - elif o in ("-i", "--jobid"): - id=a - elif o in ("-D", "--dependencies"): - dependencies=a - elif o in ("-T", "--timeout"): - timeout=int(a) - elif o in ("-P", "--parent"): - parent=int(a) - elif o in ("--localprogress"): - localprogress = a - elif o in ("--globalprogress"): - globalprogress = a - else: - assert False, "unhandled option " + o + if o in ("-h", "--help"): + usage() + sys.exit(2) + elif o in ("-d", "--directory"): + dir = a + elif o in ("-s", "--start"): + startIndex = int(a) + elif o in ("-e", "--end"): + endIndex = int(a) + elif o in ("-v", "--verbose"): + verbose = True + elif o in ("-r", "--retry"): + retry = int(a) + elif o in ("-p", "--priority"): + priority = int(a) + elif o in ("-a", "--affinity"): + affinity = a + elif o in ("-t", "--title"): + title = a + elif o in ("-c", "--command", "--cmd"): + cmd=a + elif o in ("-i", "--jobid"): + id=a + elif o in ("-D", "--dependencies"): + dependencies=a + elif o in ("-T", "--timeout"): + timeout=int(a) + elif o in ("-P", "--parent"): + parent=int(a) + elif o in ("--localprogress"): + localprogress = a + elif o in ("--globalprogress"): + globalprogress = a + else: + assert False, "unhandled option " + o # Log function def output (str): - if verbose: - print (str) + if verbose: + print (str) if action=="add": - params = urllib.urlencode({'parent':parent, 'title':title, 'cmd':cmd, 'dir':dir, 'priority':priority, 'retry':retry, 'timeout':timeout, 'affinity':affinity, 'dependencies':dependencies, 'localprogress':localprogress, 'globalprogress':globalprogress}) 
- conn = httplib.HTTPConnection(re.sub("^http://", "", serverUrl)) - conn.request("GET", "/json/addjob?"+params) - response = conn.getresponse() - data = response.read() - conn.close() - print data - + params = urllib.urlencode({'parent':parent, 'title':title, 'cmd':cmd, 'dir':dir, 'priority':priority, 'retry':retry, 'timeout':timeout, 'affinity':affinity, 'dependencies':dependencies, 'localprogress':localprogress, 'globalprogress':globalprogress}) + conn = httplib.HTTPConnection(re.sub("^http://", "", serverUrl)) + conn.request("GET", "/json/addjob?"+params) + response = conn.getresponse() + data = response.read() + conn.close() + print data + elif action=="list": - params = urllib.urlencode({'id':parent}) - conn = httplib.HTTPConnection(re.sub("^http://", "", serverUrl)) - conn.request("GET", "/json/getjobs?"+params) - response = conn.getresponse() - data = response.read() - conn.close() + params = urllib.urlencode({'id':parent}) + conn = httplib.HTTPConnection(re.sub("^http://", "", serverUrl)) + conn.request("GET", "/json/getjobs?"+params) + response = conn.getresponse() + data = response.read() + conn.close() + + data = eval (data) + vars=data["Vars"] + print (vars) + jobs=data["Jobs"] + parents=data["Parents"] - data = eval (data) - vars=data["Vars"] - print (vars) - jobs=data["Jobs"] - parents=data["Parents"] - - parents_info='' - for i in range(len(parents)): - parents_info = parents_info+ str(parents[i]["ID"])+" " +str(parents[i]["Title"])+ " > " - print(parents_info) - for i in range(len(jobs)): - for j in range(len(vars)): - print (jobs[i]) + parents_info='' + for i in range(len(parents)): + parents_info = parents_info+ str(parents[i]["ID"])+" " +str(parents[i]["Title"])+ " > " + print(parents_info) + for i in range(len(jobs)): + for j in range(len(vars)): + print (jobs[i]) elif action=="remove": - if id<0: - print("Use option -i to specify the job id to remove") - else: - params = urllib.urlencode({'id':id}) - conn = 
httplib.HTTPConnection(re.sub("^http://", "", serverUrl)) - conn.request("GET", "/json/clearjobs?"+params) - response = conn.getresponse() - data = response.read() - conn.close() + if id<0: + print("Use option -i to specify the job id to remove") + else: + params = urllib.urlencode({'id':id}) + conn = httplib.HTTPConnection(re.sub("^http://", "", serverUrl)) + conn.request("GET", "/json/clearjobs?"+params) + response = conn.getresponse() + data = response.read() + conn.close() else: - print("I don't know what to do with myself. Use another action") + print("I don't know what to do with myself. Use another action") # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/db.py b/db.py index 5f30887..bd59c3b 100644 --- a/db.py +++ b/db.py @@ -4,98 +4,98 @@ class DB(object): - def __init__(self): - self.IntoWith = False + def __init__(self): + self.IntoWith = False - '''Enter a transaction block''' - def __enter__(self): + '''Enter a transaction block''' + def __enter__(self): - self.Jobs = {} - self.Worker = {} + self.Jobs = {} + self.Worker = {} - # Those map are the edits done on every objects to commit at the end of the transaction - self.JobsToUpdate = {} - self.WorkersToUpdate = {} + # Those map are the edits done on every objects to commit at the end of the transaction + self.JobsToUpdate = {} + self.WorkersToUpdate = {} - self.IntoWith = True + self.IntoWith = True - '''Leave a transaction block''' - def __exit__ (self, type, value, traceback): - self.IntoWith = False - if not isinstance(value, TypeError): - self.editJobs(self.JobsToUpdate) - self.editWorkers(self.WorkersToUpdate) + '''Leave a transaction block''' + def __exit__ (self, type, value, traceback): + self.IntoWith = False + if not isinstance(value, TypeError): + self.editJobs(self.JobsToUpdate) + self.editWorkers(self.WorkersToUpdate) - def getRoot (self): - return Job (self, 0, 0, "Root", "", "", "", "", "", 0, 0, 0, 0, 0, 0, 0, "", "", 0, 0, 0, 0, 0, 0, 0, "", "", "") + 
def getRoot (self): + return Job (self, 0, 0, "Root", "", "", "", "", "", 0, 0, 0, 0, 0, 0, 0, "", "", 0, 0, 0, 0, 0, 0, 0, "", "", "") class Worker(object): - ''' - The database proxy object for a worker - - This object is readonly outside a transaction block. - ''' - def __init__ (self, db, values): - self.db = db - self.name = values['name'] - self.Data = values - # Should not exist in the cache - assert (db.Workers.get (self.name) == None) - # Cache it - db.Workers[self.name] = self - - def __setattr__(self, attr, value): - # Backup the value for delayed writting - db = super (object, self).__getattr__ ('db') - name = super (object, self).__getattr__ ('name') - data = super (object, self).__getattr__ ('data') - if not db.IntoWith: - raise Exception - w = db.WorkerToUpdate.get (name) - if not w: - w = {} - db.WorkersToUpdate[name] = w - w[attr] = value - data[attr] = value - - def __getattr__(self, attr): - data = super (object, self).__getattr__ ('data') - return data[attr] + ''' + The database proxy object for a worker + + This object is readonly outside a transaction block. + ''' + def __init__ (self, db, values): + self.db = db + self.name = values['name'] + self.Data = values + # Should not exist in the cache + assert (db.Workers.get (self.name) == None) + # Cache it + db.Workers[self.name] = self + + def __setattr__(self, attr, value): + # Backup the value for delayed writting + db = super (object, self).__getattr__ ('db') + name = super (object, self).__getattr__ ('name') + data = super (object, self).__getattr__ ('data') + if not db.IntoWith: + raise Exception + w = db.WorkerToUpdate.get (name) + if not w: + w = {} + db.WorkersToUpdate[name] = w + w[attr] = value + data[attr] = value + + def __getattr__(self, attr): + data = super (object, self).__getattr__ ('data') + return data[attr] class Job(object): - ''' - The database proxy object for a job - - This object is readonly outside a transaction block. 
- ''' - def __init__ (self, db, values): - self.db = db - self.id = values['id'] - self.Data = values - # Should not exist in the cache - assert (db.Jobs.get (self.id) == None) - # Cache it - db.Jobs[self.id] = self - - def __setattr__(self, attr, value): - # Backup the value for delayed writting - db = super (object, self).__getattr__ ('db') - id = super (object, self).__getattr__ ('id') - data = super (object, self).__getattr__ ('data') - if not db.IntoWith: - raise Exception - w = db.WorkerToUpdate.get (id) - if not w: - w = {} - db.WorkersToUpdate[id] = w - w[attr] = value - data[attr] = value - - def __getattr__(self, attr): - data = super (object, self).__getattr__ ('data') - return data[attr] + ''' + The database proxy object for a job + + This object is readonly outside a transaction block. + ''' + def __init__ (self, db, values): + self.db = db + self.id = values['id'] + self.Data = values + # Should not exist in the cache + assert (db.Jobs.get (self.id) == None) + # Cache it + db.Jobs[self.id] = self + + def __setattr__(self, attr, value): + # Backup the value for delayed writting + db = super (object, self).__getattr__ ('db') + id = super (object, self).__getattr__ ('id') + data = super (object, self).__getattr__ ('data') + if not db.IntoWith: + raise Exception + w = db.WorkerToUpdate.get (id) + if not w: + w = {} + db.WorkersToUpdate[id] = w + w[attr] = value + data[attr] = value + + def __getattr__(self, attr): + data = super (object, self).__getattr__ ('data') + return data[attr] # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/db_mysql.py b/db_mysql.py index 1b24640..7e0a058 100644 --- a/db_mysql.py +++ b/db_mysql.py @@ -4,35 +4,36 @@ class DBMySQL(DBSQL): - # The Context class allows using context capsules for the sql transactions - # Note: this behaviour has disappeared from MySQLdb as of 2018 - class Context: - def __init__ (self, db, conn): - self.DB = db - self.Conn = conn - self.Conn.ping(True) - 
self.Conn.autocommit = False - - def __enter__(self): - pass - - def __exit__ (self, type, value, traceback): - if type is None: - self.Conn.commit () - else: - if self.DB.Verbose: - sys.stdout.flush () - sys.stdout.write ("[SQL] Warning: db context exited with an exception, rollback!\n") - sys.stdout.flush () - self.Conn.rollback () - - def cursor (self): - return self.Conn.cursor () - - - def __init__ (self, host, user, password, database, **kwargs): - self.config = kwargs["config"] - self.cloudconfig = kwargs["cloudconfig"] - self.Conn = self.Context (self, MySQLdb.connect(host, user, password, database)) - # super is called *after* because DBSQL inits stuffs in the DB - super(DBMySQL, self).__init__() + # The Context class allows using context capsules for the sql transactions + # Note: this behaviour has disappeared from MySQLdb as of 2018 + class Context: + def __init__ (self, db, conn): + self.DB = db + self.Conn = conn + self.Conn.ping(True) + self.Conn.autocommit = False + + def __enter__(self): + pass + + def __exit__ (self, type, value, traceback): + if type is None: + self.Conn.commit () + else: + if self.DB.Verbose: + sys.stdout.flush () + sys.stdout.write ("[SQL] Warning: db context exited with an exception, rollback!\n") + sys.stdout.flush () + self.Conn.rollback () + + def cursor (self): + return self.Conn.cursor () + + + def __init__ (self, host, user, password, database, **kwargs): + self.config = kwargs["config"] + self.cloudconfig = kwargs["cloudconfig"] + self.Conn = self.Context (self, MySQLdb.connect(host, user, password, database)) + # super is called *after* because DBSQL inits stuffs in the DB + super(DBMySQL, self).__init__() + diff --git a/db_sql.py b/db_sql.py index c88aa56..53fee68 100644 --- a/db_sql.py +++ b/db_sql.py @@ -9,1291 +9,1291 @@ def convdata (d): - return isinstance(d, str) and repr (d) or (isinstance(d, bool) and (d and '1' or '0') or (isinstance(d, unicode) and repr(str(d)) or str(d))) + return isinstance(d, str) and 
repr (d) or (isinstance(d, bool) and (d and '1' or '0') or (isinstance(d, unicode) and repr(str(d)) or str(d))) class LdapError(Exception): - """Error class for LDAP exceptions.""" - def __init__(self, value): - self.value = value - def __str__(self): - return "[LDAP] Error: {value}".format(value=self.value) + """Error class for LDAP exceptions.""" + def __init__(self, value): + self.value = value + def __str__(self): + return "[LDAP] Error: {value}".format(value=self.value) class DBSQL(DB): - def __init__ (self): - self.StartTime = time.time () - self.lastworkerinstancestarttime = 0 - self.LastUpdate = 0 - self.EnterTime = 0 - self.RunTime = 0.0 - self.HeartBeats = 0 - self.PickJobs = 0 - self.Verbose = False - self.NotifyFinished = None - self.NotifyError = None - self.Workers = dict() - self.AffinityBitsToName = dict() - - tables = self._getDatabaseTables() - - if (("Workers",) in tables) or (("workers",) in tables): - self._populateWorkersCache() - - if (("Affinities",) in tables) or (("affinities",) in tables): - self._populateAffinitiesTable() - - def __enter__(self): - self.EnterTime = time.time () - self.Conn.__enter__ () - - def __exit__ (self, type, value, traceback): - self.RunTime = time.time ()-self.EnterTime - if not isinstance(value, TypeError): - self._update () - self.Conn.__exit__ (type, value, traceback) - - def _execute (self, cur, req, data=None): - now = time.time () - if data: - cur.execute (req, data) - else: - cur.execute (req) - after = time.time () - if self.Verbose: - sys.stdout.flush () - sys.stdout.write ("[SQL] (%f/%f) %s\n" % (now-self.StartTime, after-now, req)) - sys.stdout.flush () - - def _rowAsDict (self, cur, row): - if row: - result = {} - for idx, col in enumerate (cur.description): - result[col[0]] = row[idx] - return result - else: - return None - - def _populateWorkersCache(self): - """Populate cache with pre-existent data in Workers table.""" - cur = self.Conn.cursor () - self._execute (cur, "SELECT name FROM Workers") - 
for worker in cur: - info = {} - info['ping_time'] = int (time.time ()) - info['cpu'] = '' - info['free_memory'] = 0 - info['total_memory'] = 0 - info['ip'] = '' - info['timeout'] = False - self.Workers[worker[0]] = info - - def _populateAffinitiesTable(self): - """Populate Affinities table with pre-existent data having an id < 64.""" - cur = self.Conn.cursor () - with self.Conn: - affinities = {} - self._execute (cur, "SELECT id, name FROM Affinities") - for row in cur: - affinities[int (row[0])] = row[1] - for i in range (1, 64): - if not i in affinities: - self._execute (cur, "INSERT INTO Affinities (id, name) VALUES (%d,'')" % i) - - def _getLdapPermission(self, action): - """Check ldap permissions. - - If ldap is not configured, ldap_user is unset, return "". - If ldap user is allowed to do the action, return the additional sql filter - or "" if the user has global permissions. - If ldap user is not allowed, return False.""" - - if not hasattr(self, "ldap_user") or not self.ldap_user: # LDAP is not set up in configuration or ldapunsafeapi is set to True - return "" - if action == "addjob": - if self.permissions["ldaptemplateaddjobglobal"]: - return "" - elif self.permissions["ldaptemplateaddjob"]: - return "AND user='{user}'".format(user=self.ldap_user) - else: - raise LdapError("Action '{action}' is not permitted for user '{user}'".format(action=action, user=self.ldap_user)) - return False - elif action == "viewjob": - if self.permissions["ldaptemplateviewjobglobal"]: - return "" - elif self.permissions["ldaptemplateviewjob"]: - return "AND user='{user}'".format(user=self.ldap_user) - else: - raise LdapError("Action '{action}' is not permitted for user '{user}'".format(action=action, user=self.ldap_user)) - return False - elif action == "editjob": - if self.permissions["ldaptemplateeditjobglobal"]: - return "" - elif self.permissions["ldaptemplateeditjob"]: - return "AND user='{user}'".format(user=self.ldap_user) - else: - raise LdapError("Action '{action}' 
is not permitted for user '{user}'".format(action=action, user=self.ldap_user)) - return False - elif action == "deletejob": - if self.permissions["ldaptemplatedeletejobglobal"]: - return "" - elif self.permissions["ldaptemplatedeletejob"]: - return "AND user='{user}'".format(user=self.ldap_user) - else: - raise LdapError("Action '{action}' is not permitted for user '{user}'".format(action=action, user=self.ldap_user)) - return False - else: - raise LdapError("Action '{action}' is not defined for user '{user}'".format(action=action, user=self.ldap_user)) - return False - - def listJobs (self): - cur = self.Conn.cursor () - self._execute (cur, "SELECT * FROM Jobs") - for row in cur: - print (self._rowAsDict (cur, row)) - - def listUnpausedWaitingJobs(self): - """Get jobs currently waiting for a worker.""" - cur = self.Conn.cursor () - req = "SELECT * FROM Jobs WHERE state = 'WAITING' and paused = 0" - self._execute (cur, req) - return [self._rowAsDict (cur, row) for row in cur] - - def listWorkers (self): - cur = self.Conn.cursor () - self._execute (cur, "SELECT * FROM Workers") - for row in cur: - print (row) - - def listWorkersByStates(self, state, *argv): - cur = self.Conn.cursor () - req = "SELECT * FROM Workers WHERE state = '%s'" % state - if argv: - for arg in argv: - req += " OR state = '%s'" % arg - self._execute (cur, req) - return [self._rowAsDict (cur, row) for row in cur] - - def listAffinities (self): - cur = self.Conn.cursor () - self._execute (cur, "SELECT id, name FROM Affinities") - aff = {} - for row in cur: - if row[1] != "" and row[0] >= 1 and row[0] <= 63: - aff[row[1]] = (1L << (row[0]-1)) - return aff - - def getAffinities (self): - cur = self.Conn.cursor () - self._execute (cur, "SELECT id, name FROM Affinities") - aff = {} - for row in cur: - if row[0] >= 1 and row[0] <= 63: - aff[row[0]] = row[1] - return aff - - def setAffinities (self, affinities): - # reset affinities cache - self.AffinityBitsToName = {} - cur = self.Conn.cursor () - 
for id, affinity in affinities.iteritems (): - self._execute (cur, "UPDATE Affinities SET name = '%s' WHERE id = %d" % (affinity, int (id))) - - def getAffinityMask (self, affinities): - if affinities == "": - return 0 - aff = self.listAffinities () - mask = 0L - cur = self.Conn.cursor () - for affinity in affinities.split (","): - if affinity != "": - m = re.match(r"^#(\d+)$", affinity) - if m: - bit = (int(m.group (1))-1) - mask = mask | (1L << bit) - else: - mask = mask | aff[affinity] - return mask - - def getAffinityString (self, affinity_bits): - if affinity_bits == 0: - return "" - if affinity_bits in self.AffinityBitsToName: - return self.AffinityBitsToName[affinity_bits] - names = [] - aff = self.getAffinities() - for id, name in aff.iteritems (): - bit = (1L << (id-1)) - if affinity_bits & bit != 0: - if name != '': - names.append (name) - else: - names.append ("#"+ str (id)) - names.sort () - result = ",".join (names) - self.AffinityBitsToName[affinity_bits] = result - return result - - def newJob(self, parent, title, command, dir, environment, state, paused, timeout, - priority, affinity, user, url, progress_pattern, dependencies = None): - ldap_perm = self._getLdapPermission("addjob") - if ldap_perm is False: - return None - if ldap_perm != "": # User can add job owned by himself, force user value - user = self.ldap_user - cur = self.Conn.cursor() - self._execute(cur, - "SELECT h_depth, h_affinity, h_priority, h_paused, command " - "FROM Jobs " - "WHERE id = {parent} {ldap_perm}".format(parent=parent, ldap_perm=ldap_perm)) - data = cur.fetchone() - paused = 0 - if state == "PAUSED": - paused = 1; - if data is None: - data = [-1, 0, 0, 0, ''] - if data[4] != '': - print("Error: can't add job, parent {parent} is not a group".format(parent=parent)) - return None - # one depth below - h_depth = data[0]+1 - # merge parent affinities with child affinities - parent_affinities = data[1] - child_affinities = self.getAffinityMask(affinity) - h_affinity = 
parent_affinities | child_affinities - # merge priority - priority = max(0, min(255, int(priority))) - h_priority = data[2] + (priority << (56-h_depth*8)) - h_paused = data[3] or paused - - self._execute (cur, - "INSERT INTO Jobs (" - "parent, title, command, dir, environment, timeout," - "priority, affinity, affinity_bits, user, url," - "progress_pattern, paused, state, worker, h_depth," - "h_affinity, h_priority, h_paused" - ") VALUES (" - "{parent}, {title}, {command}, {directory}, {environment}, {timeout}," - "{priority}, {affinity}, {child_affinities}, {user}, {url}," - "{progress_pattern}, {paused}, {state}, {worker}, {h_depth}," - "{h_affinity}, {h_priority}, {h_paused})".format(parent=parent, title=repr(title), command=repr(command), - directory=repr(dir), environment=repr(environment), timeout=timeout, - priority=priority, affinity=repr(affinity), child_affinities=child_affinities, - user=repr(user), url=repr(url), progress_pattern=repr(progress_pattern), - paused="'"+str(paused)+"'", state="'WAITING'", worker="''", h_depth=int(h_depth), - h_affinity=h_affinity, h_priority=int(h_priority), h_paused="'"+str(h_paused)+"'")) - - data = cur.fetchone () - job = self.getJob (cur.lastrowid) - if job is not None and dependencies is not None: - self.setJobDependencies (job['id'], dependencies) - self._updateJobCounters (parent) - job['dependencies'] = dependencies - return job - - def getJob(self, id): - ldap_perm = self._getLdapPermission("viewjob") - if ldap_perm is False: - return None - print("LDAP_PERM", ldap_perm) - cur = self.Conn.cursor() - self._execute(cur, - "SELECT * FROM Jobs " - "WHERE id = {id} {ldap_perm}".format(id=id, ldap_perm=ldap_perm)) - result = self._rowAsDict(cur, cur.fetchone()) - if result is not None: - if result['paused']: - result['state'] = str("PAUSED") - if result['state'] == "WORKING" and result['total'] == 0: - current_time = int(time.time ()) - result['duration'] = current_time - result['start_time'] - result['affinity'] = 
self.getAffinityString(result['affinity_bits']) - # get dependencies - result['dependencies'] = [] - self._execute(cur, - "SELECT job.id FROM Jobs AS job " - "INNER JOIN Dependencies AS dep " - "ON job.id = dep.dependency " - "WHERE dep.job_id = {id}".format(id=id)) - for row in cur: - result['dependencies'].append(row[0]) - return result - - def getJobChildren(self, id, data): - ldap_perm = self._getLdapPermission("viewjob") - if ldap_perm is False: - return None - cur = self.Conn.cursor() - self._execute(cur, - "SELECT * FROM Jobs " - "WHERE parent = {id} {ldap_perm}".format(id=id, ldap_perm=ldap_perm)) - jobs = [] - for row in cur: - result = self._rowAsDict (cur, row) - if result and result['paused']: - result['state'] = str ("PAUSED") - if result['state'] == "WORKING" and result['total'] == 0: - current_time = int (time.time ()) - result['duration'] = current_time - result['start_time'] - result['affinity'] = self.getAffinityString (result['affinity_bits']) - jobs.append (result) - return jobs - - def getJobDependencies(self, id): - ldap_perm = self._getLdapPermission("viewjob") - if ldap_perm is False: - return None - cur = self.Conn.cursor() - self._execute (cur, - "SELECT job.* FROM Jobs AS job " - "INNER JOIN Dependencies AS dep " - "ON job.id = dep.dependency " - "WHERE dep.job_id = {id} {ldap_perm}".format(id=id, ldap_perm=ldap_perm)) - rows = cur.fetchall() - return [self._rowAsDict (cur, row) for row in rows] - - def getCountJobsWhere(self, where_clause=''): - """Get the number of matching jobs.""" - cur = self.Conn.cursor() - self._execute(cur, "SELECT COUNT(*) FROM Jobs WHERE {}".format(where_clause[0])) - return cur.fetchone()[0] - - def getJobsWhere(self, where_clause='', index_min=0, index_max=1): - """Get Jobs via a readonly SQL request.""" - cur = self.Conn.cursor() - self._execute(cur, "SELECT * FROM Jobs WHERE {} LIMIT {},{}".format(where_clause, index_min, index_max)) - return [self._rowAsDict (cur, row) for row in cur.fetchall()] - - def 
getJobsUsers(self): - """Get users.""" - cur = self.Conn.cursor() - self._execute(cur, "SELECT DISTINCT user FROM Jobs ORDER BY user") - return [self._rowAsDict (cur, row) for row in cur.fetchall()] - - def getJobsStates(self): - """Get States.""" - cur = self.Conn.cursor() - self._execute(cur, "SELECT DISTINCT state FROM Jobs") - return [self._rowAsDict (cur, row) for row in cur.fetchall()] - - def getJobsWorkers(self): - """Get States.""" - cur = self.Conn.cursor() - self._execute(cur, "SELECT DISTINCT worker FROM Jobs") - return [self._rowAsDict (cur, row) for row in cur.fetchall()] - - def getJobsPriorities(self): - """Get States.""" - cur = self.Conn.cursor() - self._execute(cur, "SELECT DISTINCT priority FROM Jobs") - return [self._rowAsDict (cur, row) for row in cur.fetchall()] - - def getJobsAffinities(self): - """Get States.""" - cur = self.Conn.cursor() - self._execute(cur, "SELECT DISTINCT affinity FROM Jobs") - return [self._rowAsDict (cur, row) for row in cur.fetchall()] - - def getChildrenDependencyIds (self, id): - cur = self.Conn.cursor () - self._execute (cur, "SELECT job.id AS id, dep.dependency AS dependency FROM Dependencies AS dep " - "INNER JOIN Jobs AS job ON job.id = dep.job_id " - " WHERE job.parent = %d" % id) - - def getChildrenDependencyIds(self, id): - cur = self.Conn.cursor() - self._execute(cur, """ - SELECT job.id AS id, dep.dependency AS dependency - FROM Dependencies AS dep - INNER JOIN Jobs AS job - ON job.id = dep.job_id - WHERE job.parent = {id} - """.format(id=id)) - rows = cur.fetchall() - return [self._rowAsDict(cur, row) for row in rows] - - def getWorker (self, hostname): - cur = self.Conn.cursor () - self._execute (cur, "SELECT * FROM Workers WHERE name = '%s'" % hostname) - worker = self._rowAsDict (cur, cur.fetchone ()) - try: - info = self.Workers[hostname] - worker['ping_time'] = info['ping_time'] - worker['cpu'] = info['cpu'] - worker['free_memory'] = info['free_memory'] - worker['total_memory'] = info['total_memory'] 
- except: - pass - - self._execute (cur, "SELECT affinity FROM WorkerAffinities WHERE worker_name = '%s'" % ( hostname ) ) - affinities = [] - - data = cur.fetchone() - - if data is None: - worker['affinity'] = "" - return worker - - for data in cur: - affinities.append( self.getAffinityString( data[0] ) ) - - worker['affinity'] = "\n".join( affinities ) - return worker - - def getWorkerStartTime(self, name): - """Get the number of seconds since epoch.""" - cur = self.Conn.cursor () - self._execute(cur, "SELECT start_time FROM Workers WHERE name = '%s'" % name) - db_type = self._getDatabaseType() - if db_type == "mysql": - start_time = cur.fetchone()[0].timetuple() - else: - start_time = time.strptime(cur.fetchone()[0], '%Y-%m-%d %H:%M:%S') - return time.mktime(start_time) - - def getWorkers (self): - cur = self.Conn.cursor () - self._execute (cur, "SELECT * FROM Workers") - workers = [] - for row in cur: - worker = self._rowAsDict (cur, row) - try: - info = self.Workers[worker['name']] - worker['ping_time'] = info['ping_time'] - worker['cpu'] = info['cpu'] - worker['free_memory'] = info['free_memory'] - worker['total_memory'] = info['total_memory'] - except: - pass - - req = self.Conn.cursor() - self._execute( req, "SELECT affinity FROM WorkerAffinities WHERE worker_name = '%s'" % ( worker['name'] ) ) - affinities = [] - - for d in req: - - affinities.append( self.getAffinityString( d[0] ) ) - - worker['affinity'] = "\n".join( affinities ) - worker['start_time'] = self.getWorkerStartTime(worker['name']) - workers.append (worker) - return workers - - def getEvents (self, job, worker, howlong): - cur = self.Conn.cursor() - req = "SELECT * FROM Events WHERE start > %d" % (int(time.time())-howlong) - if worker: - req += " AND worker=%s" % convdata (worker) - if job > 0: - req += " AND job_id=%d" % job - self._execute (cur, req); - return [self._rowAsDict (cur, row) for row in cur.fetchall ()] - - def editJobs (self, jobs): - ldap_perm = 
self._getLdapPermission("editjob") - if ldap_perm is False: - return None - cur = self.Conn.cursor () - for id, attr in jobs.iteritems (): - if attr.has_key("user") and attr["user"].lower() != self.ldap_user.lower() and ldap_perm != "": - # User has no global permission, so he can change his own jobs only - raise LdapError("User '{user}' is not allowed to change job id='{id}' user attribute to '{user_attr}'".format( - user=self.ldap_user, id=id, user_attr=attr["user"])) - break - toUpdate = [k+"="+convdata(v) for k,v in attr.iteritems() - if k != 'dependencies' and k != 'affinity' and k != 'priority' and - k != 'state' and k != 'parent'] - if toUpdate: - req = "UPDATE Jobs SET " + ",".join (toUpdate) + " WHERE id=" + str(id) - self._execute(cur, req) - cur.fetchall() - # Special cases - if attr.get ('paused') is not None: - paused = attr.get ('paused') - if paused: - self.pauseJob (int (id)) - else: - self.startJob (int (id)) - if attr.get ('state'): - state = attr.get ('state') - if state == 'PAUSED': - self.pauseJob (int (id)) - elif state == 'WAITING': - self.startJob (int (id)) - else: - self._setJobState (int (id), state, True) - updateChildren = False - if attr.get ('parent') is not None: - self.moveJob (int (id), int (attr['parent'])) - if attr.get ('affinity') is not None: - self.setJobAffinity (int (id), attr['affinity']) - if attr.get ('priority'): - self.setJobPriority (int (id), attr['priority']) - if attr.get ('parent') is not None or attr.get ('affinity') is not None or attr.get ('priority') is not None or attr.get ('paused') is not None: - self._updateChildren (int (id)) - if attr.get ('dependencies'): - dependencies = attr['dependencies'] - if type(dependencies) is str: - # Parse the dependencies string - dependencies = re.findall ('(\d+)', dependencies) - ids = [] - for i, dep in enumerate (dependencies) : - try: - ids.append (int (dep)) - except: - pass - self.setJobDependencies (int (id), ids) - self._setJobState (int (id), None, True) - - def 
editWorkers (self, workers): - cur = self.Conn.cursor () - for name, attr in workers.iteritems (): - hasField = False - req = "UPDATE Workers SET" - for k, v in attr.iteritems(): - if k != 'affinity': - hasField = True - req += " " + k + " = " + convdata (v) - req += " WHERE name = '" + name + "'" - if hasField: - self._execute(cur, req) - cur.fetchall() - if attr.get ('affinity') is not None: - self.setWorkerAffinity (str (name), attr['affinity']) - - def setJobProgress (self, jobId, progress): - cur = self.Conn.cursor () - self._execute (cur, "UPDATE Jobs SET progress = %f WHERE id = %d" % (progress, jobId)) - - def _getDatabaseType(self): - """Get the database type.""" - return self.config.get("server", "db_type") - - def _getDatabaseTables(self): - """Return list of database tables.""" - cur = self.Conn.cursor() - db_type = self._getDatabaseType() - if db_type == "mysql": - req = "SHOW TABLES;" - else: - req = "SELECT name FROM sqlite_master WHERE type = 'table';" - self._execute(cur, req) - return cur.fetchall() - - def _getDatabaseVersion(self): - """Return database version.""" - cur = self.Conn.cursor() - tables = self._getDatabaseTables() - if (not ("Migrations",) in tables) and (not ("migrations",) in tables): - current_version = [("0000",)] - else: - req = "SELECT database_version FROM Migrations;" - self._execute(cur, req) - current_version = cur.fetchall() - return int(current_version[0][0]) - - def _getMigrationVersion(self): - """Return latest migration version.""" - return int(max([re.sub(r'_.*$', '', f) for f in - os.walk("migrations").next()[2]])) - - def _getDatabaseDataCount(self): - datacount = 0 - cur = self.Conn.cursor() - for table in self._getDatabaseTables(): - req = "SELECT COUNT(*) FROM {}".format(table[0]) - self._execute(cur, req) - fetched = cur.fetchone() - if fetched: - datacount += fetched[0] - return datacount - - def initDatabase(self): - """Initialize the database.""" - if len(self._getDatabaseTables()): - if 
self._getDatabaseDataCount() != 0: - print("The database is not empty, it will not be initialized.") - return False - print("Initializing database.") - return self.migrateDatabase(init=True) - - def migrateDatabase(self, init=False): - """Migrate the database.""" - db_type = self._getDatabaseType() - current = self._getDatabaseVersion() - target = self._getMigrationVersion() - if init: - # Init with the '0000' migration - current -= 1 - cur = self.Conn.cursor() - print("The database version is {current} and the migration target is {target}. Migrating.".format(current=current, target=target)) - while current < target: - current += 1 - migration_module_name = "{current:04d}_db_{db_type}".format(current=current, db_type=db_type) - migration_module = import_module("migrations.{}".format(migration_module_name)) - with self.Conn: - for step in migration_module.steps: - self._execute(cur, step.strip()) - if init: - with self.Conn: - for i in range(1, 64): - self._execute(cur, dedent(""" - INSERT INTO Affinities (id, name) - VALUES ('{}', '')""".format(i))) - return True - - def moveJob (self, jobId, parent): - cur = self.Conn.cursor () - self._execute (cur, "SELECT parent FROM Jobs WHERE id = %d" % jobId) - previous = cur.fetchone () - self._execute (cur, "UPDATE Jobs SET parent = %d WHERE id = %d" % (parent, jobId)) - self._updateJobCounters (previous[0]) - self._updateJobCounters (parent) - - def setJobAffinity (self, id, affinity): - cur = self.Conn.cursor () - affinities = self.getAffinityMask (affinity) - self._execute (cur, "UPDATE Jobs SET affinity = '%s', affinity_bits = %d WHERE id = %d" % (affinity, affinities, id)) - - def setJobPriority (self, id, priority): - cur = self.Conn.cursor () - priority = max (0, min (255, int (priority))) - self._execute (cur, "UPDATE Jobs SET priority = %d WHERE id = %d" % (priority, id)) - - def setJobDependencies (self, id, dependencies): - cur = self.Conn.cursor () - self._execute (cur, "DELETE FROM Dependencies WHERE job_id = 
%d" % int (id)) - for dep in dependencies: - self._execute (cur, "INSERT INTO Dependencies (job_id,dependency) " - "VALUES (%d,%d)" % (int (id), int (dep))) - self._setJobState (int (id), None, True) - - def resetJob (self, id, updateChildren = True): - ldap_perm = self._getLdapPermission("editjob") - if ldap_perm is False: - return None - if ldap_perm != "": # Not a global permission - cur = self.Conn.cursor () - self._execute(cur, "SELECT user FROM Jobs WHERE id={id}".format(id=id)) - data = cur.fetchone() - if data and data[0].lower() != self.ldap_user.lower(): - raise LdapError("User '{user}' is not allowed to reset job id='{id}'".format(user=user,id=id)) - return None - cur = self.Conn.cursor () - self._execute (cur, "UPDATE Jobs SET start_time = 0 WHERE id = %d" % id) - self._setJobState (id, "WAITING", False) - self._execute (cur, "SELECT id FROM Jobs WHERE parent = %d" % id) - for row in cur: - self.resetJob (row[0], False) - if updateChildren: - self._resetJobCounters (id) - - def resetErrorJob (self, id, updateChildren = True): - ldap_perm = self._getLdapPermission("editjob") - if ldap_perm is False: - return None - if ldap_perm != "": # Not a global permission - cur = self.Conn.cursor () - self._execute(cur, "SELECT user FROM Jobs WHERE id={id}".format(id=id)) - data = cur.fetchone() - if data and data[0].lower() != self.ldap_user.lower(): - raise LdapError("User '{user}' is not allowed to reset error job id='{id}'".format(user=user,id=id)) - return None - cur = self.Conn.cursor () - self._execute (cur, "SELECT state FROM Jobs WHERE id = %d" % id) - data = cur.fetchone () - if data is not None and data[0] == "ERROR": - self._execute (cur, "UPDATE Jobs SET start_time = 0 WHERE id = %d" % id) - self._setJobState (id, "WAITING", False) - self._execute (cur, "SELECT id FROM Jobs WHERE parent = %d" % id) - for row in cur: - self.resetErrorJob (row[0], False) - if updateChildren: - self._resetJobCounters (id) - - def startJob (self, id): - ldap_perm = 
self._getLdapPermission("editjob") - if ldap_perm is False: - return None - if ldap_perm != "": # Not a global permission - cur = self.Conn.cursor () - self._execute(cur, "SELECT user FROM Jobs WHERE id={id}".format(id=id)) - data = cur.fetchone() - if data and data[0].lower() != self.ldap_user.lower(): - raise LdapError("User '{user}' is not allowed to start job id='{id}'".format(user=user,id=id)) - return None - - cur = self.Conn.cursor () - self._execute (cur, "UPDATE Jobs SET paused = 0 WHERE id = %d" % id) - self._setJobState (id, "WAITING", False) - self._updateChildren (id) - self._updateJobCounters (id) - - def pauseJob (self, id): - ldap_perm = self._getLdapPermission("editjob") - if ldap_perm is False: - return None - if ldap_perm != "": # Not a global permission - cur = self.Conn.cursor () - self._execute(cur, "SELECT user FROM Jobs WHERE id={id}".format(id=id)) - data = cur.fetchone() - if data and data[0].lower() != self.ldap_user.lower(): - raise LdapError("User '{user}' is not allowed to pause job id='{id}'".format(user=user,id=id)) - return None - cur = self.Conn.cursor () - self._execute (cur, "UPDATE Jobs SET paused = 1 WHERE id = %d" % id) - self._setJobState (id, "PAUSED", False) - self._updateChildren (id) - self._updateJobCounters (id) - - def deleteJob (self, id, deletedJobs = [], updateCounters = True): - ldap_perm = self._getLdapPermission("deletejob") - if ldap_perm is False: - return None - if ldap_perm != "": # Not a global permission - cur = self.Conn.cursor () - self._execute(cur, "SELECT user FROM Jobs WHERE id={id}".format(id=id)) - data = cur.fetchone() - if data and data[0].lower() != self.ldap_user.lower(): - raise LdapError("User '{user}' is not allowed to delete job id='{id}'".format(user=user,id=id)) - return None - - cur = self.Conn.cursor () - self._execute (cur, "SELECT id FROM Jobs WHERE parent = %d" % id) - for row in cur: - self.deleteJob (row[0], deletedJobs, False) - parent = None - if updateCounters: - self._execute 
(cur, "SELECT parent FROM Jobs WHERE id = %d" % id) - parent = cur.fetchone () - self._execute (cur, "DELETE FROM Jobs WHERE id = %d" % id) - # clean up Events? - #self._execute (cur, "DELETE FROM Events WHERE job_id = %d" % id) - deletedJobs.append (id) - if parent is not None: - self._updateJobCounters (parent[0]) - - def newWorker (self, name): - cur = self.Conn.cursor () - self._execute (cur, "INSERT INTO Workers (name,ip,affinity, state,finished," - "error,last_job,current_event,cpu,free_memory,total_memory,active) " - "VALUES ('%s','','','WAITING',0,0,-1,-1,'[0]',0,0,1)" % name) - - def setWorkerAffinity (self, name, affinity): - cur = self.Conn.cursor () - # Delete all the worker's affinities - self._execute( cur, "DELETE FROM WorkerAffinities WHERE worker_name = '%s'" % ( name ) ) - - if len( affinity ) > 0: - - affinities = affinity.split( "\n" ) - - for index, aff in enumerate( affinities ): - - query = "INSERT INTO WorkerAffinities ( worker_name, affinity, ordering ) VALUES( '%s', %d, %d )" % ( name, self.getAffinityMask( aff ), index+1 ) - self._execute( cur, query ) - - def stopWorker (self, name): - cur = self.Conn.cursor () - self._execute (cur, "UPDATE Workers SET active = 0 WHERE name = '%s'" % name) - self._execute (cur, "SELECT job.id FROM Jobs AS job " - "INNER JOIN Workers AS worker ON " - "worker.last_job = job.id AND worker.name = job.worker " - "WHERE worker.name = '%s' AND job.state = 'WORKING'" % name) - row = cur.fetchone () - if row is not None: - self._setJobState (row[0], "WAITING", True) - - def startWorker (self, name): - cur = self.Conn.cursor () - self._execute (cur, "UPDATE Workers SET active = 1 WHERE name = '%s'" % name) - - def deleteWorker (self, name): - cur = self.Conn.cursor () - self._execute (cur, "DELETE FROM Workers WHERE name = '%s'" % name) - try: - del self.Workers[name] - except: - pass - - def _updateWorkerInfo (self, hostname, cpu, free_memory, total_memory, ip): - try: - info = self.Workers[hostname] - except: - 
info = {} - self.Workers[hostname] = info - info['ping_time'] = int (time.time ()) - info['cpu'] = cpu - info['free_memory'] = free_memory - info['total_memory'] = total_memory - info['ip'] = ip - info['timeout'] = False - return info - - # Worker heartbeats while running a job - # Lookup for worker and job - # update worker and job - def heartbeat (self, hostname, jobId, cpu, free_memory, total_memory, ip): - self.HeartBeats += 1 - current_time = int(time.time()) - cur = self.Conn.cursor () - - self._updateWorkerInfo (hostname, cpu, free_memory, total_memory, ip) - - _query = ("SELECT w.active, w.state, j.state FROM Workers as w " - "INNER JOIN Jobs AS j ON " - "j.worker = w.name AND j.id = %d AND w.last_job = %d AND " - "w.state = 'WORKING' AND j.state = 'WORKING' and j.h_paused = 0 " - "WHERE w.name = '%s'" % (jobId, jobId, hostname)) - self._execute (cur, _query) - data = cur.fetchone () - - if data: - return True - - # slow path here - # either worker doesn't exist or job is not assigned to the worker or job was pause - # get the worker active and state - self._execute (cur, "SELECT active, state FROM Workers WHERE name = '%s'" % hostname) - worker = cur.fetchone () - if worker is None: - # create worker if needed - self.newWorker (hostname) - self._execute (cur, "SELECT active, state FROM Workers WHERE name = '%s'" % hostname) - worker = cur.fetchone () - - # by default we're suspicious and we flag the worker as waiting - state = "WAITING" - job = None - if worker[0] == True: - self._execute (cur, "SELECT state, h_paused FROM Jobs WHERE id = %d AND worker = '%s'" % (jobId, hostname)) - job = cur.fetchone () - if job is not None and job[0] == "WORKING" and not job[1]: - # if the worker is active and is running the job, it's all good - # we just lost track of the worker (deleteWorker) and we just need - # to update them - self._setWorkerState (hostname, "WORKING") - return True - - # something is not right! 
- # reset the worker to WAITING - self._setWorkerState (hostname, "WAITING") - # and if the job exists, reset it to WAITING as well - if job is not None: - self._setJobState (jobId, "WAITING", True) - return False - - def pickJob (self, hostname, cpu, free_memory, total_memory, ip): - self.PickJobs += 1 - current_time = int(time.time()) - cur = self.Conn.cursor() - - self._updateWorkerInfo(hostname, cpu, free_memory, total_memory, ip) - - # get the worker active and state - self._execute(cur, "SELECT active, state, last_job FROM Workers WHERE name = '%s'" % hostname) - worker = cur.fetchone() - if worker is None: - self.newWorker(hostname) - self._execute(cur, "SELECT active, state, last_job FROM Workers WHERE name = '%s'" % hostname) - worker = cur.fetchone() - - # check the worker is not already working - # this can happen if the worker crashed and restarted before - # timeout is detected - if worker[1] == "WORKING": - # reset all working jobs assigned to this worker - self._execute(cur, "SELECT id FROM Jobs WHERE state = 'WORKING' and worker = '%s'" % hostname) - for job in cur: - self._setJobState(job[0], "WAITING", True) - - # worker is not active, drop now - if not worker[0]: - return -1,"","","",None - - # Here, we have an INNER JOIN query - # Fetch the FIRST job whose affinity match the worker's first affinity in the list (stored in WorkerAffinities) - self._execute(cur, dedent(""" - SELECT J.id, J.title, J.command, J.dir, J.user, J.environment - FROM Jobs AS J - INNER JOIN WorkerAffinities AS W - ON (( J.h_affinity & W.affinity = J.h_affinity ) & ( J.h_affinity != 0 )) - WHERE W.worker_name = '{}' - AND J.state = 'WAITING' - AND NOT J.h_paused - AND J.command != '' - ORDER BY W.ordering ASC, J.h_priority DESC, J.id ASC LIMIT 1""".format(hostname))) - - job = cur.fetchone() # This instruction is redundant because there is a LIMIT 1 in the query - - # At this point, the job will be set to None IF : - # * There is no Worker whose affinity match any Job 
affinity - # * A job has no affinity - # The former case is EXPECTED, but not the latter one - # Therefore, we need to add a query that take the first Job that has no affinity WHEN Workers are not doing anything - if job is None: - self._execute(cur, dedent(""" - SELECT id, title, command, dir, user, environment - FROM Jobs - WHERE state = 'WAITING' - AND NOT h_paused - AND affinity = '' - AND command != '' - ORDER BY h_priority DESC, id ASC LIMIT 1""")) - - job = cur.fetchone () - - # Finally, return nothing if there is no job. - if job is None: - # Update worker state - self._execute (cur, "UPDATE Workers SET state = 'WAITING' WHERE name = '{}'".format(hostname)) - return -1, "", "", "", None - - # update the job and worker - id = job[0] - - # create a new event - self._execute (cur, "INSERT INTO Events (worker, job_id, job_title, state, start, duration) " - "VALUES (%s, %d, %s, 'WORKING', %d, %d)" % - (convdata (hostname), job[0], convdata (job[1]), - current_time, 0)) - cur.fetchone () - eventid = cur.lastrowid - - self._execute (cur, "UPDATE Jobs SET worker = '%s', start_time = %d, duration = 0, progress = 0.0 " - "WHERE id = %d" % (hostname, current_time, id)) - self._execute (cur, "UPDATE Workers SET last_job = %d, state = 'WORKING', current_event = %d " - "WHERE name = '%s'" % (id, eventid, hostname)) - - self._setJobState (id, "WORKING", True) - - if job[4] != None and job[4] != "": - return job[0], job[2], job[3], job[4], job[5] - else: - return job[0], job[2], job[3], "", job[5] - - def endJob (self, hostname, jobId, errorCode, ip): - current_time = int(time.time()) - cur = self.Conn.cursor () - self._execute (cur, "SELECT active, current_event FROM Workers WHERE name = '%s'" % hostname) - worker = cur.fetchone () - if worker is None: - self.newWorker (hostname) - self._execute (cur, "SELECT active, current_event FROM Workers WHERE name = '%s'" % hostname) - worker = cur.fetchone () - - self._execute (cur, "SELECT state, start_time FROM Jobs WHERE id = 
%d AND worker = '%s' AND state = 'WORKING'" % (jobId, hostname)) - job = cur.fetchone () - if job is not None: - state = (errorCode != 0) and "ERROR" or "FINISHED" - # update event - start_time = job[1] - self._execute (cur, "UPDATE Events SET state = %s, duration = %d WHERE id = %d" % - (convdata (state), current_time-start_time, worker[1])) - self._setJobState (jobId, state, True) - self._setWorkerState (hostname, state) - - def _isJobPending (self, id): - cur = self.Conn.cursor () - self._execute (cur, "SELECT COUNT(job.id) FROM Jobs AS job " - "INNER JOIN Dependencies AS dep ON job.id = dep.dependency " - "WHERE dep.job_id = %d AND job.state != 'FINISHED'" % id) - result = cur.fetchone () - return (result[0] > 0) - - def _updateDependentJobsState (self, id): - cur = self.Conn.cursor () - self._execute (cur, "SELECT job.id FROM Jobs AS job " - "INNER JOIN Dependencies AS dep ON job.id = dep.job_id " - "WHERE dep.dependency = %d" % id) - for dependent in cur: - self._setJobState (dependent[0], None, True) - - # update the job state - # also check dependencies, mark pending in this case - # if None is passed as state, assumes previous state - def _setJobState (self, id, state, updateCounters): - current_time = int(time.time()) - cur = self.Conn.cursor () - self._execute (cur, "SELECT state, parent, user, title, id FROM Jobs WHERE id = %d" % id) - job = cur.fetchone () - if job is not None: - jobdict = self._rowAsDict (cur, job) - # passed None, use previous state - if state is None: - state = job[0] - # job set to waiting/pending, check dependencies first - if state == "WAITING" or state == "PENDING": - state = self._isJobPending (id) and "PENDING" or "WAITING" - # changing status? 
- if state != job[0]: - if state == "FINISHED" and self.NotifyFinished: - self.NotifyFinished (jobdict) - elif state == "ERROR" and self.NotifyError: - self.NotifyError (jobdict) - _set = "state = '%s'" % state - if state == "FINISHED" or state == "ERROR": - _set += ", duration = %d-start_time" % current_time - _set += ", run_done = run_done+1" - self._execute (cur, "UPDATE Jobs SET "+_set+" WHERE id = %d" % id) - self._updateDependentJobsState (id) - self._updateChildren (id) - if updateCounters: - self._updateJobCounters (job[1]) - - # recompute the whole job hierarchy counters - def _resetJobCounters (self, id, updateParent = True): - if id != 0: - cur = self.Conn.cursor () - self._execute (cur, "SELECT id FROM Jobs WHERE parent = %d" % id) - for child in cur: - self._resetJobCounters (child[0], False) - self._updateJobCounters (id, updateParent) - - # update this job and its parent counters - def _updateJobCounters (self, id, updateParent = True): - if id != 0: - current_time = int(time.time()) - cur = self.Conn.cursor () - total = 0 - working = 0 - errors = 0 - finished = 0 - total_working = 0 - total_errors = 0 - total_finished = 0 - start_time = 0 - duration = 0 - self._execute (cur, "SELECT state, total_working, total_errors, total_finished, total, start_time, duration FROM Jobs WHERE parent = %d" % id) - for job in cur: - state = job[0] - if job[4] == 0: - total += 1 - if state == 'WORKING': - working += 1 - elif state == 'ERROR': - errors += 1 - elif state == 'FINISHED': - finished += 1 - total_working += job[1] - total_errors += job[2] - total_finished += job[3] - total += job[4] - if job[5] != 0: - if start_time == 0: - start_time = job[5] - else: - start_time = min (start_time, job[5]) - if state == 'ERROR' or state == 'FINISHED': - duration += job[6] - elif state == 'WORKING': - duration += (current_time - job[5]) - total_working += working - total_errors += errors - total_finished += finished - # update job counters! 
- # note that we also update the start_time as the minimum of - # all children start times - _set = ("working = %d, errors = %d, finished = %d, " - "total_working = %d, total_errors = %d, total_finished = %d, " - "total = %d" % (working, errors, finished, total_working, - total_errors, total_finished, total)) - if total > 0: - _set += ", start_time = %d, duration = %d" % (start_time, duration) - self._execute (cur, "UPDATE Jobs SET " + _set + (" WHERE id = %d" % id)) - if total > 0: - self._execute (cur, "SELECT state, parent, user, title, id, progress FROM Jobs WHERE id = %d" % id) - oldState = cur.fetchone () - jobdict = self._rowAsDict (cur, oldState) - newState = "WAITING" - if total_errors > 0: - newState = "ERROR" - elif total_finished == total: - newState = "FINISHED" - elif total_working > 0: - newState = "WORKING" - if newState != oldState[0]: - # parent job is finished! - # update the duration now! - if newState == "WAITING" or newState == "PENDING": - newState = self._isJobPending (id) and "PENDING" or "WAITING" - self._execute (cur, "UPDATE Jobs SET state = '%s' WHERE id = %d" % (newState, id)) - # and send notification - if newState == "FINISHED" and self.NotifyFinished: - self.NotifyFinished (jobdict) - elif newState == "ERROR" and self.NotifyError: - self.NotifyError (jobdict) - # no longer pending, unpause children - if newState == "WAITING" and oldState[0] == "PENDING": - self._updateChildren (id) - # finished job, update dependent jobs - if newState == "FINISHED": - self._updateDependentJobsState (id) - progress = float (total_finished) / total - if progress != oldState[5]: - self._execute (cur, "UPDATE Jobs SET progress = %f WHERE id = %d" % (progress, id)) - - if updateParent: - self._execute (cur, "SELECT parent FROM Jobs WHERE id = %d" % id) - parent = cur.fetchone () - if parent is not None: - self._updateJobCounters (parent[0]) - - # update the worker state - # if passing an error state, increase counters - def _setWorkerState (self, 
hostname, state): - cur = self.Conn.cursor () - self._execute (cur, "SELECT state FROM Workers AS worker WHERE name = '%s'" % hostname) - worker = cur.fetchone () - if worker is not None and worker[0] != state: - if state == "ERROR": - self._execute (cur, "UPDATE Workers SET state = 'WAITING', error = error+1 WHERE name = '%s'" % hostname) - elif state == "TIMEOUT": - self._execute (cur, "UPDATE Workers SET state = 'TIMEOUT', error = error+1 WHERE name = '%s'" % hostname) - elif state == "FINISHED": - self._execute (cur, "UPDATE Workers SET state = 'WAITING', finished = finished+1 WHERE name = '%s'" % hostname) - else: - self._execute (cur, "UPDATE Workers SET state = '%s' WHERE name = '%s'" % (state, hostname)) - - # update children hierarchical values, such as h_priority, h_affinity, h_paused - def _updateChildren (self, id, parenth = None): - cur = self.Conn.cursor () - self._execute (cur, "SELECT parent, affinity_bits, priority, paused, state FROM Jobs WHERE id = %d" % id) - job = cur.fetchone () - if job: - if not parenth: - self._execute (cur, "SELECT h_depth, h_affinity, h_priority, h_paused FROM Jobs WHERE id = %d" % job[0]) - parenth = cur.fetchone () or (-1, 0, 0, False) - h_depth = parenth[0]+1 - h_affinity = parenth[1] | job[1] - h_priority = parenth[2] + (job[2] << (56-h_depth*8)) - if parenth[3] or job[3] or job[4] == "PENDING": - h_paused = 1 - else: - h_paused = 0 - self._execute (cur, "UPDATE Jobs SET h_depth = %d, h_affinity = %d, h_priority = %d, h_paused = %d " - "WHERE id = %d" % (h_depth, h_affinity, h_priority, h_paused, id)) - self._execute (cur, "SELECT id FROM Jobs WHERE parent = %d" % id) - jobh = [h_depth,h_affinity,h_priority,h_paused] - for child in cur: - self._updateChildren (child[0], jobh) - - def _update (self): - current_time = int(time.time()) - # update timeout jobs no more than every 10 seconds - if current_time - self.LastUpdate >= 10: - load = self.RunTime / (current_time - self.LastUpdate) - if self.Verbose: - print 
("[STAT] %d heartbeats, %d pickjobs, load %f" % (self.HeartBeats, self.PickJobs, load)) - self.HeartBeats = 0 - self.PickJobs = 0 - self.LastUpdate = current_time - self.RunTime = 0 - cur = self.Conn.cursor () - timeout = 60 - - # find all working jobs that are running out of time *or* - # all working jobs which worker is timing out - self._execute (cur, "SELECT id, worker FROM Jobs " - "WHERE state = 'WORKING' AND command != '' AND " - "(timeout != 0 AND %d-start_time > timeout)" % - current_time) - for job in cur: - print ("Job %d timeout!" % job[0]) - self._setJobState (job[0], "ERROR", True) - self._setWorkerState (job[1], "TIMEOUT") - - for worker in self.Workers: - info = self.Workers[worker] - if current_time - info['ping_time'] > timeout and not info['timeout']: - # worker timeout! - info['timeout'] = True - self._execute (cur, "SELECT last_job FROM Workers WHERE name = '%s' AND state = 'WORKING'" % worker) - data = cur.fetchone () - if data is not None: - self._setJobState (data[0], "WAITING", True) - if self.getWorker(worker)['state'] == "TERMINATED": - # State TERMINATED is more explicit than TIMEOUT for terminated instances - pass - else: - self._setWorkerState (worker, "TIMEOUT") - - # If cloud mode has been set via "servermode" option - if self.cloudconfig: - cloudprovider = self.config.get('server', 'servermode') - # Dynamic module loading for configured provider - self.cloudmanager = import_module('cloud.{}'.format(cloudprovider)) - waitingjobs = self.listUnpausedWaitingJobs() - if len(waitingjobs): - self._manageWorkerInstanceStart(current_time, - waitingjobs) - else: - self._manageWorkerInstanceTerminate(current_time) - - def _manageWorkerInstanceStart(self, current_time, waitingjobs): - """ - Manage worker starting. A new worker is started if the start - delay is reached, if there are more waiting jobs than available - workers and the maximum number of instances has not been reached. 
- Create an instance via the cloud provider module, create a - worker reference in the coalition DB and update the delay - timestamp. - """ - if current_time - self.lastworkerinstancestarttime < int( - self.cloudconfig.get("coalition", "workerinstancestartdelay")): - return - availableworkers = self.listWorkersByStates("STARTING", "WORKING", "WAITING") - if len(waitingjobs) > len(availableworkers) and len(availableworkers) < int( - self.cloudconfig.get("coalition", "workerinstancemax")): - name = createWorkerInstanceName( - self.cloudconfig.get("worker", "nameprefix")) - self.cloudmanager.startInstance(name, self.cloudconfig) - self.newWorker(name) - self._setWorkerState(name, 'STARTING') - self.lastworkerinstancestarttime = current_time - if self.Verbose: - print("[CLOUD] Starting new instance %s" % name) - - def _manageWorkerInstanceTerminate(self, current_time): - """ - Manage worker termination. Worker instances are terminated if - they are not working and they have been living for at least the - number of second defined by "workerinstancestopdelay". Terminate - via the cloud provider module, update the coalition DB reference. - """ - uselessworkers = self.listWorkersByStates( - "STARTING", "WAITING", "TIMEOUT") - if len(uselessworkers): - for worker in uselessworkers: - name = worker["name"] - lastworkerstarttime = self.getWorkerStartTime(name) - if lastworkerstarttime and ( - current_time - lastworkerstarttime > int( - self.cloudconfig.get("coalition", - "workerinstanceminimumlifetime"))): - self._setWorkerState(name, "TERMINATED") - self.cloudmanager.stopInstance(name, self.cloudconfig) - if self.Verbose: - print("[CLOUD] Terminating instance %s" % name) - - def requiresMigration(self): - """ - Check if database requires migration. - Returns a boolean. 
- """ - return self._getDatabaseVersion() < self._getMigrationVersion() - - def reset (self): - cur = self.Conn.cursor () - self._execute (cur, "DELETE FROM Jobs"); - self._execute (cur, "DELETE FROM Workers"); - self._execute (cur, "DELETE FROM Dependencies"); - self._execute (cur, "DELETE FROM Events"); - self._execute (cur, "DELETE FROM Affinities"); - self._execute (cur, "DELETE FROM WorkerAffinities"); - print("[SQL] Database has been reset.") - exit(0) + def __init__ (self): + self.StartTime = time.time () + self.lastworkerinstancestarttime = 0 + self.LastUpdate = 0 + self.EnterTime = 0 + self.RunTime = 0.0 + self.HeartBeats = 0 + self.PickJobs = 0 + self.Verbose = False + self.NotifyFinished = None + self.NotifyError = None + self.Workers = dict() + self.AffinityBitsToName = dict() + + tables = self._getDatabaseTables() + + if (("Workers",) in tables) or (("workers",) in tables): + self._populateWorkersCache() + + if (("Affinities",) in tables) or (("affinities",) in tables): + self._populateAffinitiesTable() + + def __enter__(self): + self.EnterTime = time.time () + self.Conn.__enter__ () + + def __exit__ (self, type, value, traceback): + self.RunTime = time.time ()-self.EnterTime + if not isinstance(value, TypeError): + self._update () + self.Conn.__exit__ (type, value, traceback) + + def _execute (self, cur, req, data=None): + now = time.time () + if data: + cur.execute (req, data) + else: + cur.execute (req) + after = time.time () + if self.Verbose: + sys.stdout.flush () + sys.stdout.write ("[SQL] (%f/%f) %s\n" % (now-self.StartTime, after-now, req)) + sys.stdout.flush () + + def _rowAsDict (self, cur, row): + if row: + result = {} + for idx, col in enumerate (cur.description): + result[col[0]] = row[idx] + return result + else: + return None + + def _populateWorkersCache(self): + """Populate cache with pre-existent data in Workers table.""" + cur = self.Conn.cursor () + self._execute (cur, "SELECT name FROM Workers") + for worker in cur: + info = {} + 
info['ping_time'] = int (time.time ()) + info['cpu'] = '' + info['free_memory'] = 0 + info['total_memory'] = 0 + info['ip'] = '' + info['timeout'] = False + self.Workers[worker[0]] = info + + def _populateAffinitiesTable(self): + """Populate Affinities table with pre-existent data having an id < 64.""" + cur = self.Conn.cursor () + with self.Conn: + affinities = {} + self._execute (cur, "SELECT id, name FROM Affinities") + for row in cur: + affinities[int (row[0])] = row[1] + for i in range (1, 64): + if not i in affinities: + self._execute (cur, "INSERT INTO Affinities (id, name) VALUES (%d,'')" % i) + + def _getLdapPermission(self, action): + """Check ldap permissions. + + If ldap is not configured, ldap_user is unset, return "". + If ldap user is allowed to do the action, return the additional sql filter + or "" if the user has global permissions. + If ldap user is not allowed, return False.""" + + if not hasattr(self, "ldap_user") or not self.ldap_user: # LDAP is not set up in configuration or ldapunsafeapi is set to True + return "" + if action == "addjob": + if self.permissions["ldaptemplateaddjobglobal"]: + return "" + elif self.permissions["ldaptemplateaddjob"]: + return "AND user='{user}'".format(user=self.ldap_user) + else: + raise LdapError("Action '{action}' is not permitted for user '{user}'".format(action=action, user=self.ldap_user)) + return False + elif action == "viewjob": + if self.permissions["ldaptemplateviewjobglobal"]: + return "" + elif self.permissions["ldaptemplateviewjob"]: + return "AND user='{user}'".format(user=self.ldap_user) + else: + raise LdapError("Action '{action}' is not permitted for user '{user}'".format(action=action, user=self.ldap_user)) + return False + elif action == "editjob": + if self.permissions["ldaptemplateeditjobglobal"]: + return "" + elif self.permissions["ldaptemplateeditjob"]: + return "AND user='{user}'".format(user=self.ldap_user) + else: + raise LdapError("Action '{action}' is not permitted for user 
'{user}'".format(action=action, user=self.ldap_user)) + return False + elif action == "deletejob": + if self.permissions["ldaptemplatedeletejobglobal"]: + return "" + elif self.permissions["ldaptemplatedeletejob"]: + return "AND user='{user}'".format(user=self.ldap_user) + else: + raise LdapError("Action '{action}' is not permitted for user '{user}'".format(action=action, user=self.ldap_user)) + return False + else: + raise LdapError("Action '{action}' is not defined for user '{user}'".format(action=action, user=self.ldap_user)) + return False + + def listJobs (self): + cur = self.Conn.cursor () + self._execute (cur, "SELECT * FROM Jobs") + for row in cur: + print (self._rowAsDict (cur, row)) + + def listUnpausedWaitingJobs(self): + """Get jobs currently waiting for a worker.""" + cur = self.Conn.cursor () + req = "SELECT * FROM Jobs WHERE state = 'WAITING' and paused = 0" + self._execute (cur, req) + return [self._rowAsDict (cur, row) for row in cur] + + def listWorkers (self): + cur = self.Conn.cursor () + self._execute (cur, "SELECT * FROM Workers") + for row in cur: + print (row) + + def listWorkersByStates(self, state, *argv): + cur = self.Conn.cursor () + req = "SELECT * FROM Workers WHERE state = '%s'" % state + if argv: + for arg in argv: + req += " OR state = '%s'" % arg + self._execute (cur, req) + return [self._rowAsDict (cur, row) for row in cur] + + def listAffinities (self): + cur = self.Conn.cursor () + self._execute (cur, "SELECT id, name FROM Affinities") + aff = {} + for row in cur: + if row[1] != "" and row[0] >= 1 and row[0] <= 63: + aff[row[1]] = (1L << (row[0]-1)) + return aff + + def getAffinities (self): + cur = self.Conn.cursor () + self._execute (cur, "SELECT id, name FROM Affinities") + aff = {} + for row in cur: + if row[0] >= 1 and row[0] <= 63: + aff[row[0]] = row[1] + return aff + + def setAffinities (self, affinities): + # reset affinities cache + self.AffinityBitsToName = {} + cur = self.Conn.cursor () + for id, affinity in 
affinities.iteritems (): + self._execute (cur, "UPDATE Affinities SET name = '%s' WHERE id = %d" % (affinity, int (id))) + + def getAffinityMask (self, affinities): + if affinities == "": + return 0 + aff = self.listAffinities () + mask = 0L + cur = self.Conn.cursor () + for affinity in affinities.split (","): + if affinity != "": + m = re.match(r"^#(\d+)$", affinity) + if m: + bit = (int(m.group (1))-1) + mask = mask | (1L << bit) + else: + mask = mask | aff[affinity] + return mask + + def getAffinityString (self, affinity_bits): + if affinity_bits == 0: + return "" + if affinity_bits in self.AffinityBitsToName: + return self.AffinityBitsToName[affinity_bits] + names = [] + aff = self.getAffinities() + for id, name in aff.iteritems (): + bit = (1L << (id-1)) + if affinity_bits & bit != 0: + if name != '': + names.append (name) + else: + names.append ("#"+ str (id)) + names.sort () + result = ",".join (names) + self.AffinityBitsToName[affinity_bits] = result + return result + + def newJob(self, parent, title, command, dir, environment, state, paused, timeout, + priority, affinity, user, url, progress_pattern, dependencies = None): + ldap_perm = self._getLdapPermission("addjob") + if ldap_perm is False: + return None + if ldap_perm != "": # User can add job owned by himself, force user value + user = self.ldap_user + cur = self.Conn.cursor() + self._execute(cur, + "SELECT h_depth, h_affinity, h_priority, h_paused, command " + "FROM Jobs " + "WHERE id = {parent} {ldap_perm}".format(parent=parent, ldap_perm=ldap_perm)) + data = cur.fetchone() + paused = 0 + if state == "PAUSED": + paused = 1; + if data is None: + data = [-1, 0, 0, 0, ''] + if data[4] != '': + print("Error: can't add job, parent {parent} is not a group".format(parent=parent)) + return None + # one depth below + h_depth = data[0]+1 + # merge parent affinities with child affinities + parent_affinities = data[1] + child_affinities = self.getAffinityMask(affinity) + h_affinity = parent_affinities | 
child_affinities + # merge priority + priority = max(0, min(255, int(priority))) + h_priority = data[2] + (priority << (56-h_depth*8)) + h_paused = data[3] or paused + + self._execute (cur, + "INSERT INTO Jobs (" + "parent, title, command, dir, environment, timeout," + "priority, affinity, affinity_bits, user, url," + "progress_pattern, paused, state, worker, h_depth," + "h_affinity, h_priority, h_paused" + ") VALUES (" + "{parent}, {title}, {command}, {directory}, {environment}, {timeout}," + "{priority}, {affinity}, {child_affinities}, {user}, {url}," + "{progress_pattern}, {paused}, {state}, {worker}, {h_depth}," + "{h_affinity}, {h_priority}, {h_paused})".format(parent=parent, title=repr(title), command=repr(command), + directory=repr(dir), environment=repr(environment), timeout=timeout, + priority=priority, affinity=repr(affinity), child_affinities=child_affinities, + user=repr(user), url=repr(url), progress_pattern=repr(progress_pattern), + paused="'"+str(paused)+"'", state="'WAITING'", worker="''", h_depth=int(h_depth), + h_affinity=h_affinity, h_priority=int(h_priority), h_paused="'"+str(h_paused)+"'")) + + data = cur.fetchone () + job = self.getJob (cur.lastrowid) + if job is not None and dependencies is not None: + self.setJobDependencies (job['id'], dependencies) + self._updateJobCounters (parent) + job['dependencies'] = dependencies + return job + + def getJob(self, id): + ldap_perm = self._getLdapPermission("viewjob") + if ldap_perm is False: + return None + print("LDAP_PERM", ldap_perm) + cur = self.Conn.cursor() + self._execute(cur, + "SELECT * FROM Jobs " + "WHERE id = {id} {ldap_perm}".format(id=id, ldap_perm=ldap_perm)) + result = self._rowAsDict(cur, cur.fetchone()) + if result is not None: + if result['paused']: + result['state'] = str("PAUSED") + if result['state'] == "WORKING" and result['total'] == 0: + current_time = int(time.time ()) + result['duration'] = current_time - result['start_time'] + result['affinity'] = 
self.getAffinityString(result['affinity_bits']) + # get dependencies + result['dependencies'] = [] + self._execute(cur, + "SELECT job.id FROM Jobs AS job " + "INNER JOIN Dependencies AS dep " + "ON job.id = dep.dependency " + "WHERE dep.job_id = {id}".format(id=id)) + for row in cur: + result['dependencies'].append(row[0]) + return result + + def getJobChildren(self, id, data): + ldap_perm = self._getLdapPermission("viewjob") + if ldap_perm is False: + return None + cur = self.Conn.cursor() + self._execute(cur, + "SELECT * FROM Jobs " + "WHERE parent = {id} {ldap_perm}".format(id=id, ldap_perm=ldap_perm)) + jobs = [] + for row in cur: + result = self._rowAsDict (cur, row) + if result and result['paused']: + result['state'] = str ("PAUSED") + if result['state'] == "WORKING" and result['total'] == 0: + current_time = int (time.time ()) + result['duration'] = current_time - result['start_time'] + result['affinity'] = self.getAffinityString (result['affinity_bits']) + jobs.append (result) + return jobs + + def getJobDependencies(self, id): + ldap_perm = self._getLdapPermission("viewjob") + if ldap_perm is False: + return None + cur = self.Conn.cursor() + self._execute (cur, + "SELECT job.* FROM Jobs AS job " + "INNER JOIN Dependencies AS dep " + "ON job.id = dep.dependency " + "WHERE dep.job_id = {id} {ldap_perm}".format(id=id, ldap_perm=ldap_perm)) + rows = cur.fetchall() + return [self._rowAsDict (cur, row) for row in rows] + + def getCountJobsWhere(self, where_clause=''): + """Get the number of matching jobs.""" + cur = self.Conn.cursor() + self._execute(cur, "SELECT COUNT(*) FROM Jobs WHERE {}".format(where_clause[0])) + return cur.fetchone()[0] + + def getJobsWhere(self, where_clause='', index_min=0, index_max=1): + """Get Jobs via a readonly SQL request.""" + cur = self.Conn.cursor() + self._execute(cur, "SELECT * FROM Jobs WHERE {} LIMIT {},{}".format(where_clause, index_min, index_max)) + return [self._rowAsDict (cur, row) for row in cur.fetchall()] + + def 
getJobsUsers(self): + """Get users.""" + cur = self.Conn.cursor() + self._execute(cur, "SELECT DISTINCT user FROM Jobs ORDER BY user") + return [self._rowAsDict (cur, row) for row in cur.fetchall()] + + def getJobsStates(self): + """Get States.""" + cur = self.Conn.cursor() + self._execute(cur, "SELECT DISTINCT state FROM Jobs") + return [self._rowAsDict (cur, row) for row in cur.fetchall()] + + def getJobsWorkers(self): + """Get States.""" + cur = self.Conn.cursor() + self._execute(cur, "SELECT DISTINCT worker FROM Jobs") + return [self._rowAsDict (cur, row) for row in cur.fetchall()] + + def getJobsPriorities(self): + """Get States.""" + cur = self.Conn.cursor() + self._execute(cur, "SELECT DISTINCT priority FROM Jobs") + return [self._rowAsDict (cur, row) for row in cur.fetchall()] + + def getJobsAffinities(self): + """Get States.""" + cur = self.Conn.cursor() + self._execute(cur, "SELECT DISTINCT affinity FROM Jobs") + return [self._rowAsDict (cur, row) for row in cur.fetchall()] + + def getChildrenDependencyIds (self, id): + cur = self.Conn.cursor () + self._execute (cur, "SELECT job.id AS id, dep.dependency AS dependency FROM Dependencies AS dep " + "INNER JOIN Jobs AS job ON job.id = dep.job_id " + " WHERE job.parent = %d" % id) + + def getChildrenDependencyIds(self, id): + cur = self.Conn.cursor() + self._execute(cur, """ + SELECT job.id AS id, dep.dependency AS dependency + FROM Dependencies AS dep + INNER JOIN Jobs AS job + ON job.id = dep.job_id + WHERE job.parent = {id} + """.format(id=id)) + rows = cur.fetchall() + return [self._rowAsDict(cur, row) for row in rows] + + def getWorker (self, hostname): + cur = self.Conn.cursor () + self._execute (cur, "SELECT * FROM Workers WHERE name = '%s'" % hostname) + worker = self._rowAsDict (cur, cur.fetchone ()) + try: + info = self.Workers[hostname] + worker['ping_time'] = info['ping_time'] + worker['cpu'] = info['cpu'] + worker['free_memory'] = info['free_memory'] + worker['total_memory'] = info['total_memory'] 
+ except: + pass + + self._execute (cur, "SELECT affinity FROM WorkerAffinities WHERE worker_name = '%s'" % ( hostname ) ) + affinities = [] + + data = cur.fetchone() + + if data is None: + worker['affinity'] = "" + return worker + + for data in cur: + affinities.append( self.getAffinityString( data[0] ) ) + + worker['affinity'] = "\n".join( affinities ) + return worker + + def getWorkerStartTime(self, name): + """Get the number of seconds since epoch.""" + cur = self.Conn.cursor () + self._execute(cur, "SELECT start_time FROM Workers WHERE name = '%s'" % name) + db_type = self._getDatabaseType() + if db_type == "mysql": + start_time = cur.fetchone()[0].timetuple() + else: + start_time = time.strptime(cur.fetchone()[0], '%Y-%m-%d %H:%M:%S') + return time.mktime(start_time) + + def getWorkers (self): + cur = self.Conn.cursor () + self._execute (cur, "SELECT * FROM Workers") + workers = [] + for row in cur: + worker = self._rowAsDict (cur, row) + try: + info = self.Workers[worker['name']] + worker['ping_time'] = info['ping_time'] + worker['cpu'] = info['cpu'] + worker['free_memory'] = info['free_memory'] + worker['total_memory'] = info['total_memory'] + except: + pass + + req = self.Conn.cursor() + self._execute( req, "SELECT affinity FROM WorkerAffinities WHERE worker_name = '%s'" % ( worker['name'] ) ) + affinities = [] + + for d in req: + + affinities.append( self.getAffinityString( d[0] ) ) + + worker['affinity'] = "\n".join( affinities ) + worker['start_time'] = self.getWorkerStartTime(worker['name']) + workers.append (worker) + return workers + + def getEvents (self, job, worker, howlong): + cur = self.Conn.cursor() + req = "SELECT * FROM Events WHERE start > %d" % (int(time.time())-howlong) + if worker: + req += " AND worker=%s" % convdata (worker) + if job > 0: + req += " AND job_id=%d" % job + self._execute (cur, req); + return [self._rowAsDict (cur, row) for row in cur.fetchall ()] + + def editJobs (self, jobs): + ldap_perm = 
self._getLdapPermission("editjob") + if ldap_perm is False: + return None + cur = self.Conn.cursor () + for id, attr in jobs.iteritems (): + if attr.has_key("user") and attr["user"].lower() != self.ldap_user.lower() and ldap_perm != "": + # User has no global permission, so he can change his own jobs only + raise LdapError("User '{user}' is not allowed to change job id='{id}' user attribute to '{user_attr}'".format( + user=self.ldap_user, id=id, user_attr=attr["user"])) + break + toUpdate = [k+"="+convdata(v) for k,v in attr.iteritems() + if k != 'dependencies' and k != 'affinity' and k != 'priority' and + k != 'state' and k != 'parent'] + if toUpdate: + req = "UPDATE Jobs SET " + ",".join (toUpdate) + " WHERE id=" + str(id) + self._execute(cur, req) + cur.fetchall() + # Special cases + if attr.get ('paused') is not None: + paused = attr.get ('paused') + if paused: + self.pauseJob (int (id)) + else: + self.startJob (int (id)) + if attr.get ('state'): + state = attr.get ('state') + if state == 'PAUSED': + self.pauseJob (int (id)) + elif state == 'WAITING': + self.startJob (int (id)) + else: + self._setJobState (int (id), state, True) + updateChildren = False + if attr.get ('parent') is not None: + self.moveJob (int (id), int (attr['parent'])) + if attr.get ('affinity') is not None: + self.setJobAffinity (int (id), attr['affinity']) + if attr.get ('priority'): + self.setJobPriority (int (id), attr['priority']) + if attr.get ('parent') is not None or attr.get ('affinity') is not None or attr.get ('priority') is not None or attr.get ('paused') is not None: + self._updateChildren (int (id)) + if attr.get ('dependencies'): + dependencies = attr['dependencies'] + if type(dependencies) is str: + # Parse the dependencies string + dependencies = re.findall ('(\d+)', dependencies) + ids = [] + for i, dep in enumerate (dependencies) : + try: + ids.append (int (dep)) + except: + pass + self.setJobDependencies (int (id), ids) + self._setJobState (int (id), None, True) + + def 
editWorkers (self, workers): + cur = self.Conn.cursor () + for name, attr in workers.iteritems (): + hasField = False + req = "UPDATE Workers SET" + for k, v in attr.iteritems(): + if k != 'affinity': + hasField = True + req += " " + k + " = " + convdata (v) + req += " WHERE name = '" + name + "'" + if hasField: + self._execute(cur, req) + cur.fetchall() + if attr.get ('affinity') is not None: + self.setWorkerAffinity (str (name), attr['affinity']) + + def setJobProgress (self, jobId, progress): + cur = self.Conn.cursor () + self._execute (cur, "UPDATE Jobs SET progress = %f WHERE id = %d" % (progress, jobId)) + + def _getDatabaseType(self): + """Get the database type.""" + return self.config.get("server", "db_type") + + def _getDatabaseTables(self): + """Return list of database tables.""" + cur = self.Conn.cursor() + db_type = self._getDatabaseType() + if db_type == "mysql": + req = "SHOW TABLES;" + else: + req = "SELECT name FROM sqlite_master WHERE type = 'table';" + self._execute(cur, req) + return cur.fetchall() + + def _getDatabaseVersion(self): + """Return database version.""" + cur = self.Conn.cursor() + tables = self._getDatabaseTables() + if (not ("Migrations",) in tables) and (not ("migrations",) in tables): + current_version = [("0000",)] + else: + req = "SELECT database_version FROM Migrations;" + self._execute(cur, req) + current_version = cur.fetchall() + return int(current_version[0][0]) + + def _getMigrationVersion(self): + """Return latest migration version.""" + return int(max([re.sub(r'_.*$', '', f) for f in + os.walk("migrations").next()[2]])) + + def _getDatabaseDataCount(self): + datacount = 0 + cur = self.Conn.cursor() + for table in self._getDatabaseTables(): + req = "SELECT COUNT(*) FROM {}".format(table[0]) + self._execute(cur, req) + fetched = cur.fetchone() + if fetched: + datacount += fetched[0] + return datacount + + def initDatabase(self): + """Initialize the database.""" + if len(self._getDatabaseTables()): + if 
self._getDatabaseDataCount() != 0: + print("The database is not empty, it will not be initialized.") + return False + print("Initializing database.") + return self.migrateDatabase(init=True) + + def migrateDatabase(self, init=False): + """Migrate the database.""" + db_type = self._getDatabaseType() + current = self._getDatabaseVersion() + target = self._getMigrationVersion() + if init: + # Init with the '0000' migration + current -= 1 + cur = self.Conn.cursor() + print("The database version is {current} and the migration target is {target}. Migrating.".format(current=current, target=target)) + while current < target: + current += 1 + migration_module_name = "{current:04d}_db_{db_type}".format(current=current, db_type=db_type) + migration_module = import_module("migrations.{}".format(migration_module_name)) + with self.Conn: + for step in migration_module.steps: + self._execute(cur, step.strip()) + if init: + with self.Conn: + for i in range(1, 64): + self._execute(cur, dedent(""" + INSERT INTO Affinities (id, name) + VALUES ('{}', '')""".format(i))) + return True + + def moveJob (self, jobId, parent): + cur = self.Conn.cursor () + self._execute (cur, "SELECT parent FROM Jobs WHERE id = %d" % jobId) + previous = cur.fetchone () + self._execute (cur, "UPDATE Jobs SET parent = %d WHERE id = %d" % (parent, jobId)) + self._updateJobCounters (previous[0]) + self._updateJobCounters (parent) + + def setJobAffinity (self, id, affinity): + cur = self.Conn.cursor () + affinities = self.getAffinityMask (affinity) + self._execute (cur, "UPDATE Jobs SET affinity = '%s', affinity_bits = %d WHERE id = %d" % (affinity, affinities, id)) + + def setJobPriority (self, id, priority): + cur = self.Conn.cursor () + priority = max (0, min (255, int (priority))) + self._execute (cur, "UPDATE Jobs SET priority = %d WHERE id = %d" % (priority, id)) + + def setJobDependencies (self, id, dependencies): + cur = self.Conn.cursor () + self._execute (cur, "DELETE FROM Dependencies WHERE job_id = 
%d" % int (id)) + for dep in dependencies: + self._execute (cur, "INSERT INTO Dependencies (job_id,dependency) " + "VALUES (%d,%d)" % (int (id), int (dep))) + self._setJobState (int (id), None, True) + + def resetJob (self, id, updateChildren = True): + ldap_perm = self._getLdapPermission("editjob") + if ldap_perm is False: + return None + if ldap_perm != "": # Not a global permission + cur = self.Conn.cursor () + self._execute(cur, "SELECT user FROM Jobs WHERE id={id}".format(id=id)) + data = cur.fetchone() + if data and data[0].lower() != self.ldap_user.lower(): + raise LdapError("User '{user}' is not allowed to reset job id='{id}'".format(user=user,id=id)) + return None + cur = self.Conn.cursor () + self._execute (cur, "UPDATE Jobs SET start_time = 0 WHERE id = %d" % id) + self._setJobState (id, "WAITING", False) + self._execute (cur, "SELECT id FROM Jobs WHERE parent = %d" % id) + for row in cur: + self.resetJob (row[0], False) + if updateChildren: + self._resetJobCounters (id) + + def resetErrorJob (self, id, updateChildren = True): + ldap_perm = self._getLdapPermission("editjob") + if ldap_perm is False: + return None + if ldap_perm != "": # Not a global permission + cur = self.Conn.cursor () + self._execute(cur, "SELECT user FROM Jobs WHERE id={id}".format(id=id)) + data = cur.fetchone() + if data and data[0].lower() != self.ldap_user.lower(): + raise LdapError("User '{user}' is not allowed to reset error job id='{id}'".format(user=user,id=id)) + return None + cur = self.Conn.cursor () + self._execute (cur, "SELECT state FROM Jobs WHERE id = %d" % id) + data = cur.fetchone () + if data is not None and data[0] == "ERROR": + self._execute (cur, "UPDATE Jobs SET start_time = 0 WHERE id = %d" % id) + self._setJobState (id, "WAITING", False) + self._execute (cur, "SELECT id FROM Jobs WHERE parent = %d" % id) + for row in cur: + self.resetErrorJob (row[0], False) + if updateChildren: + self._resetJobCounters (id) + + def startJob (self, id): + ldap_perm = 
self._getLdapPermission("editjob") + if ldap_perm is False: + return None + if ldap_perm != "": # Not a global permission + cur = self.Conn.cursor () + self._execute(cur, "SELECT user FROM Jobs WHERE id={id}".format(id=id)) + data = cur.fetchone() + if data and data[0].lower() != self.ldap_user.lower(): + raise LdapError("User '{user}' is not allowed to start job id='{id}'".format(user=user,id=id)) + return None + + cur = self.Conn.cursor () + self._execute (cur, "UPDATE Jobs SET paused = 0 WHERE id = %d" % id) + self._setJobState (id, "WAITING", False) + self._updateChildren (id) + self._updateJobCounters (id) + + def pauseJob (self, id): + ldap_perm = self._getLdapPermission("editjob") + if ldap_perm is False: + return None + if ldap_perm != "": # Not a global permission + cur = self.Conn.cursor () + self._execute(cur, "SELECT user FROM Jobs WHERE id={id}".format(id=id)) + data = cur.fetchone() + if data and data[0].lower() != self.ldap_user.lower(): + raise LdapError("User '{user}' is not allowed to pause job id='{id}'".format(user=user,id=id)) + return None + cur = self.Conn.cursor () + self._execute (cur, "UPDATE Jobs SET paused = 1 WHERE id = %d" % id) + self._setJobState (id, "PAUSED", False) + self._updateChildren (id) + self._updateJobCounters (id) + + def deleteJob (self, id, deletedJobs = [], updateCounters = True): + ldap_perm = self._getLdapPermission("deletejob") + if ldap_perm is False: + return None + if ldap_perm != "": # Not a global permission + cur = self.Conn.cursor () + self._execute(cur, "SELECT user FROM Jobs WHERE id={id}".format(id=id)) + data = cur.fetchone() + if data and data[0].lower() != self.ldap_user.lower(): + raise LdapError("User '{user}' is not allowed to delete job id='{id}'".format(user=user,id=id)) + return None + + cur = self.Conn.cursor () + self._execute (cur, "SELECT id FROM Jobs WHERE parent = %d" % id) + for row in cur: + self.deleteJob (row[0], deletedJobs, False) + parent = None + if updateCounters: + self._execute 
(cur, "SELECT parent FROM Jobs WHERE id = %d" % id) + parent = cur.fetchone () + self._execute (cur, "DELETE FROM Jobs WHERE id = %d" % id) + # clean up Events? + #self._execute (cur, "DELETE FROM Events WHERE job_id = %d" % id) + deletedJobs.append (id) + if parent is not None: + self._updateJobCounters (parent[0]) + + def newWorker (self, name): + cur = self.Conn.cursor () + self._execute (cur, "INSERT INTO Workers (name,ip,affinity, state,finished," + "error,last_job,current_event,cpu,free_memory,total_memory,active) " + "VALUES ('%s','','','WAITING',0,0,-1,-1,'[0]',0,0,1)" % name) + + def setWorkerAffinity (self, name, affinity): + cur = self.Conn.cursor () + # Delete all the worker's affinities + self._execute( cur, "DELETE FROM WorkerAffinities WHERE worker_name = '%s'" % ( name ) ) + + if len( affinity ) > 0: + + affinities = affinity.split( "\n" ) + + for index, aff in enumerate( affinities ): + + query = "INSERT INTO WorkerAffinities ( worker_name, affinity, ordering ) VALUES( '%s', %d, %d )" % ( name, self.getAffinityMask( aff ), index+1 ) + self._execute( cur, query ) + + def stopWorker (self, name): + cur = self.Conn.cursor () + self._execute (cur, "UPDATE Workers SET active = 0 WHERE name = '%s'" % name) + self._execute (cur, "SELECT job.id FROM Jobs AS job " + "INNER JOIN Workers AS worker ON " + "worker.last_job = job.id AND worker.name = job.worker " + "WHERE worker.name = '%s' AND job.state = 'WORKING'" % name) + row = cur.fetchone () + if row is not None: + self._setJobState (row[0], "WAITING", True) + + def startWorker (self, name): + cur = self.Conn.cursor () + self._execute (cur, "UPDATE Workers SET active = 1 WHERE name = '%s'" % name) + + def deleteWorker (self, name): + cur = self.Conn.cursor () + self._execute (cur, "DELETE FROM Workers WHERE name = '%s'" % name) + try: + del self.Workers[name] + except: + pass + + def _updateWorkerInfo (self, hostname, cpu, free_memory, total_memory, ip): + try: + info = self.Workers[hostname] + except: + 
info = {} + self.Workers[hostname] = info + info['ping_time'] = int (time.time ()) + info['cpu'] = cpu + info['free_memory'] = free_memory + info['total_memory'] = total_memory + info['ip'] = ip + info['timeout'] = False + return info + + # Worker heartbeats while running a job + # Lookup for worker and job + # update worker and job + def heartbeat (self, hostname, jobId, cpu, free_memory, total_memory, ip): + self.HeartBeats += 1 + current_time = int(time.time()) + cur = self.Conn.cursor () + + self._updateWorkerInfo (hostname, cpu, free_memory, total_memory, ip) + + _query = ("SELECT w.active, w.state, j.state FROM Workers as w " + "INNER JOIN Jobs AS j ON " + "j.worker = w.name AND j.id = %d AND w.last_job = %d AND " + "w.state = 'WORKING' AND j.state = 'WORKING' and j.h_paused = 0 " + "WHERE w.name = '%s'" % (jobId, jobId, hostname)) + self._execute (cur, _query) + data = cur.fetchone () + + if data: + return True + + # slow path here + # either worker doesn't exist or job is not assigned to the worker or job was pause + # get the worker active and state + self._execute (cur, "SELECT active, state FROM Workers WHERE name = '%s'" % hostname) + worker = cur.fetchone () + if worker is None: + # create worker if needed + self.newWorker (hostname) + self._execute (cur, "SELECT active, state FROM Workers WHERE name = '%s'" % hostname) + worker = cur.fetchone () + + # by default we're suspicious and we flag the worker as waiting + state = "WAITING" + job = None + if worker[0] == True: + self._execute (cur, "SELECT state, h_paused FROM Jobs WHERE id = %d AND worker = '%s'" % (jobId, hostname)) + job = cur.fetchone () + if job is not None and job[0] == "WORKING" and not job[1]: + # if the worker is active and is running the job, it's all good + # we just lost track of the worker (deleteWorker) and we just need + # to update them + self._setWorkerState (hostname, "WORKING") + return True + + # something is not right! 
+ # reset the worker to WAITING + self._setWorkerState (hostname, "WAITING") + # and if the job exists, reset it to WAITING as well + if job is not None: + self._setJobState (jobId, "WAITING", True) + return False + + def pickJob (self, hostname, cpu, free_memory, total_memory, ip): + self.PickJobs += 1 + current_time = int(time.time()) + cur = self.Conn.cursor() + + self._updateWorkerInfo(hostname, cpu, free_memory, total_memory, ip) + + # get the worker active and state + self._execute(cur, "SELECT active, state, last_job FROM Workers WHERE name = '%s'" % hostname) + worker = cur.fetchone() + if worker is None: + self.newWorker(hostname) + self._execute(cur, "SELECT active, state, last_job FROM Workers WHERE name = '%s'" % hostname) + worker = cur.fetchone() + + # check the worker is not already working + # this can happen if the worker crashed and restarted before + # timeout is detected + if worker[1] == "WORKING": + # reset all working jobs assigned to this worker + self._execute(cur, "SELECT id FROM Jobs WHERE state = 'WORKING' and worker = '%s'" % hostname) + for job in cur: + self._setJobState(job[0], "WAITING", True) + + # worker is not active, drop now + if not worker[0]: + return -1,"","","",None + + # Here, we have an INNER JOIN query + # Fetch the FIRST job whose affinity match the worker's first affinity in the list (stored in WorkerAffinities) + self._execute(cur, dedent(""" + SELECT J.id, J.title, J.command, J.dir, J.user, J.environment + FROM Jobs AS J + INNER JOIN WorkerAffinities AS W + ON (( J.h_affinity & W.affinity = J.h_affinity ) & ( J.h_affinity != 0 )) + WHERE W.worker_name = '{}' + AND J.state = 'WAITING' + AND NOT J.h_paused + AND J.command != '' + ORDER BY W.ordering ASC, J.h_priority DESC, J.id ASC LIMIT 1""".format(hostname))) + + job = cur.fetchone() # This instruction is redundant because there is a LIMIT 1 in the query + + # At this point, the job will be set to None IF : + # * There is no Worker whose affinity match any Job 
affinity + # * A job has no affinity + # The former case is EXPECTED, but not the latter one + # Therefore, we need to add a query that take the first Job that has no affinity WHEN Workers are not doing anything + if job is None: + self._execute(cur, dedent(""" + SELECT id, title, command, dir, user, environment + FROM Jobs + WHERE state = 'WAITING' + AND NOT h_paused + AND affinity = '' + AND command != '' + ORDER BY h_priority DESC, id ASC LIMIT 1""")) + + job = cur.fetchone () + + # Finally, return nothing if there is no job. + if job is None: + # Update worker state + self._execute (cur, "UPDATE Workers SET state = 'WAITING' WHERE name = '{}'".format(hostname)) + return -1, "", "", "", None + + # update the job and worker + id = job[0] + + # create a new event + self._execute (cur, "INSERT INTO Events (worker, job_id, job_title, state, start, duration) " + "VALUES (%s, %d, %s, 'WORKING', %d, %d)" % + (convdata (hostname), job[0], convdata (job[1]), + current_time, 0)) + cur.fetchone () + eventid = cur.lastrowid + + self._execute (cur, "UPDATE Jobs SET worker = '%s', start_time = %d, duration = 0, progress = 0.0 " + "WHERE id = %d" % (hostname, current_time, id)) + self._execute (cur, "UPDATE Workers SET last_job = %d, state = 'WORKING', current_event = %d " + "WHERE name = '%s'" % (id, eventid, hostname)) + + self._setJobState (id, "WORKING", True) + + if job[4] != None and job[4] != "": + return job[0], job[2], job[3], job[4], job[5] + else: + return job[0], job[2], job[3], "", job[5] + + def endJob (self, hostname, jobId, errorCode, ip): + current_time = int(time.time()) + cur = self.Conn.cursor () + self._execute (cur, "SELECT active, current_event FROM Workers WHERE name = '%s'" % hostname) + worker = cur.fetchone () + if worker is None: + self.newWorker (hostname) + self._execute (cur, "SELECT active, current_event FROM Workers WHERE name = '%s'" % hostname) + worker = cur.fetchone () + + self._execute (cur, "SELECT state, start_time FROM Jobs WHERE id = 
%d AND worker = '%s' AND state = 'WORKING'" % (jobId, hostname)) + job = cur.fetchone () + if job is not None: + state = (errorCode != 0) and "ERROR" or "FINISHED" + # update event + start_time = job[1] + self._execute (cur, "UPDATE Events SET state = %s, duration = %d WHERE id = %d" % + (convdata (state), current_time-start_time, worker[1])) + self._setJobState (jobId, state, True) + self._setWorkerState (hostname, state) + + def _isJobPending (self, id): + cur = self.Conn.cursor () + self._execute (cur, "SELECT COUNT(job.id) FROM Jobs AS job " + "INNER JOIN Dependencies AS dep ON job.id = dep.dependency " + "WHERE dep.job_id = %d AND job.state != 'FINISHED'" % id) + result = cur.fetchone () + return (result[0] > 0) + + def _updateDependentJobsState (self, id): + cur = self.Conn.cursor () + self._execute (cur, "SELECT job.id FROM Jobs AS job " + "INNER JOIN Dependencies AS dep ON job.id = dep.job_id " + "WHERE dep.dependency = %d" % id) + for dependent in cur: + self._setJobState (dependent[0], None, True) + + # update the job state + # also check dependencies, mark pending in this case + # if None is passed as state, assumes previous state + def _setJobState (self, id, state, updateCounters): + current_time = int(time.time()) + cur = self.Conn.cursor () + self._execute (cur, "SELECT state, parent, user, title, id FROM Jobs WHERE id = %d" % id) + job = cur.fetchone () + if job is not None: + jobdict = self._rowAsDict (cur, job) + # passed None, use previous state + if state is None: + state = job[0] + # job set to waiting/pending, check dependencies first + if state == "WAITING" or state == "PENDING": + state = self._isJobPending (id) and "PENDING" or "WAITING" + # changing status? 
+ if state != job[0]: + if state == "FINISHED" and self.NotifyFinished: + self.NotifyFinished (jobdict) + elif state == "ERROR" and self.NotifyError: + self.NotifyError (jobdict) + _set = "state = '%s'" % state + if state == "FINISHED" or state == "ERROR": + _set += ", duration = %d-start_time" % current_time + _set += ", run_done = run_done+1" + self._execute (cur, "UPDATE Jobs SET "+_set+" WHERE id = %d" % id) + self._updateDependentJobsState (id) + self._updateChildren (id) + if updateCounters: + self._updateJobCounters (job[1]) + + # recompute the whole job hierarchy counters + def _resetJobCounters (self, id, updateParent = True): + if id != 0: + cur = self.Conn.cursor () + self._execute (cur, "SELECT id FROM Jobs WHERE parent = %d" % id) + for child in cur: + self._resetJobCounters (child[0], False) + self._updateJobCounters (id, updateParent) + + # update this job and its parent counters + def _updateJobCounters (self, id, updateParent = True): + if id != 0: + current_time = int(time.time()) + cur = self.Conn.cursor () + total = 0 + working = 0 + errors = 0 + finished = 0 + total_working = 0 + total_errors = 0 + total_finished = 0 + start_time = 0 + duration = 0 + self._execute (cur, "SELECT state, total_working, total_errors, total_finished, total, start_time, duration FROM Jobs WHERE parent = %d" % id) + for job in cur: + state = job[0] + if job[4] == 0: + total += 1 + if state == 'WORKING': + working += 1 + elif state == 'ERROR': + errors += 1 + elif state == 'FINISHED': + finished += 1 + total_working += job[1] + total_errors += job[2] + total_finished += job[3] + total += job[4] + if job[5] != 0: + if start_time == 0: + start_time = job[5] + else: + start_time = min (start_time, job[5]) + if state == 'ERROR' or state == 'FINISHED': + duration += job[6] + elif state == 'WORKING': + duration += (current_time - job[5]) + total_working += working + total_errors += errors + total_finished += finished + # update job counters! 
+ # note that we also update the start_time as the minimum of + # all children start times + _set = ("working = %d, errors = %d, finished = %d, " + "total_working = %d, total_errors = %d, total_finished = %d, " + "total = %d" % (working, errors, finished, total_working, + total_errors, total_finished, total)) + if total > 0: + _set += ", start_time = %d, duration = %d" % (start_time, duration) + self._execute (cur, "UPDATE Jobs SET " + _set + (" WHERE id = %d" % id)) + if total > 0: + self._execute (cur, "SELECT state, parent, user, title, id, progress FROM Jobs WHERE id = %d" % id) + oldState = cur.fetchone () + jobdict = self._rowAsDict (cur, oldState) + newState = "WAITING" + if total_errors > 0: + newState = "ERROR" + elif total_finished == total: + newState = "FINISHED" + elif total_working > 0: + newState = "WORKING" + if newState != oldState[0]: + # parent job is finished! + # update the duration now! + if newState == "WAITING" or newState == "PENDING": + newState = self._isJobPending (id) and "PENDING" or "WAITING" + self._execute (cur, "UPDATE Jobs SET state = '%s' WHERE id = %d" % (newState, id)) + # and send notification + if newState == "FINISHED" and self.NotifyFinished: + self.NotifyFinished (jobdict) + elif newState == "ERROR" and self.NotifyError: + self.NotifyError (jobdict) + # no longer pending, unpause children + if newState == "WAITING" and oldState[0] == "PENDING": + self._updateChildren (id) + # finished job, update dependent jobs + if newState == "FINISHED": + self._updateDependentJobsState (id) + progress = float (total_finished) / total + if progress != oldState[5]: + self._execute (cur, "UPDATE Jobs SET progress = %f WHERE id = %d" % (progress, id)) + + if updateParent: + self._execute (cur, "SELECT parent FROM Jobs WHERE id = %d" % id) + parent = cur.fetchone () + if parent is not None: + self._updateJobCounters (parent[0]) + + # update the worker state + # if passing an error state, increase counters + def _setWorkerState (self, 
hostname, state): + cur = self.Conn.cursor () + self._execute (cur, "SELECT state FROM Workers AS worker WHERE name = '%s'" % hostname) + worker = cur.fetchone () + if worker is not None and worker[0] != state: + if state == "ERROR": + self._execute (cur, "UPDATE Workers SET state = 'WAITING', error = error+1 WHERE name = '%s'" % hostname) + elif state == "TIMEOUT": + self._execute (cur, "UPDATE Workers SET state = 'TIMEOUT', error = error+1 WHERE name = '%s'" % hostname) + elif state == "FINISHED": + self._execute (cur, "UPDATE Workers SET state = 'WAITING', finished = finished+1 WHERE name = '%s'" % hostname) + else: + self._execute (cur, "UPDATE Workers SET state = '%s' WHERE name = '%s'" % (state, hostname)) + + # update children hierarchical values, such as h_priority, h_affinity, h_paused + def _updateChildren (self, id, parenth = None): + cur = self.Conn.cursor () + self._execute (cur, "SELECT parent, affinity_bits, priority, paused, state FROM Jobs WHERE id = %d" % id) + job = cur.fetchone () + if job: + if not parenth: + self._execute (cur, "SELECT h_depth, h_affinity, h_priority, h_paused FROM Jobs WHERE id = %d" % job[0]) + parenth = cur.fetchone () or (-1, 0, 0, False) + h_depth = parenth[0]+1 + h_affinity = parenth[1] | job[1] + h_priority = parenth[2] + (job[2] << (56-h_depth*8)) + if parenth[3] or job[3] or job[4] == "PENDING": + h_paused = 1 + else: + h_paused = 0 + self._execute (cur, "UPDATE Jobs SET h_depth = %d, h_affinity = %d, h_priority = %d, h_paused = %d " + "WHERE id = %d" % (h_depth, h_affinity, h_priority, h_paused, id)) + self._execute (cur, "SELECT id FROM Jobs WHERE parent = %d" % id) + jobh = [h_depth,h_affinity,h_priority,h_paused] + for child in cur: + self._updateChildren (child[0], jobh) + + def _update (self): + current_time = int(time.time()) + # update timeout jobs no more than every 10 seconds + if current_time - self.LastUpdate >= 10: + load = self.RunTime / (current_time - self.LastUpdate) + if self.Verbose: + print 
("[STAT] %d heartbeats, %d pickjobs, load %f" % (self.HeartBeats, self.PickJobs, load)) + self.HeartBeats = 0 + self.PickJobs = 0 + self.LastUpdate = current_time + self.RunTime = 0 + cur = self.Conn.cursor () + timeout = 60 + + # find all working jobs that are running out of time *or* + # all working jobs which worker is timing out + self._execute (cur, "SELECT id, worker FROM Jobs " + "WHERE state = 'WORKING' AND command != '' AND " + "(timeout != 0 AND %d-start_time > timeout)" % + current_time) + for job in cur: + print ("Job %d timeout!" % job[0]) + self._setJobState (job[0], "ERROR", True) + self._setWorkerState (job[1], "TIMEOUT") + + for worker in self.Workers: + info = self.Workers[worker] + if current_time - info['ping_time'] > timeout and not info['timeout']: + # worker timeout! + info['timeout'] = True + self._execute (cur, "SELECT last_job FROM Workers WHERE name = '%s' AND state = 'WORKING'" % worker) + data = cur.fetchone () + if data is not None: + self._setJobState (data[0], "WAITING", True) + if self.getWorker(worker)['state'] == "TERMINATED": + # State TERMINATED is more explicit than TIMEOUT for terminated instances + pass + else: + self._setWorkerState (worker, "TIMEOUT") + + # If cloud mode has been set via "servermode" option + if self.cloudconfig: + cloudprovider = self.config.get('server', 'servermode') + # Dynamic module loading for configured provider + self.cloudmanager = import_module('cloud.{}'.format(cloudprovider)) + waitingjobs = self.listUnpausedWaitingJobs() + if len(waitingjobs): + self._manageWorkerInstanceStart(current_time, + waitingjobs) + else: + self._manageWorkerInstanceTerminate(current_time) + + def _manageWorkerInstanceStart(self, current_time, waitingjobs): + """ + Manage worker starting. A new worker is started if the start + delay is reached, if there are more waiting jobs than available + workers and the maximum number of instances has not been reached. 
+ Create an instance via the cloud provider module, create a + worker reference in the coalition DB and update the delay + timestamp. + """ + if current_time - self.lastworkerinstancestarttime < int( + self.cloudconfig.get("coalition", "workerinstancestartdelay")): + return + availableworkers = self.listWorkersByStates("STARTING", "WORKING", "WAITING") + if len(waitingjobs) > len(availableworkers) and len(availableworkers) < int( + self.cloudconfig.get("coalition", "workerinstancemax")): + name = createWorkerInstanceName( + self.cloudconfig.get("worker", "nameprefix")) + self.cloudmanager.startInstance(name, self.cloudconfig) + self.newWorker(name) + self._setWorkerState(name, 'STARTING') + self.lastworkerinstancestarttime = current_time + if self.Verbose: + print("[CLOUD] Starting new instance %s" % name) + + def _manageWorkerInstanceTerminate(self, current_time): + """ + Manage worker termination. Worker instances are terminated if + they are not working and they have been living for at least the + number of second defined by "workerinstancestopdelay". Terminate + via the cloud provider module, update the coalition DB reference. + """ + uselessworkers = self.listWorkersByStates( + "STARTING", "WAITING", "TIMEOUT") + if len(uselessworkers): + for worker in uselessworkers: + name = worker["name"] + lastworkerstarttime = self.getWorkerStartTime(name) + if lastworkerstarttime and ( + current_time - lastworkerstarttime > int( + self.cloudconfig.get("coalition", + "workerinstanceminimumlifetime"))): + self._setWorkerState(name, "TERMINATED") + self.cloudmanager.stopInstance(name, self.cloudconfig) + if self.Verbose: + print("[CLOUD] Terminating instance %s" % name) + + def requiresMigration(self): + """ + Check if database requires migration. + Returns a boolean. 
+ """ + return self._getDatabaseVersion() < self._getMigrationVersion() + + def reset (self): + cur = self.Conn.cursor () + self._execute (cur, "DELETE FROM Jobs"); + self._execute (cur, "DELETE FROM Workers"); + self._execute (cur, "DELETE FROM Dependencies"); + self._execute (cur, "DELETE FROM Events"); + self._execute (cur, "DELETE FROM Affinities"); + self._execute (cur, "DELETE FROM WorkerAffinities"); + print("[SQL] Database has been reset.") + exit(0) # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/db_sqlite.py b/db_sqlite.py index 7c47a49..3c23509 100644 --- a/db_sqlite.py +++ b/db_sqlite.py @@ -5,12 +5,12 @@ class DBSQLite(DBSQL): - def __init__(self, database, **kwargs): - self.config = kwargs["config"] - self.cloudconfig = kwargs["cloudconfig"] - self.Conn = sqlite3.connect(database) + def __init__(self, database, **kwargs): + self.config = kwargs["config"] + self.cloudconfig = kwargs["cloudconfig"] + self.Conn = sqlite3.connect(database) - super(DBSQLite, self).__init__() + super(DBSQLite, self).__init__() # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/host_cpu.py b/host_cpu.py index 4164689..7b80219 100644 --- a/host_cpu.py +++ b/host_cpu.py @@ -4,40 +4,40 @@ import sys,os,re if sys.platform=="win32": - import win32pdh - import win32pdhquery - import win32pdhutil - import _winreg + import win32pdh + import win32pdhquery + import win32pdhutil + import _winreg -# Parse the registry to find the localized perf counter name +# Parse the registry to find the localized perf counter name def pdhTranslateEnglishCounter (counter): - key = _winreg.OpenKey (_winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows NT\CurrentVersion\Perflib\009") - strings = _winreg.QueryValueEx (key, 'Counter')[0] - for i in range(0,len(strings),2): - if counter == strings[i+1]: - return win32pdh.LookupPerfNameByIndex (None, int(strings[i])) - return counter + key = _winreg.OpenKey (_winreg.HKEY_LOCAL_MACHINE, 
r"SOFTWARE\Microsoft\Windows NT\CurrentVersion\Perflib\009") + strings = _winreg.QueryValueEx (key, 'Counter')[0] + for i in range(0,len(strings),2): + if counter == strings[i+1]: + return win32pdh.LookupPerfNameByIndex (None, int(strings[i])) + return counter def cpuCount(): - """Returns the number of CPUs in the system""" - num = 1 - if sys.platform == 'win32': - try: - num = int(os.environ['NUMBER_OF_PROCESSORS']) - except (ValueError, KeyError): - pass - elif sys.platform == 'darwin': - try: - num = int(os.popen('sysctl -n hw.ncpu').read()) - except ValueError: - pass - else: - try: - num = os.sysconf('SC_NPROCESSORS_ONLN') - except (ValueError, OSError, AttributeError): - pass + """Returns the number of CPUs in the system""" + num = 1 + if sys.platform == 'win32': + try: + num = int(os.environ['NUMBER_OF_PROCESSORS']) + except (ValueError, KeyError): + pass + elif sys.platform == 'darwin': + try: + num = int(os.popen('sysctl -n hw.ncpu').read()) + except ValueError: + pass + else: + try: + num = os.sysconf('SC_NPROCESSORS_ONLN') + except (ValueError, OSError, AttributeError): + pass - return num + return num gUser = 0 gNice = 0 @@ -45,7 +45,7 @@ def cpuCount(): gIdle = 0 class HostCPU: - """This class returns the per CPU""" + """This class returns the per CPU""" # def __init__(self): # if sys.platform=="win32": # self.base = win32pdh.OpenQuery() @@ -55,9 +55,9 @@ class HostCPU: # self.Counters.append (win32pdh.AddCounter(self.base, win32pdh.MakeCounterPath((None, pdhTranslateEnglishCounter ("Processor"),str(cpuid),None, -1, pdhTranslateEnglishCounter ("% Processor Time"))))) # #self.Counters.append (win32pdh.AddCounter(self.base, win32pdh.MakeCounterPath((None, "Processor",str(cpuid),None, -1, "% Processor Time")))) # win32pdh.CollectQueryData(self.base) - - def getUsage(self): - ''' Return a list with the usage of each CPU ''' + + def getUsage(self): + ''' Return a list with the usage of each CPU ''' # if sys.platform=="win32": # result = [] # 
win32pdh.CollectQueryData(self.base) @@ -68,40 +68,40 @@ def getUsage(self): # load = 0 # pass # result.append (load) -# return result +# return result # else: # result = [] -# for cpuid in range(0,cpucount): +# for cpuid in range(0,cpucount): # result.append (0) - if sys.platform!="win32" and sys.platform!="darwin": - global gUser - global gNice - global gSystem - global gIdle - user = 0 - nice = 0 - system = 0 - idle = 0 - file = open ("/proc/stat", "r") - for line in file: - words = re.split ('\W+', line) - if len(words) >= 5: - if words[0] == 'cpu': - user = int(words[1]) - nice = int(words[2]) - system = int(words[3]) - idle = int(words[4]) - usage = (user-gUser)+(nice-gNice)+(system-gSystem) - total = usage+(idle-gIdle) - gUser = user - gNice = nice - gSystem = system - gIdle = idle - if total > 0: - return [100*usage/total] - return [0] - - return [0] + if sys.platform!="win32" and sys.platform!="darwin": + global gUser + global gNice + global gSystem + global gIdle + user = 0 + nice = 0 + system = 0 + idle = 0 + file = open ("/proc/stat", "r") + for line in file: + words = re.split ('\W+', line) + if len(words) >= 5: + if words[0] == 'cpu': + user = int(words[1]) + nice = int(words[2]) + system = int(words[3]) + idle = int(words[4]) + usage = (user-gUser)+(nice-gNice)+(system-gSystem) + total = usage+(idle-gIdle) + gUser = user + gNice = nice + gSystem = system + gIdle = idle + if total > 0: + return [100*usage/total] + return [0] + + return [0] # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/host_mem.py b/host_mem.py index 98136d5..b895a47 100644 --- a/host_mem.py +++ b/host_mem.py @@ -5,69 +5,69 @@ import sys,re,os if sys.platform=="win32": - from ctypes import Structure, c_ulonglong - from ctypes.wintypes import DWORD, windll, byref - DWORDLONG = c_ulonglong - class MEMORYSTATUSEX(Structure): - _fields_ = [ - ('dwLength', DWORD), - ('dwMemoryLoad', DWORD), - ('ullTotalPhys', DWORDLONG), - ('ullAvailPhys', DWORDLONG), - 
('ullTotalPageFile', DWORDLONG), - ('ullAvailPageFile', DWORDLONG), - ('ullTotalVirtual', DWORDLONG), - ('ullAvailVirtual', DWORDLONG), - ('ullAvailExtendedVirtual', DWORDLONG) - ] + from ctypes import Structure, c_ulonglong + from ctypes.wintypes import DWORD, windll, byref + DWORDLONG = c_ulonglong + class MEMORYSTATUSEX(Structure): + _fields_ = [ + ('dwLength', DWORD), + ('dwMemoryLoad', DWORD), + ('ullTotalPhys', DWORDLONG), + ('ullAvailPhys', DWORDLONG), + ('ullTotalPageFile', DWORDLONG), + ('ullAvailPageFile', DWORDLONG), + ('ullTotalVirtual', DWORDLONG), + ('ullAvailVirtual', DWORDLONG), + ('ullAvailExtendedVirtual', DWORDLONG) + ] def parseMemInfo(): - memtotal = 0 - memfree = 0 - buffers = 0 - cached = 0 - file = open ("/proc/meminfo", "r") - for line in file: - words = re.split ('\W+', line) - if len(words) >= 2: - if words[0] == 'MemTotal': - memTotal = int(words[1]) - if words[0] == 'MemFree': - memFree = int(words[1]) - if words[0] == 'Buffers': - buffers = int(words[1]) - if words[0] == 'Cached': - cached = int(words[1]) - return memTotal, memFree+buffers+cached + memtotal = 0 + memfree = 0 + buffers = 0 + cached = 0 + file = open ("/proc/meminfo", "r") + for line in file: + words = re.split ('\W+', line) + if len(words) >= 2: + if words[0] == 'MemTotal': + memTotal = int(words[1]) + if words[0] == 'MemFree': + memFree = int(words[1]) + if words[0] == 'Buffers': + buffers = int(words[1]) + if words[0] == 'Cached': + cached = int(words[1]) + return memTotal, memFree+buffers+cached def getTotalMem (): - if sys.platform=="win32": - x = MEMORYSTATUSEX() # create the structure - x.dwLength = 8*8; - windll.kernel32.GlobalMemoryStatusEx(byref(x)) # from cytypes.wintypes - return x.ullTotalPhys - elif sys.platform=="darwin": - return int(os.popen('/usr/sbin/sysctl -n hw.memsize').read()) - else: - total, free = parseMemInfo () - return total * 1024 + if sys.platform=="win32": + x = MEMORYSTATUSEX() # create the structure + x.dwLength = 8*8; + 
windll.kernel32.GlobalMemoryStatusEx(byref(x)) # from cytypes.wintypes + return x.ullTotalPhys + elif sys.platform=="darwin": + return int(os.popen('/usr/sbin/sysctl -n hw.memsize').read()) + else: + total, free = parseMemInfo () + return total * 1024 def getAvailableMem (): - if sys.platform=="win32": - x = MEMORYSTATUSEX() # create the structure - x.dwLength = 8*8; - windll.kernel32.GlobalMemoryStatusEx(byref(x)) # from cytypes.wintypes - return x.ullAvailPhys - elif sys.platform=="darwin": - for line in os.popen('/usr/bin/vm_stat').readlines(): - if line.startswith('Pages free'): - data = line.split() - return int(data[2].rstrip('.')) * 4 * 1024 - return 0 - else: - total, free = parseMemInfo () - return free * 1024 + if sys.platform=="win32": + x = MEMORYSTATUSEX() # create the structure + x.dwLength = 8*8; + windll.kernel32.GlobalMemoryStatusEx(byref(x)) # from cytypes.wintypes + return x.ullAvailPhys + elif sys.platform=="darwin": + for line in os.popen('/usr/bin/vm_stat').readlines(): + if line.startswith('Pages free'): + data = line.split() + return int(data[2].rstrip('.')) * 4 * 1024 + return 0 + else: + total, free = parseMemInfo () + return free * 1024 # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/install/win32/build_installer.py b/install/win32/build_installer.py index 61d7714..00fa030 100644 --- a/install/win32/build_installer.py +++ b/install/win32/build_installer.py @@ -17,80 +17,80 @@ # Compile the services # os.chdir ("../..") if compile: - os.system ("python server.py remove") - os.system ("python setup_py2exe.py install") - os.system ("python setup_py2exe.py py2exe") + os.system ("python server.py remove") + os.system ("python setup_py2exe.py install") + os.system ("python setup_py2exe.py py2exe") if buildNsis: - # Get the version number - f = open ("coalition.version", "r") - version = f.read () - version = re.sub ("\n", "", version) - version = re.sub ("\r", "", version) - f.close () - - # Generates the NSIS 
script - f = open ("install/win32/coalition.nsi", "r") - script = f.read () - f.close () - - installFiles = "" - removeFiles = "" - currentDir = "" - currentPath = "" - - def setOutPath (path, goin): - global installFiles, removeFiles, currentDir, currentPath - currentPath = path - currentDir = path == "" and "$INSTDIR" or ("$INSTDIR\\" + path) - installFiles = installFiles + "\tSetOutPath \"" + currentDir + "\"\n" - if goin: - removeFiles = "\tRMDir \"" + currentDir + "\"\n" + removeFiles - - def addFile (localpath): - global installFiles, removeFiles, currentDir - currentFile = currentDir + "\\" + os.path.basename (localpath) - installFiles = installFiles + "\tFile \"" + localpath + "\"\n" - removeFiles = "\tDelete \"" + currentFile + "\"\n" + removeFiles - - def addFiles (localpath, rec): - global currentPath - for file in os.listdir(localpath): - filename = localpath + "\\" + file - if os.path.isdir (filename): - if rec and file != ".svn": - oldpath = currentPath - setOutPath (currentPath + "\\" + file, True) - addFiles (filename, rec) - setOutPath (oldpath, False) - else: - addFile (filename) - - setOutPath ("", True) - addFile ("coalition.ini") - addFile ("images\coalition.ico") - addFile ("images\server_start.ico") - addFile ("images\server_stop.ico") - addFile ("images\worker_start.ico") - addFile ("images\worker_stop.ico") - addFile ("vcredist_x86.exe") - addFiles ("dist", True) - setOutPath ("public_html", True) - addFiles ("public_html", True) - - installFiles = re.sub ("\\\\", "\\\\\\\\", installFiles) - script = re.sub ("__INSTALL_FILES__", installFiles, script) - removeFiles = re.sub ("\\\\", "\\\\\\\\", removeFiles) - script = re.sub ("__REMOVE_FILES__", removeFiles, script) - - script = re.sub ("__VERSION__", version, script) - - f = open ("_coalition.nsi", "w") - f.write (script) - f.close () - - # Run NSIS - os.system ("\"" + NSISDir + "/makensis.exe\" _coalition.nsi") + # Get the version number + f = open ("coalition.version", "r") + version = 
f.read () + version = re.sub ("\n", "", version) + version = re.sub ("\r", "", version) + f.close () + + # Generates the NSIS script + f = open ("install/win32/coalition.nsi", "r") + script = f.read () + f.close () + + installFiles = "" + removeFiles = "" + currentDir = "" + currentPath = "" + + def setOutPath (path, goin): + global installFiles, removeFiles, currentDir, currentPath + currentPath = path + currentDir = path == "" and "$INSTDIR" or ("$INSTDIR\\" + path) + installFiles = installFiles + "\tSetOutPath \"" + currentDir + "\"\n" + if goin: + removeFiles = "\tRMDir \"" + currentDir + "\"\n" + removeFiles + + def addFile (localpath): + global installFiles, removeFiles, currentDir + currentFile = currentDir + "\\" + os.path.basename (localpath) + installFiles = installFiles + "\tFile \"" + localpath + "\"\n" + removeFiles = "\tDelete \"" + currentFile + "\"\n" + removeFiles + + def addFiles (localpath, rec): + global currentPath + for file in os.listdir(localpath): + filename = localpath + "\\" + file + if os.path.isdir (filename): + if rec and file != ".svn": + oldpath = currentPath + setOutPath (currentPath + "\\" + file, True) + addFiles (filename, rec) + setOutPath (oldpath, False) + else: + addFile (filename) + + setOutPath ("", True) + addFile ("coalition.ini") + addFile ("images\coalition.ico") + addFile ("images\server_start.ico") + addFile ("images\server_stop.ico") + addFile ("images\worker_start.ico") + addFile ("images\worker_stop.ico") + addFile ("vcredist_x86.exe") + addFiles ("dist", True) + setOutPath ("public_html", True) + addFiles ("public_html", True) + + installFiles = re.sub ("\\\\", "\\\\\\\\", installFiles) + script = re.sub ("__INSTALL_FILES__", installFiles, script) + removeFiles = re.sub ("\\\\", "\\\\\\\\", removeFiles) + script = re.sub ("__REMOVE_FILES__", removeFiles, script) + + script = re.sub ("__VERSION__", version, script) + + f = open ("_coalition.nsi", "w") + f.write (script) + f.close () + + # Run NSIS + os.system ("\"" 
+ NSISDir + "/makensis.exe\" _coalition.nsi") # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/job.py b/job.py index 5818cc6..d193f5f 100644 --- a/job.py +++ b/job.py @@ -8,9 +8,9 @@ import time, sys for i in range(1000) : - print ("P:"+str(float(i)/1000)) - sys.stdout.flush() - time.sleep (0.01) + print ("P:"+str(float(i)/1000)) + sys.stdout.flush() + time.sleep (0.01) sys.exit (0) diff --git a/migrations/0000_db_mysql.py b/migrations/0000_db_mysql.py index b0cb072..66b7e43 100644 --- a/migrations/0000_db_mysql.py +++ b/migrations/0000_db_mysql.py @@ -5,79 +5,79 @@ steps = [ """ CREATE TABLE IF NOT EXISTS WorkerAffinities( - id INTEGER PRIMARY KEY AUTO_INCREMENT, - worker_name VARCHAR(255), - affinity BIGINT DEFAULT 0, - ordering INT DEFAULT 0) + id INTEGER PRIMARY KEY AUTO_INCREMENT, + worker_name VARCHAR(255), + affinity BIGINT DEFAULT 0, + ordering INT DEFAULT 0) """, """ CREATE TABLE IF NOT EXISTS Jobs( - id INTEGER PRIMARY KEY AUTO_INCREMENT, - parent INT DEFAULT 0, - title TEXT, - command TEXT, - dir TEXT, - environment TEXT, - state TEXT, - paused BOOLEAN DEFAULT 0, - worker TEXT, - start_time INT DEFAULT 0, - duration INT DEFAULT 0, - run_done INT DEFAULT 0, - timeout INT DEFAULT 0, - priority INT UNSIGNED DEFAULT 8, - affinity TEXT, - affinity_bits BIGINT DEFAULT 0, - user TEXT, - finished INT DEFAULT 0, - errors INT DEFAULT 0, - working INT DEFAULT 0, - total INT DEFAULT 0, - total_finished INT DEFAULT 0, - total_errors INT DEFAULT 0, - total_working INT DEFAULT 0, - url TEXT, - progress FLOAT, - progress_pattern TEXT, - h_affinity BIGINT DEFAULT 0, - h_priority BIGINT UNSIGNED DEFAULT 0, - h_paused BOOLEAN DEFAULT 0, - h_depth INT DEFAULT 0) + id INTEGER PRIMARY KEY AUTO_INCREMENT, + parent INT DEFAULT 0, + title TEXT, + command TEXT, + dir TEXT, + environment TEXT, + state TEXT, + paused BOOLEAN DEFAULT 0, + worker TEXT, + start_time INT DEFAULT 0, + duration INT DEFAULT 0, + run_done INT DEFAULT 0, + timeout INT 
DEFAULT 0, + priority INT UNSIGNED DEFAULT 8, + affinity TEXT, + affinity_bits BIGINT DEFAULT 0, + user TEXT, + finished INT DEFAULT 0, + errors INT DEFAULT 0, + working INT DEFAULT 0, + total INT DEFAULT 0, + total_finished INT DEFAULT 0, + total_errors INT DEFAULT 0, + total_working INT DEFAULT 0, + url TEXT, + progress FLOAT, + progress_pattern TEXT, + h_affinity BIGINT DEFAULT 0, + h_priority BIGINT UNSIGNED DEFAULT 0, + h_paused BOOLEAN DEFAULT 0, + h_depth INT DEFAULT 0) """, """ CREATE TABLE IF NOT EXISTS Dependencies( - job_id Int, dependency INT) + job_id Int, dependency INT) """, """ CREATE TABLE IF NOT EXISTS Workers( - name VARCHAR(255), - start_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - ip TEXT, - affinity TEXT, - state TEXT, - finished INT, - error INT, - last_job INT, - current_event INT, - cpu TEXT, - free_memory INT, - total_memory int, - active BOOLEAN) + name VARCHAR(255), + start_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + ip TEXT, + affinity TEXT, + state TEXT, + finished INT, + error INT, + last_job INT, + current_event INT, + cpu TEXT, + free_memory INT, + total_memory int, + active BOOLEAN) """, """ CREATE TABLE IF NOT EXISTS Events( - id INTEGER PRIMARY KEY AUTO_INCREMENT, - worker VARCHAR(255), - job_id INT, - job_title TEXT, - state TEXT, - start INT, - duration INT) + id INTEGER PRIMARY KEY AUTO_INCREMENT, + worker VARCHAR(255), + job_id INT, + job_title TEXT, + state TEXT, + start INT, + duration INT) """, """ CREATE TABLE IF NOT EXISTS Affinities( - id INTEGER, - name TEXT) + id INTEGER, + name TEXT) """, """ CREATE INDEX worker_name_index ON WorkerAffinities(worker_name) diff --git a/migrations/0000_db_sqlite.py b/migrations/0000_db_sqlite.py index 12c9ca0..c251ead 100644 --- a/migrations/0000_db_sqlite.py +++ b/migrations/0000_db_sqlite.py @@ -5,55 +5,55 @@ steps = [ """ CREATE TABLE IF NOT EXISTS WorkerAffinities( - id INTEGER PRIMARY KEY AUTOINCREMENT, - worker_name TEXT, - affinity BIGINT DEFAULT 0, - ordering INT DEFAULT 0) + id 
INTEGER PRIMARY KEY AUTOINCREMENT, + worker_name TEXT, + affinity BIGINT DEFAULT 0, + ordering INT DEFAULT 0) """, """ CREATE INDEX IF NOT EXISTS worker_name_index ON WorkerAffinities(worker_name) """, """ CREATE TABLE IF NOT EXISTS Jobs( - id INTEGER PRIMARY KEY AUTOINCREMENT, - parent INT DEFAULT 0, - title TEXT DEFAULT "", - command TEXT DEFAULT "", - dir TEXT DEFAULT ".", - environment TEXT DEFAULT "", - state TEXT DEFAULT "WAITING", - paused BOOLEAN DEFAULT 0, - worker TEXT DEFAULT "", - start_time INT DEFAULT 0, - duration INT DEFAULT 0, - run_done INT DEFAULT 0, - timeout INT DEFAULT 0, - priority UNSIGNED INT DEFAULT 8, - affinity TEXT DEFAULT "", - affinity_bits BIGINT DEFAULT 0, - user TEXT DEFAULT "", - finished INT DEFAULT 0, - errors INT DEFAULT 0, - working INT DEFAULT 0, - total INT DEFAULT 0, - total_finished INT DEFAULT 0, - total_errors INT DEFAULT 0, - total_working INT DEFAULT 0, - url TEXT DEFAULT "", - progress FLOAT, - progress_pattern TEXT DEFAULT "", - h_affinity BIGINT DEFAULT 0, - h_priority UNSIGNED BIGINT DEFAULT 0, - h_paused BOOLEAN DEFAULT 0, - h_depth INT DEFAULT 0) + id INTEGER PRIMARY KEY AUTOINCREMENT, + parent INT DEFAULT 0, + title TEXT DEFAULT "", + command TEXT DEFAULT "", + dir TEXT DEFAULT ".", + environment TEXT DEFAULT "", + state TEXT DEFAULT "WAITING", + paused BOOLEAN DEFAULT 0, + worker TEXT DEFAULT "", + start_time INT DEFAULT 0, + duration INT DEFAULT 0, + run_done INT DEFAULT 0, + timeout INT DEFAULT 0, + priority UNSIGNED INT DEFAULT 8, + affinity TEXT DEFAULT "", + affinity_bits BIGINT DEFAULT 0, + user TEXT DEFAULT "", + finished INT DEFAULT 0, + errors INT DEFAULT 0, + working INT DEFAULT 0, + total INT DEFAULT 0, + total_finished INT DEFAULT 0, + total_errors INT DEFAULT 0, + total_working INT DEFAULT 0, + url TEXT DEFAULT "", + progress FLOAT, + progress_pattern TEXT DEFAULT "", + h_affinity BIGINT DEFAULT 0, + h_priority UNSIGNED BIGINT DEFAULT 0, + h_paused BOOLEAN DEFAULT 0, + h_depth INT DEFAULT 0) """, 
""" CREATE INDEX IF NOT EXISTS Parent_index ON Jobs(parent) """, """ CREATE TABLE IF NOT EXISTS Dependencies( - job_id Int, - dependency INT) + job_id Int, + dependency INT) """, """ CREATE INDEX IF NOT EXISTS JobId_index ON Dependencies(job_id) @@ -63,31 +63,31 @@ """, """ CREATE TABLE IF NOT EXISTS Workers( - name TEXT, - start_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - ip TEXT, - affinity TEXT DEFAULT "", - state TEXT, - finished INT, - error INT, - last_job INT, - current_event INT, - cpu TEXT, - free_memory INT, - total_memory int, - active BOOLEAN) + name TEXT, + start_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + ip TEXT, + affinity TEXT DEFAULT "", + state TEXT, + finished INT, + error INT, + last_job INT, + current_event INT, + cpu TEXT, + free_memory INT, + total_memory int, + active BOOLEAN) """, """ CREATE UNIQUE INDEX IF NOT EXISTS Name_index ON Workers (name) """, """ CREATE TABLE IF NOT EXISTS Events( - id INTEGER PRIMARY KEY AUTOINCREMENT, - worker TEXT, job_id INT, - job_title TEXT, - state TEXT, - start INT, - duration INT) + id INTEGER PRIMARY KEY AUTOINCREMENT, + worker TEXT, job_id INT, + job_title TEXT, + state TEXT, + start INT, + duration INT) """, """ CREATE INDEX IF NOT EXISTS Worker_index ON Events(worker) @@ -100,8 +100,8 @@ """, """ CREATE TABLE IF NOT EXISTS Affinities( - id INTEGER, - name TEXT) + id INTEGER, + name TEXT) """ ] diff --git a/migrations/0001_db_mysql.py b/migrations/0001_db_mysql.py index f9a8196..4b8c042 100644 --- a/migrations/0001_db_mysql.py +++ b/migrations/0001_db_mysql.py @@ -14,3 +14,4 @@ ] # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 + diff --git a/migrations/0001_db_sqlite.py b/migrations/0001_db_sqlite.py index 5e18fe3..4b8c042 100644 --- a/migrations/0001_db_sqlite.py +++ b/migrations/0001_db_sqlite.py @@ -6,7 +6,7 @@ steps = [ """ CREATE TABLE IF NOT EXISTS Migrations( - database_version INT) + database_version INT) """, """ INSERT INTO Migrations (database_version) VALUES (1) diff 
--git a/qarnot/__init__.py b/qarnot/__init__.py index 5e6e58e..42f3260 100644 --- a/qarnot/__init__.py +++ b/qarnot/__init__.py @@ -65,3 +65,4 @@ def get_url(key, **kwargs): from ._version import get_versions # noqa __version__ = get_versions()['version'] del get_versions + diff --git a/qarnot/_version.py b/qarnot/_version.py index deb7ffc..ec5a63f 100644 --- a/qarnot/_version.py +++ b/qarnot/_version.py @@ -482,3 +482,4 @@ def get_versions(): return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version"} + diff --git a/qarnot/connection.py b/qarnot/connection.py index bb65fec..6b23971 100644 --- a/qarnot/connection.py +++ b/qarnot/connection.py @@ -510,3 +510,4 @@ def __init__(self, info): def __repr__(self): return 'Profile(name=%s, constants=%r}' % (self.name, self.constants) + diff --git a/qarnot/disk.py b/qarnot/disk.py index 6bc45fc..5ef7256 100644 --- a/qarnot/disk.py +++ b/qarnot/disk.py @@ -1123,3 +1123,4 @@ class UploadMode(object): """Launch a background thread for uploading.""" lazy = 2 """Actual uploading is made by the :func:`~Disk.flush` method call.""" + diff --git a/qarnot/exceptions.py b/qarnot/exceptions.py index 8c406dd..7276913 100644 --- a/qarnot/exceptions.py +++ b/qarnot/exceptions.py @@ -65,3 +65,4 @@ class NotEnoughCreditsException(Exception): class LockedDiskException(Exception): """Locked disk.""" pass + diff --git a/qarnot/task.py b/qarnot/task.py index 1e77b1e..7d8d6c7 100644 --- a/qarnot/task.py +++ b/qarnot/task.py @@ -1399,3 +1399,4 @@ def __str__(self): return ', '.join("{0}={1}".format(key, val) for (key, val) in self.__dict__.items()) else: return ', '.join("{0}={1}".format(key, val) for (key, val) in self.__dict__.iteritems()) # pylint: disable=no-member + diff --git a/server.py b/server.py index 0be8495..5659524 100644 --- a/server.py +++ b/server.py @@ -21,635 +21,635 @@ # Configuration functions def cfgInt (name, defvalue): - global config - if config.has_option('server', name): 
- try: - return int (config.get('server', name)) - except: - pass - return defvalue + global config + if config.has_option('server', name): + try: + return int (config.get('server', name)) + except: + pass + return defvalue def cfgBool (name, defvalue): - global config - if config.has_option('server', name): - try: - return int (config.get('server', name)) != 0 - except: - pass - return defvalue + global config + if config.has_option('server', name): + try: + return int (config.get('server', name)) != 0 + except: + pass + return defvalue def cfgStr (name, defvalue): - global config - if config.has_option('server', name): - try: - return str (config.get('server', name)) - except: - pass - return defvalue + global config + if config.has_option('server', name): + try: + return str (config.get('server', name)) + except: + pass + return defvalue def usage(): - print ("Usage: server [OPTIONS]") - print ("Start a Coalition server.\n") - print ("Options:") - print (" -h, --help\t\tShow this help") - print (" -p, --port=PORT\tPort used by the server (default: "+str(port)+")") - print (" -v, --verbose\t\tIncrease verbosity") - print (" --init\t\tInitialize the database") - print (" --migrate\t\tMigrate the database with interactive confirmation") - print (" --reset\t\tReset the database (warning: all previous data are lost)") - if sys.platform == "win32": - print (" -c, --console=\t\tRun as a windows console application") - print (" -s, --service=\t\tRun as a windows service") - print ("\nExample : server -p 1234") + print ("Usage: server [OPTIONS]") + print ("Start a Coalition server.\n") + print ("Options:") + print (" -h, --help\t\tShow this help") + print (" -p, --port=PORT\tPort used by the server (default: "+str(port)+")") + print (" -v, --verbose\t\tIncrease verbosity") + print (" --init\t\tInitialize the database") + print (" --migrate\t\tMigrate the database with interactive confirmation") + print (" --reset\t\tReset the database (warning: all previous data are 
lost)") + if sys.platform == "win32": + print (" -c, --console=\t\tRun as a windows console application") + print (" -s, --service=\t\tRun as a windows service") + print ("\nExample : server -p 1234") # Log functions def vprint (str): - if verbose: - print (str) - sys.stdout.flush() + if verbose: + print (str) + sys.stdout.flush() def getLogFilename (jobId): - global dataDir - return dataDir + "/logs/" + str(jobId) + ".log" + global dataDir + return dataDir + "/logs/" + str(jobId) + ".log" def getLogFilter (pattern): - """Get the pattern filter from the cache or add one""" - global LogFilterCache - try: - filter = LogFilterCache[pattern] - except KeyError: - filter = LogFilter (pattern) - LogFilterCache[pattern] = filter - return filter + """Get the pattern filter from the cache or add one""" + global LogFilterCache + try: + filter = LogFilterCache[pattern] + except KeyError: + filter = LogFilter (pattern) + LogFilterCache[pattern] = filter + return filter def writeJobLog (jobId, log): - logFile = open (getLogFilename (jobId), "a") - logFile.write (log) - logFile.close () + logFile = open (getLogFilename (jobId), "a") + logFile.write (log) + logFile.close () # Notify functions def sendEmail (to, message) : - if to != "" : - vprint ("Send email to " + to + " : " + message) - if smtphost != "" : - # Create a text/plain message - msg = MIMEText(message) - - # me == the sender's email address - # you == the recipient's email address - msg['Subject'] = message - msg['From'] = smtpsender - msg['To'] = to - - # Send the message via our own SMTP server, but don't include the - # envelope header. 
- try: - s = smtplib.SMTP(smtphost, smtpport) - if smtptls: - s.ehlo() - s.starttls() - s.ehlo() - if smtplogin != '' or smtppasswd != '': - s.login(smtplogin, smtppasswd) - s.sendmail (smtpsender, [to], msg.as_string()) - s.quit() - except Exception as inst: - vprint (inst) - pass + if to != "" : + vprint ("Send email to " + to + " : " + message) + if smtphost != "" : + # Create a text/plain message + msg = MIMEText(message) + + # me == the sender's email address + # you == the recipient's email address + msg['Subject'] = message + msg['From'] = smtpsender + msg['To'] = to + + # Send the message via our own SMTP server, but don't include the + # envelope header. + try: + s = smtplib.SMTP(smtphost, smtpport) + if smtptls: + s.ehlo() + s.starttls() + s.ehlo() + if smtplogin != '' or smtppasswd != '': + s.login(smtplogin, smtppasswd) + s.sendmail (smtpsender, [to], msg.as_string()) + s.quit() + except Exception as inst: + vprint (inst) + pass def notifyError (job): - if job['user'] : - sendEmail (job['user'], 'ERRORS in job ' + job['title'] + ' (' + str(job['id']) + ').') + if job['user'] : + sendEmail (job['user'], 'ERRORS in job ' + job['title'] + ' (' + str(job['id']) + ').') def notifyFinished (job): - if job['user'] : - sendEmail (job['user'], 'The job ' + job['title'] + ' (' + str(job['id']) + ') is FINISHED.') + if job['user'] : + sendEmail (job['user'], 'The job ' + job['title'] + ' (' + str(job['id']) + ') is FINISHED.') def notifyFirstFinished (job): - if job['user'] : - sendEmail (job['user'], 'The job ' + job['title'] + ' (' + str(job['id']) + ') has finished ' + str(notifyafter) + ' jobs.') + if job['user'] : + sendEmail (job['user'], 'The job ' + job['title'] + ' (' + str(job['id']) + ') has finished ' + str(notifyafter) + ' jobs.') def _interactiveConfirmation(confirmation_sentence="Yes I know what I'm doing."): - """Ask the user for confirmation.""" - text = "Please write this sentence then press enter to confirm:\n"+confirmation_sentence+'\n' - print 
(text) - sys.stdout.flush() - answer = raw_input() - if answer == confirmation_sentence: - return True - return False + """Ask the user for confirmation.""" + text = "Please write this sentence then press enter to confirm:\n"+confirmation_sentence+'\n' + print (text) + sys.stdout.flush() + answer = raw_input() + if answer == confirmation_sentence: + return True + return False ### LDAP functions ### ### LDAP classes and functions ### def authenticate(request, ldap_permissions): - """Check user authentication via LDAP if LDAP is configured in settings. If authenticated, get users permissions.""" - - def _getLdapPermissions(connection, username): - ldap_base = cfgStr("ldapbase", "") - - def _ldapSearch(connection, query): - if connection.search_ext_s(ldap_base, ldap.SCOPE_SUBTREE, query, ['dn']): - return True - return False - - for permission in ldap_permissions.keys(): - search_template = cfgStr(permission, "").replace("__login__", username) - ldap_permissions[permission] = _ldapSearch(connection, search_template) - - return ldap_permissions - - if LDAPServer: - username = request.getUser() - password = request.getPassword() - - if config.has_option("server", "ldapunsafeapi") and config.getboolean("server", "ldapunsafeapi") and not isWebFrontend(request): - # This request does not comes from the webfrontend and unsafe mode is set. - # Granting full access. 
- vprint("[LDAP] Access granted for unsafe API") - for k in ldap_permissions.keys(): - ldap_permissions[k] = True - return True, ldap_permissions - - if username or password: - l = ldap.initialize(LDAPServer) - vprint("[LDAP] Authenticate {}".format(username)) - ldapUsername = LDAPTemplateLogin.replace("__login__", username) - try: - if l.bind_s(ldapUsername, password, ldap.AUTH_SIMPLE): - vprint("[LDAP] Authentication accepted for user {}".format(username)) - request.addCookie("authenticated_user", username, path="/") - ldap_permissions = _getLdapPermissions(l, username) - return True, ldap_permissions - - except ldap.LDAPError as e: - vprint("[LDAP] Authentication failed for user {}".format(username)) - vprint("[LDAP] {}".format(e)) - pass - else: - vprint("[LDAP] Authentication required") - request.setHeader("WWW-Authenticate", 'Basic realm="Coalition login"') - request.setResponseCode(http.UNAUTHORIZED) - return False, {} - return True, ldap_permissions + """Check user authentication via LDAP if LDAP is configured in settings. If authenticated, get users permissions.""" + + def _getLdapPermissions(connection, username): + ldap_base = cfgStr("ldapbase", "") + + def _ldapSearch(connection, query): + if connection.search_ext_s(ldap_base, ldap.SCOPE_SUBTREE, query, ['dn']): + return True + return False + + for permission in ldap_permissions.keys(): + search_template = cfgStr(permission, "").replace("__login__", username) + ldap_permissions[permission] = _ldapSearch(connection, search_template) + + return ldap_permissions + + if LDAPServer: + username = request.getUser() + password = request.getPassword() + + if config.has_option("server", "ldapunsafeapi") and config.getboolean("server", "ldapunsafeapi") and not isWebFrontend(request): + # This request does not comes from the webfrontend and unsafe mode is set. + # Granting full access. 
+ vprint("[LDAP] Access granted for unsafe API") + for k in ldap_permissions.keys(): + ldap_permissions[k] = True + return True, ldap_permissions + + if username or password: + l = ldap.initialize(LDAPServer) + vprint("[LDAP] Authenticate {}".format(username)) + ldapUsername = LDAPTemplateLogin.replace("__login__", username) + try: + if l.bind_s(ldapUsername, password, ldap.AUTH_SIMPLE): + vprint("[LDAP] Authentication accepted for user {}".format(username)) + request.addCookie("authenticated_user", username, path="/") + ldap_permissions = _getLdapPermissions(l, username) + return True, ldap_permissions + + except ldap.LDAPError as e: + vprint("[LDAP] Authentication failed for user {}".format(username)) + vprint("[LDAP] {}".format(e)) + pass + else: + vprint("[LDAP] Authentication required") + request.setHeader("WWW-Authenticate", 'Basic realm="Coalition login"') + request.setResponseCode(http.UNAUTHORIZED) + return False, {} + return True, ldap_permissions def grantAddJob(user, cmd): - """Check if the logged in user can add this command.""" - def checkWhiteList(wl): - for pattern in wl: - if (re.match (pattern, cmd)): - return True - else: - vprint("[LDAP] Not authorized. User {} is not allowed to add the command {}".format(user, cmd)) - return False - - # Is user defined white list ? - if user in UserCmdWhiteList: - wl = UserCmdWhiteList[user] - if checkWhiteList(wl): - return True - # If in the global command white list - if GlobalCmdWhiteList: - if checkWhiteList(GlobalCmdWhiteList): - return True - return False - else: - # If in the global command white list - if GlobalCmdWhiteList: - if not checkWhiteList(GlobalCmdWhiteList): - return False - - # Cleared - return True + """Check if the logged in user can add this command.""" + def checkWhiteList(wl): + for pattern in wl: + if (re.match (pattern, cmd)): + return True + else: + vprint("[LDAP] Not authorized. 
User {} is not allowed to add the command {}".format(user, cmd)) + return False + + # Is user defined white list ? + if user in UserCmdWhiteList: + wl = UserCmdWhiteList[user] + if checkWhiteList(wl): + return True + # If in the global command white list + if GlobalCmdWhiteList: + if checkWhiteList(GlobalCmdWhiteList): + return True + return False + else: + # If in the global command white list + if GlobalCmdWhiteList: + if not checkWhiteList(GlobalCmdWhiteList): + return False + + # Cleared + return True def listenUDP(): - """Listen to UDP socket to respond to the workers broadcast.""" - from socket import SOL_SOCKET, SO_BROADCAST - from socket import socket, AF_INET, SOCK_DGRAM, error - s = socket (AF_INET, SOCK_DGRAM) - s.bind (('0.0.0.0', port)) - while 1: - try: - data, addr = s.recvfrom (1024) - s.sendto ("roxor", addr) - except: - pass + """Listen to UDP socket to respond to the workers broadcast.""" + from socket import SOL_SOCKET, SO_BROADCAST + from socket import socket, AF_INET, SOCK_DGRAM, error + s = socket (AF_INET, SOCK_DGRAM) + s.bind (('0.0.0.0', port)) + while 1: + try: + data, addr = s.recvfrom (1024) + s.sendto ("roxor", addr) + except: + pass def main(): - """Start the UDP server used for the broadcast.""" - thread.start_new_thread (listenUDP, ()) - - from twisted.internet import reactor - from twisted.web import server - root = Root("public_html") - webService = Master() - workers = Workers() - root.putChild('xmlrpc', webService) - root.putChild('api', webService) - root.putChild('workers', workers) - vprint ("[Init] Listen on port " + str (port)) - reactor.listenTCP(port, server.Site(root)) - reactor.run() + """Start the UDP server used for the broadcast.""" + thread.start_new_thread (listenUDP, ()) + + from twisted.internet import reactor + from twisted.web import server + root = Root("public_html") + webService = Master() + workers = Workers() + root.putChild('xmlrpc', webService) + root.putChild('api', webService) + 
root.putChild('workers', workers) + vprint ("[Init] Listen on port " + str (port)) + reactor.listenTCP(port, server.Site(root)) + reactor.run() ### Classes ### class LogFilter: - """A log filter object. The log pattern must include a '%percent' or a '%one' key word.""" - - def __init__ (self, pattern): - # 0~100 or 0~1 ? - self.IsPercent = re.match (".*%percent.*", pattern) != None - - # Build the final pattern for the RE - if self.IsPercent: - pattern = re.sub ("%percent", "([0-9.]+)", pattern) - else: - pattern = re.sub ("%one", "([0-9.]+)", pattern) - - # Final progress filter - self.RE = re.compile(pattern) - - # Put it in the cache - global LogFilterCache - LogFilterCache[pattern] = self - - def filterLogs (self, log): - """Return the filtered log and the last progress, if any""" - progress = None - for m in self.RE.finditer (log): - capture = m.group(1) - try: - progress = float(capture) / (self.IsPercent and 100.0 or 1.0) - except ValueError: - pass - #return self.RE.sub ("", log), progress - return log, progress + """A log filter object. The log pattern must include a '%percent' or a '%one' key word.""" + + def __init__ (self, pattern): + # 0~100 or 0~1 ? 
+ self.IsPercent = re.match (".*%percent.*", pattern) != None + + # Build the final pattern for the RE + if self.IsPercent: + pattern = re.sub ("%percent", "([0-9.]+)", pattern) + else: + pattern = re.sub ("%one", "([0-9.]+)", pattern) + + # Final progress filter + self.RE = re.compile(pattern) + + # Put it in the cache + global LogFilterCache + LogFilterCache[pattern] = self + + def filterLogs (self, log): + """Return the filtered log and the last progress, if any""" + progress = None + for m in self.RE.finditer (log): + capture = m.group(1) + try: + progress = float(capture) / (self.IsPercent and 100.0 or 1.0) + except ValueError: + pass + #return self.RE.sub ("", log), progress + return log, progress def ldapUserAllowed(user, action): - """Check if user is allowed to do this action.""" - vprint("Is user {} allowed to do {}?".format(user, action)) - # Cleared - return True + """Check if user is allowed to do this action.""" + vprint("Is user {} allowed to do {}?".format(user, action)) + # Cleared + return True def isWebFrontend(request): - """Check if the request comes from the webfrontend.""" - m = re.match(r"^/api/webfrontend/", request.path) - if m: - return True - else: - return False + """Check if the request comes from the webfrontend.""" + m = re.match(r"^/api/webfrontend/", request.path) + if m: + return True + else: + return False ### Twisted class ### class Root(static.File): - """Create twisted landing page and check if LDAP authentication is required.""" - - def __init__(self, path, defaultType="text/html", ignoredExts=(), registry=None, allowExt=0): - static.File.__init__(self, path, defaultType, ignoredExts, registry, allowExt) - - def render(self, request): - if isWebFrontend(request): - (authenticated, permissions) = authenticate(request, ldap_permissions) - request.path = request.path.replace("webfrontend/", "", 1) - else: - authenticated = True - permissions = ldap_permissions - if authenticated: - return static.File.render(self, request) - 
request.setResponseCode(http.UNAUTHORIZED) - return "LDAP authorization required." + """Create twisted landing page and check if LDAP authentication is required.""" + + def __init__(self, path, defaultType="text/html", ignoredExts=(), registry=None, allowExt=0): + static.File.__init__(self, path, defaultType, ignoredExts, registry, allowExt) + + def render(self, request): + if isWebFrontend(request): + (authenticated, permissions) = authenticate(request, ldap_permissions) + request.path = request.path.replace("webfrontend/", "", 1) + else: + authenticated = True + permissions = ldap_permissions + if authenticated: + return static.File.render(self, request) + request.setResponseCode(http.UNAUTHORIZED) + return "LDAP authorization required." ### XMLRPC API classes ### class Master(xmlrpc.XMLRPC): - """Defines XMLRPC and API for users interactions. Defines logger.""" - - def __init__(self): - self.user = "" # Default value, overwritten later in case of LDAP authentication - - def render(self, request): - with db: - vprint("[{}] {}".format(request.method, request.path)) - if isWebFrontend(request): - (authenticated, permissions) = authenticate(request, ldap_permissions) - request.path = request.path.replace("webfrontend/", "", 1) - else: - authenticated = True - permissions = ldap_permissions - if authenticated: - self.user = db.ldap_user = request.getUser() - db.permissions = permissions - - def getArg(name, default): - value = request.args.get(name, [default]) - return value[0] - - # The legacy method for compatibility - if request.path == "/xmlrpc/addjob": - parent = getArg("parent", "0") - title = getArg("title", "New job") - cmd = getArg("cmd", getArg("command", "")) - dir = getArg("dir", ".") - environment = getArg("env", None) - if environment == "": - environment = None - priority = getArg("priority", "1000") - timeout = getArg("timeout", "0") - affinity = getArg("affinity", "") - dependencies = getArg("dependencies", "") - progress_pattern = 
getArg("localprogress", "") - url = getArg("url", "") - user = getArg("user", "") - state = getArg("state", "WAITING") - paused = getArg("paused", "0") - if self.user != "": - user = self.user - - if grantAddJob(self.user, cmd): - vprint ("Add job: {}".format(cmd)) - # try as an int - parent = int(parent) - if type(dependencies) is str: - # Parse the dependencies string - dependencies = re.findall('(\d+)', dependencies) - for i, dep in enumerate(dependencies) : - dependencies[i] = int(dep) - - job = db.newJob (parent, str (title), str (cmd), str (dir), str (environment), - str (state), int (paused), int (timeout), int (priority), str (affinity), - str (user), str (url), str (progress_pattern)) - if job is not None: - db.setJobDependencies(job['id'], dependencies) - return str(job['id']) - return "-1" - else: - try: - value = request.content.getvalue() - if request.method != "GET": - data = value and json.loads(request.content.getvalue()) or {} - if verbose: - vprint ("[Content] {}".format(repr(data))) - else: - if verbose: - vprint ("[Content] {}".format(repr(request.args))) - - def getArg(name, default): - if request.method == "GET": - # GET params - value = request.args.get(name, [default])[0] - value = type(default)(default if value == None else value) - assert(value != None) - return value - else: - # JSON params - value = data.get(name) - value = type(default)(default if value == None else value) - assert(value != None) - return value - - def api_rest(): - """REST API.""" - - # REST PUT API - if request.method == "PUT": - if request.path == "/api/jobs": - if grantAddJob(self.user, getArg("command","")): - job = db.newJob ((getArg("parent",0)), - (getArg("title","")), - (getArg("command","")), - (getArg("dir","")), - (getArg("environment","")), - (getArg("state","WAITING")), - (getArg("paused",0)), - (getArg("timeout",1000)), - (getArg("priority",1000)), - (getArg("affinity", "")), - (getArg("user", "")), - (getArg("url", "")), - (getArg("progress_pattern", 
"")), - (getArg("dependencies", []))) - return job['id'] - else: - return False - - # REST GET API - elif request.method == "GET": - m = re.match(r"^/api/jobs/(\d+)$", request.path) - if m: - return db.getJob(int(m.group (1))) - m = re.match(r"^/api/jobs/(\d+)/children$", request.path) - if m: - return db.getJobChildren(int(m.group (1)), {}) - m = re.match(r"^/api/jobs/(\d+)/dependencies$", request.path) - if m: - return db.getJobDependencies(int(m.group (1))) - m = re.match(r"^/api/jobs/(\d+)/childrendependencies$", request.path) - if m: - return db.getChildrenDependencyIds(int(m.group (1))) - m = re.match(r"^/api/jobs/(\d+)/log$", request.path) - if m: - return self.getLog(int(m.group (1))) - if request.path == "/api/jobs": - return db.getJobChildren(0, {}) - - m = re.match(r"^/api/jobs/count/where/$", request.path) - if m: - return db.getCountJobsWhere(request.args["where_clause"]) - - m = re.match(r"^/api/jobs/where/$", request.path) - if m: - return db.getJobsWhere( - where_clause=request.args["where_clause"][0], - index_min=request.args["min"][0], - index_max=request.args["max"][0], - ) - - if request.path == "/api/workers": - return db.getWorkers() - if request.path == "/api/events": - return db.getEvents(getArg("job", -1), getArg("worker", ""), getArg("howlong", -1)) - if request.path == "/api/affinities": - return db.getAffinities() - - if request.path == "/api/jobs/users/": - return db.getJobsUsers() - - if request.path == "/api/jobs/states/": - return db.getJobsStates() - - if request.path == "/api/jobs/workers/": - return db.getJobsWorkers() - - if request.path == "/api/jobs/priorities/": - return db.getJobsPriorities() - - if request.path == "/api/jobs/affinities/": - return db.getJobsAffinities() - - # REST POST API - elif request.method == "POST": - if request.path == "/api/jobs": - db.editJobs(data) - return 1 - if request.path == "/api/workers": - db.editWorkers(data) - return 1 - m = re.match(r"^/api/jobs/(\d+)/dependencies$", request.path) - if 
m: - db.setJobDependencies(int(m.group (1)), data) - return 1 - if request.path == "/api/resetjobs": - for jobId in data: - db.resetJob(int(jobId)) - return 1 - if request.path == "/api/reseterrorjobs": - for jobId in data: - db.resetErrorJob(int(jobId)) - return 1 - if request.path == "/api/startjobs": - for jobId in data: - db.startJob(int(jobId)) - return 1 - if request.path == "/api/pausejobs": - for jobId in data: - db.pauseJob(int(jobId)) - return 1 - if request.path == "/api/stopworkers": - for name in data: - db.stopWorker(name) - return 1 - if request.path == "/api/startworkers": - for name in data: - db.startWorker(name) - return 1 - if request.path == "/api/affinities": - db.setAffinities(data) - return 1 - if request.path == "/api/terminateworkers": - if servermode != "normal": # Cloud mode - for name in data: - db.cloudmanager.stopInstance(name) - db._setWorkerState(name, "TERMINATED") - return 1 - else: - return None - - # REST DELETE API - elif request.method == "DELETE": - if request.path == "/api/jobs": - for jobId in data: - deletedJobs = [] - db.deleteJob(int(jobId), deletedJobs) - for deleteJobId in deletedJobs: - self.deleteLog(deleteJobId) - return 1 - if request.path == "/api/workers": - for name in data: - db.deleteWorker(name) - return 1 - - result = api_rest () - if result != None: - # Only JSON right now - return json.dumps(result) - else: - # return server.NOT_DONE_YET - request.setResponseCode(404) - return "Web service not found." - except LdapError as error: - vprint(error) - request.setResponseCode(http.UNAUTHORIZED) - return "LDAP authorization required." 
- - def getLog (self, jobId): - # Look for the job - log = "" - try: - logFile = open (getLogFilename (jobId), "r") - while (1): - # Read some lines of logs - line = logFile.readline() - # "" means EOF - if line == "": - break - log = log + line - logFile.close () - except IOError: - pass - return log - - def deleteLog (self, jobId): - # Look for the job - try: - os.remove (getLogFilename (jobId)) - except OSError: - pass + """Defines XMLRPC and API for users interactions. Defines logger.""" + + def __init__(self): + self.user = "" # Default value, overwritten later in case of LDAP authentication + + def render(self, request): + with db: + vprint("[{}] {}".format(request.method, request.path)) + if isWebFrontend(request): + (authenticated, permissions) = authenticate(request, ldap_permissions) + request.path = request.path.replace("webfrontend/", "", 1) + else: + authenticated = True + permissions = ldap_permissions + if authenticated: + self.user = db.ldap_user = request.getUser() + db.permissions = permissions + + def getArg(name, default): + value = request.args.get(name, [default]) + return value[0] + + # The legacy method for compatibility + if request.path == "/xmlrpc/addjob": + parent = getArg("parent", "0") + title = getArg("title", "New job") + cmd = getArg("cmd", getArg("command", "")) + dir = getArg("dir", ".") + environment = getArg("env", None) + if environment == "": + environment = None + priority = getArg("priority", "1000") + timeout = getArg("timeout", "0") + affinity = getArg("affinity", "") + dependencies = getArg("dependencies", "") + progress_pattern = getArg("localprogress", "") + url = getArg("url", "") + user = getArg("user", "") + state = getArg("state", "WAITING") + paused = getArg("paused", "0") + if self.user != "": + user = self.user + + if grantAddJob(self.user, cmd): + vprint ("Add job: {}".format(cmd)) + # try as an int + parent = int(parent) + if type(dependencies) is str: + # Parse the dependencies string + dependencies = 
re.findall('(\d+)', dependencies) + for i, dep in enumerate(dependencies) : + dependencies[i] = int(dep) + + job = db.newJob (parent, str (title), str (cmd), str (dir), str (environment), + str (state), int (paused), int (timeout), int (priority), str (affinity), + str (user), str (url), str (progress_pattern)) + if job is not None: + db.setJobDependencies(job['id'], dependencies) + return str(job['id']) + return "-1" + else: + try: + value = request.content.getvalue() + if request.method != "GET": + data = value and json.loads(request.content.getvalue()) or {} + if verbose: + vprint ("[Content] {}".format(repr(data))) + else: + if verbose: + vprint ("[Content] {}".format(repr(request.args))) + + def getArg(name, default): + if request.method == "GET": + # GET params + value = request.args.get(name, [default])[0] + value = type(default)(default if value == None else value) + assert(value != None) + return value + else: + # JSON params + value = data.get(name) + value = type(default)(default if value == None else value) + assert(value != None) + return value + + def api_rest(): + """REST API.""" + + # REST PUT API + if request.method == "PUT": + if request.path == "/api/jobs": + if grantAddJob(self.user, getArg("command","")): + job = db.newJob ((getArg("parent",0)), + (getArg("title","")), + (getArg("command","")), + (getArg("dir","")), + (getArg("environment","")), + (getArg("state","WAITING")), + (getArg("paused",0)), + (getArg("timeout",1000)), + (getArg("priority",1000)), + (getArg("affinity", "")), + (getArg("user", "")), + (getArg("url", "")), + (getArg("progress_pattern", "")), + (getArg("dependencies", []))) + return job['id'] + else: + return False + + # REST GET API + elif request.method == "GET": + m = re.match(r"^/api/jobs/(\d+)$", request.path) + if m: + return db.getJob(int(m.group (1))) + m = re.match(r"^/api/jobs/(\d+)/children$", request.path) + if m: + return db.getJobChildren(int(m.group (1)), {}) + m = re.match(r"^/api/jobs/(\d+)/dependencies$", 
request.path) + if m: + return db.getJobDependencies(int(m.group (1))) + m = re.match(r"^/api/jobs/(\d+)/childrendependencies$", request.path) + if m: + return db.getChildrenDependencyIds(int(m.group (1))) + m = re.match(r"^/api/jobs/(\d+)/log$", request.path) + if m: + return self.getLog(int(m.group (1))) + if request.path == "/api/jobs": + return db.getJobChildren(0, {}) + + m = re.match(r"^/api/jobs/count/where/$", request.path) + if m: + return db.getCountJobsWhere(request.args["where_clause"]) + + m = re.match(r"^/api/jobs/where/$", request.path) + if m: + return db.getJobsWhere( + where_clause=request.args["where_clause"][0], + index_min=request.args["min"][0], + index_max=request.args["max"][0], + ) + + if request.path == "/api/workers": + return db.getWorkers() + if request.path == "/api/events": + return db.getEvents(getArg("job", -1), getArg("worker", ""), getArg("howlong", -1)) + if request.path == "/api/affinities": + return db.getAffinities() + + if request.path == "/api/jobs/users/": + return db.getJobsUsers() + + if request.path == "/api/jobs/states/": + return db.getJobsStates() + + if request.path == "/api/jobs/workers/": + return db.getJobsWorkers() + + if request.path == "/api/jobs/priorities/": + return db.getJobsPriorities() + + if request.path == "/api/jobs/affinities/": + return db.getJobsAffinities() + + # REST POST API + elif request.method == "POST": + if request.path == "/api/jobs": + db.editJobs(data) + return 1 + if request.path == "/api/workers": + db.editWorkers(data) + return 1 + m = re.match(r"^/api/jobs/(\d+)/dependencies$", request.path) + if m: + db.setJobDependencies(int(m.group (1)), data) + return 1 + if request.path == "/api/resetjobs": + for jobId in data: + db.resetJob(int(jobId)) + return 1 + if request.path == "/api/reseterrorjobs": + for jobId in data: + db.resetErrorJob(int(jobId)) + return 1 + if request.path == "/api/startjobs": + for jobId in data: + db.startJob(int(jobId)) + return 1 + if request.path == 
"/api/pausejobs": + for jobId in data: + db.pauseJob(int(jobId)) + return 1 + if request.path == "/api/stopworkers": + for name in data: + db.stopWorker(name) + return 1 + if request.path == "/api/startworkers": + for name in data: + db.startWorker(name) + return 1 + if request.path == "/api/affinities": + db.setAffinities(data) + return 1 + if request.path == "/api/terminateworkers": + if servermode != "normal": # Cloud mode + for name in data: + db.cloudmanager.stopInstance(name) + db._setWorkerState(name, "TERMINATED") + return 1 + else: + return None + + # REST DELETE API + elif request.method == "DELETE": + if request.path == "/api/jobs": + for jobId in data: + deletedJobs = [] + db.deleteJob(int(jobId), deletedJobs) + for deleteJobId in deletedJobs: + self.deleteLog(deleteJobId) + return 1 + if request.path == "/api/workers": + for name in data: + db.deleteWorker(name) + return 1 + + result = api_rest () + if result != None: + # Only JSON right now + return json.dumps(result) + else: + # return server.NOT_DONE_YET + request.setResponseCode(404) + return "Web service not found." + except LdapError as error: + vprint(error) + request.setResponseCode(http.UNAUTHORIZED) + return "LDAP authorization required." 
+ + def getLog (self, jobId): + # Look for the job + log = "" + try: + logFile = open (getLogFilename (jobId), "r") + while (1): + # Read some lines of logs + line = logFile.readline() + # "" means EOF + if line == "": + break + log = log + line + logFile.close () + except IOError: + pass + return log + + def deleteLog (self, jobId): + # Look for the job + try: + os.remove (getLogFilename (jobId)) + except OSError: + pass class Workers(xmlrpc.XMLRPC): - """Unauthenticated XmlRPC server for Worker.""" - - def render (self, request): - with db: - vprint ("[" + request.method + "] "+request.path) - def getArg (name, default): - value = request.args.get (name, [default]) - return value[0] - - if request.path == "/workers/heartbeat": - return self.json_heartbeat (getArg ('hostname', ''), getArg ('jobId', '-1'), getArg ('log', ''), getArg ('load', '[0]'), getArg ('free_memory', '0'), getArg ('total_memory', '0'), request.getClientIP ()) - elif request.path == "/workers/pickjob": - return self.json_pickjob (getArg ('hostname', ''), getArg ('load', '[0]'), getArg ('free_memory', '0'), getArg ('total_memory', '0'), request.getClientIP ()) - elif request.path == "/workers/endjob": - return self.json_endjob (getArg ('hostname', ''), getArg ('jobId', '1'), getArg ('errorCode', '0'), request.getClientIP ()) - else: - # return server.NOT_DONE_YET - return xmlrpc.XMLRPC.render (self, request) - - def json_heartbeat (self, hostname, jobId, log, load, free_memory, total_memory, ip): - result = db.heartbeat (hostname, int(jobId), load, int(free_memory), int(total_memory), str(ip)) - if log != "" : - try: - logFile = open (getLogFilename (jobId), "a") - log = base64.decodestring(log) - - # Filter the log progression message - progress = None - job = db.getJob (int (jobId)) - progress_pattern = getattr (job, "progress_pattern", DefaultLocalProgressPattern) - if progress_pattern != "": - vprint ("progressPattern : \n" + str(progress_pattern)) - lp = None - gp = None - lFilter = 
getLogFilter (progress_pattern) - log, lp = lFilter.filterLogs (log) - if lp != None: - vprint ("lp : "+ str(lp)+"\n") - if lp != job['progress']: - db.setJobProgress (int (jobId), lp) - logFile.write (log) - if not result: - logFile.write ("KillJob: server required worker to kill job.\n") - logFile.close () - except IOError: - vprint ("Error in logs") - return result and "true" or "false" - - def json_pickjob (self, hostname, load, free_memory, total_memory, ip): - return str (db.pickJob (hostname, load, int(free_memory), int(total_memory), str(ip))) - - def json_endjob (self, hostname, jobId, errorCode, ip): - return str (db.endJob (hostname, int(jobId), int(errorCode), str(ip))) + """Unauthenticated XmlRPC server for Worker.""" + + def render (self, request): + with db: + vprint ("[" + request.method + "] "+request.path) + def getArg (name, default): + value = request.args.get (name, [default]) + return value[0] + + if request.path == "/workers/heartbeat": + return self.json_heartbeat (getArg ('hostname', ''), getArg ('jobId', '-1'), getArg ('log', ''), getArg ('load', '[0]'), getArg ('free_memory', '0'), getArg ('total_memory', '0'), request.getClientIP ()) + elif request.path == "/workers/pickjob": + return self.json_pickjob (getArg ('hostname', ''), getArg ('load', '[0]'), getArg ('free_memory', '0'), getArg ('total_memory', '0'), request.getClientIP ()) + elif request.path == "/workers/endjob": + return self.json_endjob (getArg ('hostname', ''), getArg ('jobId', '1'), getArg ('errorCode', '0'), request.getClientIP ()) + else: + # return server.NOT_DONE_YET + return xmlrpc.XMLRPC.render (self, request) + + def json_heartbeat (self, hostname, jobId, log, load, free_memory, total_memory, ip): + result = db.heartbeat (hostname, int(jobId), load, int(free_memory), int(total_memory), str(ip)) + if log != "" : + try: + logFile = open (getLogFilename (jobId), "a") + log = base64.decodestring(log) + + # Filter the log progression message + progress = None + job = 
db.getJob (int (jobId)) + progress_pattern = getattr (job, "progress_pattern", DefaultLocalProgressPattern) + if progress_pattern != "": + vprint ("progressPattern : \n" + str(progress_pattern)) + lp = None + gp = None + lFilter = getLogFilter (progress_pattern) + log, lp = lFilter.filterLogs (log) + if lp != None: + vprint ("lp : "+ str(lp)+"\n") + if lp != job['progress']: + db.setJobProgress (int (jobId), lp) + logFile.write (log) + if not result: + logFile.write ("KillJob: server required worker to kill job.\n") + logFile.close () + except IOError: + vprint ("Error in logs") + return result and "true" or "false" + + def json_pickjob (self, hostname, load, free_memory, total_memory, ip): + return str (db.pickJob (hostname, load, int(free_memory), int(total_memory), str(ip))) + + def json_endjob (self, hostname, jobId, errorCode, ip): + return str (db.endJob (hostname, int(jobId), int(errorCode), str(ip))) GErr=0 GOk=0 @@ -657,34 +657,34 @@ def json_endjob (self, hostname, jobId, errorCode, ip): # Go to the script directory global installDir, dataDir if sys.platform=="win32": - import _winreg - # under windows, uses the registry setup by the installer - try: - hKey = _winreg.OpenKey (_winreg.HKEY_LOCAL_MACHINE, "SOFTWARE\\Mercenaries Engineering\\Coalition", 0, _winreg.KEY_READ) - installDir, _type = _winreg.QueryValueEx (hKey, "Installdir") - dataDir, _type = _winreg.QueryValueEx (hKey, "Datadir") - except OSError: - installDir = "." - dataDir = "." + import _winreg + # under windows, uses the registry setup by the installer + try: + hKey = _winreg.OpenKey (_winreg.HKEY_LOCAL_MACHINE, "SOFTWARE\\Mercenaries Engineering\\Coalition", 0, _winreg.KEY_READ) + installDir, _type = _winreg.QueryValueEx (hKey, "Installdir") + dataDir, _type = _winreg.QueryValueEx (hKey, "Datadir") + except OSError: + installDir = "." + dataDir = "." else: - installDir = "." - dataDir = "." + installDir = "." + dataDir = "." 
os.chdir (installDir) # Create the logs/ directory try: - os.mkdir (dataDir + "/logs", 0755); + os.mkdir (dataDir + "/logs", 0755); except OSError: - pass + pass config = ConfigParser.SafeConfigParser() config.read ("coalition.ini") # Default config file values if not config.has_section('server'): - config.add_section("server") + config.add_section("server") if not config.has_option("server", "db_type"): - config.set ("server", "db_type", "sqlite") + config.set ("server", "db_type", "sqlite") port = cfgInt ('port', 19211) @@ -708,29 +708,29 @@ def json_endjob (self, hostname, jobId, errorCode, ip): UserCmdWhiteList = {} UserCmdWhiteListUser = None for line in _CmdWhiteList.splitlines (False): - _re = re.match ("^@(.*)", line) - if _re: - UserCmdWhiteListUser = _re.group(1) - if not UserCmdWhiteListUser in UserCmdWhiteList: - UserCmdWhiteList[UserCmdWhiteListUser] = [] - else: - if UserCmdWhiteListUser: - UserCmdWhiteList[UserCmdWhiteListUser].append (line) - else: - if not GlobalCmdWhiteList: - GlobalCmdWhiteList = [] - GlobalCmdWhiteList.append (line) + _re = re.match ("^@(.*)", line) + if _re: + UserCmdWhiteListUser = _re.group(1) + if not UserCmdWhiteListUser in UserCmdWhiteList: + UserCmdWhiteList[UserCmdWhiteListUser] = [] + else: + if UserCmdWhiteListUser: + UserCmdWhiteList[UserCmdWhiteListUser].append (line) + else: + if not GlobalCmdWhiteList: + GlobalCmdWhiteList = [] + GlobalCmdWhiteList.append (line) ldap_permissions = { - "ldaptemplatecreatejob": True, - "ldaptemplateviewjob": True, - "ldaptemplateeditjob": True, - "ldaptemplatedeletejob": True, - "ldaptemplatecreatejobglobal": True, - "ldaptemplateviewjobglobal": True, - "ldaptemplateeditjobglobal": True, - "ldaptemplatedeletejobglobal": True, - } + "ldaptemplatecreatejob": True, + "ldaptemplateviewjob": True, + "ldaptemplateeditjob": True, + "ldaptemplatedeletejob": True, + "ldaptemplatecreatejobglobal": True, + "ldaptemplateviewjobglobal": True, + "ldaptemplateeditjobglobal": True, + 
"ldaptemplatedeletejobglobal": True, + } DefaultLocalProgressPattern = "PROGRESS:%percent" DefaultGlobalProgressPattern = None @@ -745,123 +745,123 @@ def json_endjob (self, hostname, jobId, errorCode, ip): # Cloud mode servermode = cfgStr ('servermode', 'normal') if servermode != "normal": - cloudconfig = ConfigParser.SafeConfigParser() - if servermode == "aws": - cloudconfig.read("cloud_aws.ini") - elif servermode == "gcloud": - cloudconfig.read("cloud_gcloud.ini") - elif servermode == "qarnot_api": - cloudconfig.read("cloud_qarnot.ini") - cloudconfig.set("coalition", "port", str(port)) + cloudconfig = ConfigParser.SafeConfigParser() + if servermode == "aws": + cloudconfig.read("cloud_aws.ini") + elif servermode == "gcloud": + cloudconfig.read("cloud_gcloud.ini") + elif servermode == "qarnot_api": + cloudconfig.read("cloud_qarnot.ini") + cloudconfig.set("coalition", "port", str(port)) else: - cloudconfig = None + cloudconfig = None # Parse the options try: - opts, args = getopt.getopt(sys.argv[1:], "hp:vcs", ["help", "port=", - "verbose", "init", "migrate", "reset"]) - if len(args) != 0: - usage() - sys.exit(2) + opts, args = getopt.getopt(sys.argv[1:], "hp:vcs", ["help", "port=", + "verbose", "init", "migrate", "reset"]) + if len(args) != 0: + usage() + sys.exit(2) except getopt.GetoptError, err: - # print help information and exit: - print str(err) # will print something like "option -a not recognized" - usage() - sys.exit(2) + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) for o, a in opts: - if o in ("-h", "--help"): - usage () - sys.exit(2) - elif o in ("-v", "--verbose"): - verbose = True - elif o in ("-p", "--port"): - port = int(a) - elif o in ("--migrate"): - migratedb = True - elif o in ("--reset"): - resetdb = True - elif o in ("--init"): - initdb = True - else: - assert False, "unhandled option " + o - - if LDAPServer != "": - import ldap + if o in ("-h", "--help"): + 
usage () + sys.exit(2) + elif o in ("-v", "--verbose"): + verbose = True + elif o in ("-p", "--port"): + port = int(a) + elif o in ("--migrate"): + migratedb = True + elif o in ("--reset"): + resetdb = True + elif o in ("--init"): + initdb = True + else: + assert False, "unhandled option " + o + + if LDAPServer != "": + import ldap if not verbose or service: - try: - outfile = open(dataDir + '/server.log', 'a') - sys.stdout = outfile - sys.stderr = outfile - def _closeLogFile(): - outfile.close() - atexit.register(_closeLogFile) - except: - pass + try: + outfile = open(dataDir + '/server.log', 'a') + sys.stdout = outfile + sys.stderr = outfile + def _closeLogFile(): + outfile.close() + atexit.register(_closeLogFile) + except: + pass vprint ("[Init] --- Start ------------------------------------------------------------") print ("[Init] "+time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(time.time ()))) if service: - vprint ("[Init] Running service") + vprint ("[Init] Running service") else: - vprint ("[Init] Running standard console") + vprint ("[Init] Running standard console") # Init the good database if cfgStr ('db_type', 'sqlite') == "mysql": - vprint ("[Init] Use mysql") - from db_mysql import DBMySQL - db = DBMySQL (cfgStr ('db_mysql_host', "127.0.0.1"), cfgStr ('db_mysql_user', ""), cfgStr ('db_mysql_password', ""), cfgStr ('db_mysql_base', "base"), config=config, cloudconfig=cloudconfig) + vprint ("[Init] Use mysql") + from db_mysql import DBMySQL + db = DBMySQL (cfgStr ('db_mysql_host', "127.0.0.1"), cfgStr ('db_mysql_user', ""), cfgStr ('db_mysql_password', ""), cfgStr ('db_mysql_base', "base"), config=config, cloudconfig=cloudconfig) else: - vprint ("[Init] Use sqlite") - from db_sqlite import DBSQLite - db = DBSQLite (cfgStr ('db_sqlite_file', "coalition.db"), config=config, cloudconfig=cloudconfig) + vprint ("[Init] Use sqlite") + from db_sqlite import DBSQLite + db = DBSQLite (cfgStr ('db_sqlite_file', "coalition.db"), config=config, 
cloudconfig=cloudconfig) db.NotifyError = notifyError db.NotifyFinished = notifyFinished db.Verbose = verbose if initdb: - vprint ("[Init] Initial database setup") - if not db.initDatabase(): - exit(1) + vprint ("[Init] Initial database setup") + if not db.initDatabase(): + exit(1) if not len(db._getDatabaseTables()): - db.initDatabase () + db.initDatabase () with db: - requires_migration = db.requiresMigration() - if not migratedb and requires_migration: - print(dedent(""" - Coalition cannot start since the database schema and the source code - are not compatible. The database needs to be migrated. First the - database should be backuped in case the migration fails. Then, the - command 'coalition server.py --verbose --migrate' should be run. - Another option is to install the previous version of coalition code - that worked with the current database schema.""")) - exit(1) - - if migratedb and not requires_migration: - print(dedent(""" - The database does not require migration, but the '--migrate' parameter was provided.""")) - exit(1) - - if requires_migration and migratedb: - print(dedent(""" - Please consider doing a backup of the database first. Are you ready to proceed?""")) - if _interactiveConfirmation("Yes, proceed to migration!"): - success = db.migrateDatabase() - if success: - print("Database migration was successfull.") - exit(0) - else: - print("A problem occured during the database migration.") - exit(1) - else: - print("Database migration was cancelled by user.") - exit(0) - - if resetdb: - db.reset () + requires_migration = db.requiresMigration() + if not migratedb and requires_migration: + print(dedent(""" + Coalition cannot start since the database schema and the source code + are not compatible. The database needs to be migrated. First the + database should be backuped in case the migration fails. Then, the + command 'coalition server.py --verbose --migrate' should be run. 
+ Another option is to install the previous version of coalition code + that worked with the current database schema.""")) + exit(1) + + if migratedb and not requires_migration: + print(dedent(""" + The database does not require migration, but the '--migrate' parameter was provided.""")) + exit(1) + + if requires_migration and migratedb: + print(dedent(""" + Please consider doing a backup of the database first. Are you ready to proceed?""")) + if _interactiveConfirmation("Yes, proceed to migration!"): + success = db.migrateDatabase() + if success: + print("Database migration was successfull.") + exit(0) + else: + print("A problem occured during the database migration.") + exit(1) + else: + print("Database migration was cancelled by user.") + exit(0) + + if resetdb: + db.reset () LogFilterCache = {} @@ -869,47 +869,47 @@ def _closeLogFile(): ### Main manager ### if sys.platform=="win32" and service: - # Windows Service - import win32serviceutil - import win32service - import win32event - - class WindowsService(win32serviceutil.ServiceFramework): - _svc_name_ = "CoalitionServer" - _svc_display_name_ = "Coalition Server" - - def __init__(self, args): - vprint ("[Init] Service init") - win32serviceutil.ServiceFramework.__init__(self, args) - self.hWaitStop = win32event.CreateEvent(None, 0, 0, None) - - def SvcStop(self): - vprint ("[Stop] Service stop") - self.ReportServiceStatus(win32service.SERVICE_STOP_PENDING) - win32event.SetEvent(self.hWaitStop) - - def SvcDoRun(self): - vprint ("[Run] Service running") - import servicemanager - self.CheckForQuit() - main() - vprint ("Service quitting") - - def CheckForQuit(self): - vprint ("[Stop] Checking for quit...") - retval = win32event.WaitForSingleObject(self.hWaitStop, 10) - if not retval == win32event.WAIT_TIMEOUT: - # Received Quit from Win32 - reactor.stop() - - reactor.callLater(1.0, self.CheckForQuit) - - if __name__=='__main__': - win32serviceutil.HandleCommandLine(WindowsService) + # Windows Service + import 
win32serviceutil + import win32service + import win32event + + class WindowsService(win32serviceutil.ServiceFramework): + _svc_name_ = "CoalitionServer" + _svc_display_name_ = "Coalition Server" + + def __init__(self, args): + vprint ("[Init] Service init") + win32serviceutil.ServiceFramework.__init__(self, args) + self.hWaitStop = win32event.CreateEvent(None, 0, 0, None) + + def SvcStop(self): + vprint ("[Stop] Service stop") + self.ReportServiceStatus(win32service.SERVICE_STOP_PENDING) + win32event.SetEvent(self.hWaitStop) + + def SvcDoRun(self): + vprint ("[Run] Service running") + import servicemanager + self.CheckForQuit() + main() + vprint ("Service quitting") + + def CheckForQuit(self): + vprint ("[Stop] Checking for quit...") + retval = win32event.WaitForSingleObject(self.hWaitStop, 10) + if not retval == win32event.WAIT_TIMEOUT: + # Received Quit from Win32 + reactor.stop() + + reactor.callLater(1.0, self.CheckForQuit) + + if __name__=='__main__': + win32serviceutil.HandleCommandLine(WindowsService) else: - # Simple server - if __name__ == '__main__': - main() + # Simple server + if __name__ == '__main__': + main() # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/tests/main_test.py b/tests/main_test.py index 4ff12d9..0719453 100755 --- a/tests/main_test.py +++ b/tests/main_test.py @@ -12,502 +12,502 @@ VERBOSITY = 5 def test_server_python_api(): - tests = [ - "test_newJob", - "test_getJob", - "test_getWorkers", - "test_editWorkers", - "test_priorities", - "test_affinities_first", - "test_setJobDependencies", - "test_states", - "test_children_finish_before_parent", - "test_no_job_error",] - suite = unittest.TestSuite(map(ServerPythonApiTestCase, tests)) - return unittest.TextTestRunner(verbosity=VERBOSITY).run(suite) + tests = [ + "test_newJob", + "test_getJob", + "test_getWorkers", + "test_editWorkers", + "test_priorities", + "test_affinities_first", + "test_setJobDependencies", + "test_states", + 
"test_children_finish_before_parent", + "test_no_job_error",] + suite = unittest.TestSuite(map(ServerPythonApiTestCase, tests)) + return unittest.TextTestRunner(verbosity=VERBOSITY).run(suite) def test_server_xmlrpc(): - tests = ['test_setJobDependencies',] - suite = unittest.TestSuite(map(ServerXmlrpcTestCase, tests)) - unittest.TextTestRunner(verbosity=VERBOSITY).run(suite) + tests = ['test_setJobDependencies',] + suite = unittest.TestSuite(map(ServerXmlrpcTestCase, tests)) + unittest.TextTestRunner(verbosity=VERBOSITY).run(suite) def launch_server(): - """Launch a coalition server.""" - # The --init parameter prevents database overwriting. The database has to be - # initially empty. - cmd = ["python", "server.py"] - return subprocess.Popen(cmd) + """Launch a coalition server.""" + # The --init parameter prevents database overwriting. The database has to be + # initially empty. + cmd = ["python", "server.py"] + return subprocess.Popen(cmd) def launch_worker(identifier): - """Launch coalition worker.""" - cmd = ["python", "worker.py", "-n", identifier, "http://{}:{}".format(HOST,PORT)] - return subprocess.Popen(cmd) + """Launch coalition worker.""" + cmd = ["python", "worker.py", "-n", identifier, "http://{}:{}".format(HOST,PORT)] + return subprocess.Popen(cmd) class ServerPythonApiTestCase(unittest.TestCase): - @classmethod - def setUpClass(self): - self.server = launch_server() - time.sleep(5) - if self.server.poll() is not None: - print("Server failed to start.") - exit(1) - self.workers = [launch_worker("worker-{}".format(i)) for i in range(2)] - self.conn = coalition.Connection(HOST, PORT) - affinities = dict() - for i in range(1, 65): - affinities[str(i)] = "" - affinities["1"] = "linux" - affinities["2"] = "win" - affinities["3"] = "windows project" - affinities["4"] = "windows" - affinities["5"] = "dos" - self.conn.setAffinities(affinities) - self.depJobID = self.conn.newJob(command="echo dependencies", title="jobDependencies", state='PAUSED') - 
self.parentID = self.conn.newJob(title="parent") - self.childrenID = [self.conn.newJob(command="echo 'job-{}'".format(i), - title="job-{}".format(i), parent=self.parentID, state='PAUSED') for i in range(NUM_JOBS)] - self.firstSleepJobId = self.conn.newJob(command="sleep 2", title="First Job", state="WAITING", affinity="linux", priority=129) - self.secondSleepJobId = self.conn.newJob(command="sleep 2", - title="Second Job", state="WAITING", affinity="linux", priority=128) - self.windowsProjectJobId = self.conn.newJob(command="sleep 1", title="windows project", state="WAITING", affinity="windows project", priority=127) - self.winJobId = self.conn.newJob(command="sleep 1", title="Win", state="WAITING", affinity="win", priority=127) - self.dosJobId = self.conn.newJob(command="sleep 1", title="Dos", state="WAITING", affinity="dos", priority=127) - self.basicJobId = self.conn.newJob(command="sleep 1", title="Basic", - state="WAITING", priority=300) - - @classmethod - def tearDownClass(self): - for worker in self.workers: - worker.terminate() - self.server.terminate() - - def test_newJob(self): - self.assertNotEqual(self.depJobID, None) - self.assertNotEqual(self.parentID, None) - - def test_getJob(self): - depJob = self.conn.getJob(self.depJobID) - firstSleepJob = self.conn.getJob(self.firstSleepJobId) - secondSleepJob = self.conn.getJob(self.secondSleepJobId) - - self.assertEqual(depJob.id, self.depJobID) - self.assertEqual(depJob.title, 'jobDependencies') - self.assertEqual(depJob.command, 'echo dependencies') - self.assertEqual(depJob.state, "PAUSED") - - self.assertEqual(firstSleepJob.id, self.firstSleepJobId) - self.assertEqual(firstSleepJob.title, "First Job") - #self.assertEqual(firstSleepJob.state, "WAITING") - self.assertEqual(firstSleepJob.affinity, "linux") - self.assertEqual(firstSleepJob.priority, 129) - - self.assertEqual(secondSleepJob.id, self.secondSleepJobId) - self.assertEqual(secondSleepJob.title, "Second Job") - 
#self.assertEqual(secondSleepJob.state, "WAITING") - self.assertEqual(secondSleepJob.affinity, "linux") - self.assertEqual(secondSleepJob.priority, 128) - - def test_getWorkers(self): - for k in range(1,10): - workers = self.conn.getWorkers() - if len(workers) >= 2: - break - time.sleep (1) - - self.assertEqual(len(workers), 2) - - def test_editWorkers(self): - workers = self.conn.getWorkers() - new_workers = dict() - new_workers[workers[0]['name']] = dict() - new_workers[workers[0]['name']]['affinity'] = "windows\nlinux" - new_workers[workers[1]['name']] = dict() - new_workers[workers[1]['name']]['affinity'] = "windows project\nwin\ndos" - self.assertEqual(self.conn.editWorkers(new_workers), '1') - - def test_priorities(self): - firstSleepJob = self.conn.getJob(self.firstSleepJobId) - secondSleepJob = self.conn.getJob(self.secondSleepJobId) - while not (secondSleepJob.state == "FINISHED") & (firstSleepJob.state == "FINISHED"): - time.sleep(1) - firstSleepJob = self.conn.getJob(self.firstSleepJobId) - secondSleepJob = self.conn.getJob(self.secondSleepJobId) - self.assertTrue(secondSleepJob.start_time > firstSleepJob.start_time) - - def test_affinities_first(self): - windowsProjectJob = self.conn.getJob( self.windowsProjectJobId ) - winJob = self.conn.getJob( self.winJobId ) - dosJob = self.conn.getJob( self.dosJobId ) - basicJob = self.conn.getJob( self.basicJobId ) - - while not ( windowsProjectJob.state == "FINISHED" ) & ( winJob.state == "FINISHED" ) & ( dosJob.state == "FINISHED" ) & ( basicJob.state == "FINISHED" ): - windowsProjectJob = self.conn.getJob( self.windowsProjectJobId ) - winJob = self.conn.getJob( self.winJobId ) - dosJob = self.conn.getJob( self.dosJobId ) - basicJob = self.conn.getJob( self.basicJobId ) - - self.assertTrue( windowsProjectJob.start_time < winJob.start_time ) - self.assertTrue( winJob.start_time < dosJob.start_time ) - self.assertTrue( windowsProjectJob.start_time < dosJob.start_time ) - self.assertTrue( ( dosJob.start_time < 
basicJob.start_time ) | ( dosJob.worker != basicJob.worker ) ) - - def test_setJobDependencies(self): - self.conn.setJobDependencies (self.depJobID, self.childrenID) - depsJobs = self.conn.getJobDependencies (self.depJobID) - depsID = [job.id for job in depsJobs] - with self.conn: - depJob = self.conn.getJob(self.depJobID) - depJob.state = "WAITING" - self.assertTrue(any(map(lambda v: v in self.childrenID, depsID))) - - def test_states(self): - with self.conn: - for job in self.conn.getJobChildren(self.parentID): - job.state = "WAITING" - depJob = self.conn.getJob (self.depJobID) - self.assertNotEqual(depJob.state, "PAUSED") - - def test_children_finish_before_parent(self): - for i in self.childrenID: - job = self.conn.getJob (i) - with self.conn: - job.state = "WAITING" - - depJob = self.conn.getJob (self.depJobID) - while depJob.state != "FINISHED": - depJob = self.conn.getJob (self.depJobID) - time.sleep(1) - - def test_no_job_error(self): - parent = self.conn.getJob (self.parentID) - self.assertEqual (parent.state, "FINISHED") - self.assertEqual (parent.working, 0) - self.assertEqual (parent.errors, 0) - self.assertEqual (parent.run_done, 0) + @classmethod + def setUpClass(self): + self.server = launch_server() + time.sleep(5) + if self.server.poll() is not None: + print("Server failed to start.") + exit(1) + self.workers = [launch_worker("worker-{}".format(i)) for i in range(2)] + self.conn = coalition.Connection(HOST, PORT) + affinities = dict() + for i in range(1, 65): + affinities[str(i)] = "" + affinities["1"] = "linux" + affinities["2"] = "win" + affinities["3"] = "windows project" + affinities["4"] = "windows" + affinities["5"] = "dos" + self.conn.setAffinities(affinities) + self.depJobID = self.conn.newJob(command="echo dependencies", title="jobDependencies", state='PAUSED') + self.parentID = self.conn.newJob(title="parent") + self.childrenID = [self.conn.newJob(command="echo 'job-{}'".format(i), + title="job-{}".format(i), parent=self.parentID, 
state='PAUSED') for i in range(NUM_JOBS)] + self.firstSleepJobId = self.conn.newJob(command="sleep 2", title="First Job", state="WAITING", affinity="linux", priority=129) + self.secondSleepJobId = self.conn.newJob(command="sleep 2", + title="Second Job", state="WAITING", affinity="linux", priority=128) + self.windowsProjectJobId = self.conn.newJob(command="sleep 1", title="windows project", state="WAITING", affinity="windows project", priority=127) + self.winJobId = self.conn.newJob(command="sleep 1", title="Win", state="WAITING", affinity="win", priority=127) + self.dosJobId = self.conn.newJob(command="sleep 1", title="Dos", state="WAITING", affinity="dos", priority=127) + self.basicJobId = self.conn.newJob(command="sleep 1", title="Basic", + state="WAITING", priority=300) + + @classmethod + def tearDownClass(self): + for worker in self.workers: + worker.terminate() + self.server.terminate() + + def test_newJob(self): + self.assertNotEqual(self.depJobID, None) + self.assertNotEqual(self.parentID, None) + + def test_getJob(self): + depJob = self.conn.getJob(self.depJobID) + firstSleepJob = self.conn.getJob(self.firstSleepJobId) + secondSleepJob = self.conn.getJob(self.secondSleepJobId) + + self.assertEqual(depJob.id, self.depJobID) + self.assertEqual(depJob.title, 'jobDependencies') + self.assertEqual(depJob.command, 'echo dependencies') + self.assertEqual(depJob.state, "PAUSED") + + self.assertEqual(firstSleepJob.id, self.firstSleepJobId) + self.assertEqual(firstSleepJob.title, "First Job") + #self.assertEqual(firstSleepJob.state, "WAITING") + self.assertEqual(firstSleepJob.affinity, "linux") + self.assertEqual(firstSleepJob.priority, 129) + + self.assertEqual(secondSleepJob.id, self.secondSleepJobId) + self.assertEqual(secondSleepJob.title, "Second Job") + #self.assertEqual(secondSleepJob.state, "WAITING") + self.assertEqual(secondSleepJob.affinity, "linux") + self.assertEqual(secondSleepJob.priority, 128) + + def test_getWorkers(self): + for k in range(1,10): + 
workers = self.conn.getWorkers() + if len(workers) >= 2: + break + time.sleep (1) + + self.assertEqual(len(workers), 2) + + def test_editWorkers(self): + workers = self.conn.getWorkers() + new_workers = dict() + new_workers[workers[0]['name']] = dict() + new_workers[workers[0]['name']]['affinity'] = "windows\nlinux" + new_workers[workers[1]['name']] = dict() + new_workers[workers[1]['name']]['affinity'] = "windows project\nwin\ndos" + self.assertEqual(self.conn.editWorkers(new_workers), '1') + + def test_priorities(self): + firstSleepJob = self.conn.getJob(self.firstSleepJobId) + secondSleepJob = self.conn.getJob(self.secondSleepJobId) + while not (secondSleepJob.state == "FINISHED") & (firstSleepJob.state == "FINISHED"): + time.sleep(1) + firstSleepJob = self.conn.getJob(self.firstSleepJobId) + secondSleepJob = self.conn.getJob(self.secondSleepJobId) + self.assertTrue(secondSleepJob.start_time > firstSleepJob.start_time) + + def test_affinities_first(self): + windowsProjectJob = self.conn.getJob( self.windowsProjectJobId ) + winJob = self.conn.getJob( self.winJobId ) + dosJob = self.conn.getJob( self.dosJobId ) + basicJob = self.conn.getJob( self.basicJobId ) + + while not ( windowsProjectJob.state == "FINISHED" ) & ( winJob.state == "FINISHED" ) & ( dosJob.state == "FINISHED" ) & ( basicJob.state == "FINISHED" ): + windowsProjectJob = self.conn.getJob( self.windowsProjectJobId ) + winJob = self.conn.getJob( self.winJobId ) + dosJob = self.conn.getJob( self.dosJobId ) + basicJob = self.conn.getJob( self.basicJobId ) + + self.assertTrue( windowsProjectJob.start_time < winJob.start_time ) + self.assertTrue( winJob.start_time < dosJob.start_time ) + self.assertTrue( windowsProjectJob.start_time < dosJob.start_time ) + self.assertTrue( ( dosJob.start_time < basicJob.start_time ) | ( dosJob.worker != basicJob.worker ) ) + + def test_setJobDependencies(self): + self.conn.setJobDependencies (self.depJobID, self.childrenID) + depsJobs = self.conn.getJobDependencies 
(self.depJobID) + depsID = [job.id for job in depsJobs] + with self.conn: + depJob = self.conn.getJob(self.depJobID) + depJob.state = "WAITING" + self.assertTrue(any(map(lambda v: v in self.childrenID, depsID))) + + def test_states(self): + with self.conn: + for job in self.conn.getJobChildren(self.parentID): + job.state = "WAITING" + depJob = self.conn.getJob (self.depJobID) + self.assertNotEqual(depJob.state, "PAUSED") + + def test_children_finish_before_parent(self): + for i in self.childrenID: + job = self.conn.getJob (i) + with self.conn: + job.state = "WAITING" + + depJob = self.conn.getJob (self.depJobID) + while depJob.state != "FINISHED": + depJob = self.conn.getJob (self.depJobID) + time.sleep(1) + + def test_no_job_error(self): + parent = self.conn.getJob (self.parentID) + self.assertEqual (parent.state, "FINISHED") + self.assertEqual (parent.working, 0) + self.assertEqual (parent.errors, 0) + self.assertEqual (parent.run_done, 0) class ServerXmlrpcTestCase(unittest.TestCase): - @classmethod - def setUpClass(self): - self.server = launch_server() - self.workers = [launch_worker("worker1"), launch_worker("worker2")] - self.conn = coalition.Connection(HOST, PORT) - - @classmethod - def tearDownClass(self): - for worker in self.workers: - worker.terminate() - self.server.terminate() - - def test_setJobDependencies(self): - """set job2 dependent on job1""" - job1 = self.conn.newJob(0, "Test-1", "ls /", ".", "", "WAITING", False, 100, 15, "" , "", "", "")['id'] - job2 = self.conn.newJob(0, "Test-2", "ls /", ".", "", "WAITING", False, 100, 15, "" , "", "", "")['id'] - self.conn.setJobDependencies(job2, [job1]) - self.assertEqual(len(self.getJobDependencies(job2)), 1) - self.assertEqual(self.getJob(job2)['state'], "PENDING") - - def test_pick_job(self): - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") - assertEqual(pick1[0], job1) - assertEqual(self.getWorker ('worker1') ['state'], "WORKING") - 
assertEqual(self.getJob(job1)['state'], "WORKING") - assertEqual(pick2[0], -1) - assertEqual(self.getJob(job2)['state'], "PENDING") - - def test_heartbeats(self): - h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') - h2 = self.heartbeat ("worker2", pick2[0], 1, 1, 1, '127.0.0.1') - assertIsNotNone (h1) - assertEqual(self.getWorker('worker1')['state'], "WORKING") - assertIsNone(h2) - assertEqual(self.getWorker('worker2')['state'], "WAITING") - - def test_worker1_finish_job(self): - self.endJob("worker1", pick1[0], 0, "127.0.0.1") - assertEqual(self.getWorker('worker1')['state'], "WAITING") - assertEqual(self.getJob(job1)['state'], "FINISHED") - assertEqual(self.getJob(job2)['state'], "WAITING") - - def test_worker1_pick_job(self): - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual(pick1[0], job2) - assertEqual(self.getWorker('worker1')['state'], "WORKING") - assertEqual(self.getJob(job2)['state'], "WORKING") - - def test_worker1_finish_job(self): - self.endJob ("worker1", pick1[0], 0, "127.0.0.1") - assertEqual(self.getWorker('worker1')['state'], "WAITING") - assertEqual(self.getJob(job1) ['state'], "FINISHED") - - def test_worker2_pick_job(self): - pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") - assertEqual (pick2[0], -1) - assertEqual (self.getWorker ('worker2') ['state'], "WAITING") - - def test_worker2_pick_job(self): - job3 = self.newJob (0, "Test-1", "ls /", ".", "", "PAUSED", False, 100, 15, "" , "", "", "") ['id'] - pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") - assertEqual (pick2[0], -1) - assertEqual (self.getJob (job3) ['state'], "PAUSED") - assertEqual (self.getWorker ('worker2') ['state'], "WAITING") - - def test_start_job3(self): - self.startJob (job3) - assertEqual (self.getJob (job3)['state'], "WAITING") - - def test_worker2_pick_job(self): - pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") - assertEqual (pick2[0], job3) - assertEqual (self.getJob (job3) ['state'], "WORKING") - assertEqual 
(self.getWorker ('worker2') ['state'], "WORKING") - - def test_worker2_error_job(self): - self.endJob ("worker2", pick2[0], 1, "127.0.0.1") - assertEqual (self.getJob (job3) ['state'], "ERROR") - assertEqual (self.getWorker ('worker2') ['state'], "WAITING") - - def test_worker2_pick_job(sefl): - pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") - assertEqual (pick2[0], -1) - - def test_reset_job3(self): - self.resetJob (job3) - assertEqual (self.getJob (job3)['state'], "WAITING") - - def test_worker2_pick_job(self): - pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") - assertEqual (pick2[0], job3) - - def test_worker2_end_job(self): - self.endJob ("worker2", pick2[0], 0, "127.0.0.1") - assertEqual (self.getJob (job3) ['state'], "FINISHED") - assertEqual (self.getWorker ('worker2') ['state'], "WAITING") - - def test_worker1_pick_job(self): - job4 = self.newJob (0, "Test-1", "ls /", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] - job5 = self.newJob (0, "Test-2", "ls /", ".", "", "WAITING", False, 100, 12, "" , "", "", "") ['id'] - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual (pick1[0], job5) - - def test_delete_job4(self): - self.deleteJob (job4) - assertIsNone(self.getJob(job4)) - - def test_worker1_heartbeats(self): - h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') - assertIsNtNone (h1) - assertEqual(self.getWorker('worker1')['state'], "WORKING") - - def test_delete_job5(self): - self.deleteJob (job5) - assertIsNone(self.getJob(job5)) - - def test_worker1_heartbeats(self): - h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') - assertIsNone(h1) - assertEqual(self.getWorker('worker1')['state'], "WAITING") - - def test_pause_jobs(self): - job6 = self.newJob (0, "Test-1", "ls /", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] - self.pauseJob (job6) - assertEqual (self.getJob (job6) ['state'], "PAUSED") - assertEqual (self.getJob (job6) ['h_paused'], True) - - def 
test_worker1_pick_job(self): - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual (pick1[0], -1) - - def test_start_job6(self): - self.startJob (job6) - assertEqual (self.getJob (job6) ['state'], "WAITING") - assertEqual (self.getJob (job6) ['h_paused'], False) - - def test_worker1_pick_job(self): - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual (pick1[0], job6) - assertEqual (self.getJob (job6) ['state'], "WORKING") - - def test_worker1_heartbeats(self): - h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') - assertIsNotNone (h1) - - def test_pause_job6(self): - self.pauseJob (job6) - assertEqual (self.getJob (job6) ['state'], "PAUSED") - assertEqual (self.getJob (job6) ['h_paused'], True) - - def test_worker1_heartbeats(self): - h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') - assertIsNone(h1) - - def test_start_job6(self): - self.startJob (job6) - assertEqual (self.getJob (job6) ['state'], "WAITING") - assertEqual (self.getJob (job6) ['h_paused'], False) - - def test_worker1_pick_job(self): - self.stopWorker ("worker1") - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual (pick1[0], -1) - assertEqual (self.getWorker ('worker1') ['state'], "WAITING") - - def test_worker1_heartbeats(self): - h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') - assertIsNone(h1) - - def test_worker1_pick_job(self): - self.startWorker ("worker1") - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual (pick1[0], job6) - assertEqual (self.getJob (job6) ['state'], "WORKING") - - def test_worker1_heartbeats(self): - h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') - assertIsNotNone(h1) - - def test_worker1_deleted(self): - self.deleteWorker ("worker1") - h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') - assertIsNotNone(h1) - - def test_worker1_stopped(self): - self.stopWorker ("worker1") - h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, 
'127.0.0.1') - assertIsNone(h1) - - def test_delete_job6(self): - self.deleteJob (job6) - assertIsNone(self.getJob (job6)) - - def test_worker1_pick_job(self): - self.startWorker ("worker1") - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual(pick1[0], -1) - - def test_create_job7(self): - job7 = self.newJob (0, "Parent-1", "", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual(pick1[0], -1) - - def test_create_job8(self): - job8 = self.newJob (job7, "Child-1", "ls /", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual (pick1[0], job8) - - def test_worker1_heartbeats(self): - h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') - assertIsNotNone(h1) - - def test_worker1_finish_job_and_change_job7_priority(self): - self.endJob ("worker1", pick1[0], 0, "127.0.0.1") - job8prio = self.getJob (job8) ['h_priority'] - self.editJobs ({ job7: { "priority": 12 } }) - assertLess(job8prio, self.getJob(job8)['h_priority']) - - def test_worker1_pick_job12(self): - job9 = self.newJob (0, "Parent-2", "", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] - job10 = self.newJob (0, "Parent-3", "", ".", "", "WAITING", False, 100, 11, "" , "", "", "") ['id'] - job11 = self.newJob (job9, "Child-2", "ls /", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] - job12 = self.newJob (job10, "Child-3", "ls /", ".", "", "WAITING", False, 100, 8, "" , "", "", "") ['id'] - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assert (pick1[0] == job12) - - def test_worker1_pick_job11(self): - self.endJob ("worker1", pick1[0], 0, "127.0.0.1") - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assert (pick1[0] == job11) - - def test_worker1_finish_job(self): - self.endJob ("worker1", pick1[0], 0, "127.0.0.1") - - - def test_intricate_dependencies(self): - # performs intricate 
dependencies testing - # like a group dependent on a paused job - # and job dependent on this group - job20 = self.newJob (0, "job20", "sleep 2", ".", "", "PAUSED", False, 0, 100, "" , "", "", "") ['id'] - job21 = self.newJob (0, "job21", "", ".", "", "WAITING", False, 0, 100, "" , "", "", "") ['id'] - self.setJobDependencies (job21, [ job20 ]) - job22 = self.newJob (job21, "job22", "sleep 2", ".", "", "WAITING", False, 0, 100, "" , "", "", "") ['id'] - job23 = self.newJob (0, "job23", "sleep 2", ".", "", "WAITING", False, 0, 150, "" , "", "", "") ['id'] - self.setJobDependencies (job23, [ job21 ]) - job24 = self.newJob (0, "job24", "sleep 2", ".", "", "WAITING", False, 0, 200, "" , "", "", "") ['id'] - self.setJobDependencies (job24, [ job20 ]) - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - # can't start any job, all are dependent on job20 which is paused - assertEqual(pick1[0], -1) - assertEqual(self.getJob(job20)['state'], 'PAUSED') - assertEqual(self.getJob(job21)['state'], 'PENDING') - assertEqual(self.getJob(job22)['state'], 'WAITING') - assertIsNotNone(self.getJob(job22)['h_paused']) - assertEqual(self.getJob(job23)['state'], 'PENDING') - assertEqual(self.getJob(job24)['state'], 'PENDING') - - def test_can_only_pick_jobs20(self): - self.startJob (job20) - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual(pick1[0], job20) - - def test_worker1_finish_job(self): - self.endJob("worker1", pick1[0], 0, "127.0.0.1") - assertEqual(self.getJob(job20)['state'], 'FINISHED') - assertEqual(self.getJob(job21)['state'], 'WAITING') - assertEqual(self.getJob(job22)['state'], 'WAITING') - assertNotNone(not self.getJob(job22) ['h_paused']) - assertEqual(self.getJob(job23)['state'], 'PENDING') - assertEqual(self.getJob(job24)['state'], 'WAITING') - - def test_worker1_pick_jobs24(self): - # pick job24 as it is the top priority job - pick1 = self.pickJob("worker1", 1, 1, 1, "127.0.0.1") - assertEqual(pick1[0], job24) - - def 
test_worker1_finish_job(self): - self.endJob ("worker1", pick1[0], 0, "127.0.0.1") - assertEqual(self.getJob(job20)['state'], 'FINISHED') - assertEqual(self.getJob(job21)['state'], 'WAITING') - assertEqual(self.getJob(job22)['state'], 'WAITING') - assertIsNone(self.getJob(job22)['h_paused']) - assertEqual(self.getJob(job23)['state'], 'PENDING') - assertEqual(self.getJob(job24)['state'], 'FINISHED') - - def test_pick_job22(self): - # pick job22 as job23 is still pending and job21 can't be picked as a group - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assertEqual(pick1[0], job22) - - def test_worker1_finish_job(self): - self.endJob ("worker1", pick1[0], 0, "127.0.0.1") - - assertEqual (self.getJob (job20) ['state'], 'FINISHED') - assertEqual (self.getJob (job21) ['state'], 'FINISHED') - assertEqual (self.getJob (job22) ['state'], 'FINISHED') - assertEqual (self.getJob (job23) ['state'], 'WAITING') - assertEqual (self.getJob (job24) ['state'], 'FINISHED') - - def test_worker1_pick_job23(self): - # and eventually pick job23 - pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") - assert (pick1[0] == job23) - def test_worker1_finish_jobs(self): - self.endJob ("worker1", pick1[0], 0, "127.0.0.1") - - assertEqual (self.getJob (job20) ['state'], 'FINISHED') - assertEqual (self.getJob (job21) ['state'], 'FINISHED') - assertEqual (self.getJob (job22) ['state'], 'FINISHED') - assertEqual (self.getJob (job23) ['state'], 'FINISHED') - assertEqual (self.getJob (job24) ['state'], 'FINISHED') + @classmethod + def setUpClass(self): + self.server = launch_server() + self.workers = [launch_worker("worker1"), launch_worker("worker2")] + self.conn = coalition.Connection(HOST, PORT) + + @classmethod + def tearDownClass(self): + for worker in self.workers: + worker.terminate() + self.server.terminate() + + def test_setJobDependencies(self): + """set job2 dependent on job1""" + job1 = self.conn.newJob(0, "Test-1", "ls /", ".", "", "WAITING", False, 100, 15, "" , "", "", 
"")['id'] + job2 = self.conn.newJob(0, "Test-2", "ls /", ".", "", "WAITING", False, 100, 15, "" , "", "", "")['id'] + self.conn.setJobDependencies(job2, [job1]) + self.assertEqual(len(self.getJobDependencies(job2)), 1) + self.assertEqual(self.getJob(job2)['state'], "PENDING") + + def test_pick_job(self): + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") + assertEqual(pick1[0], job1) + assertEqual(self.getWorker ('worker1') ['state'], "WORKING") + assertEqual(self.getJob(job1)['state'], "WORKING") + assertEqual(pick2[0], -1) + assertEqual(self.getJob(job2)['state'], "PENDING") + + def test_heartbeats(self): + h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') + h2 = self.heartbeat ("worker2", pick2[0], 1, 1, 1, '127.0.0.1') + assertIsNotNone (h1) + assertEqual(self.getWorker('worker1')['state'], "WORKING") + assertIsNone(h2) + assertEqual(self.getWorker('worker2')['state'], "WAITING") + + def test_worker1_finish_job(self): + self.endJob("worker1", pick1[0], 0, "127.0.0.1") + assertEqual(self.getWorker('worker1')['state'], "WAITING") + assertEqual(self.getJob(job1)['state'], "FINISHED") + assertEqual(self.getJob(job2)['state'], "WAITING") + + def test_worker1_pick_job(self): + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assertEqual(pick1[0], job2) + assertEqual(self.getWorker('worker1')['state'], "WORKING") + assertEqual(self.getJob(job2)['state'], "WORKING") + + def test_worker1_finish_job(self): + self.endJob ("worker1", pick1[0], 0, "127.0.0.1") + assertEqual(self.getWorker('worker1')['state'], "WAITING") + assertEqual(self.getJob(job1) ['state'], "FINISHED") + + def test_worker2_pick_job(self): + pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") + assertEqual (pick2[0], -1) + assertEqual (self.getWorker ('worker2') ['state'], "WAITING") + + def test_worker2_pick_job(self): + job3 = self.newJob (0, "Test-1", "ls /", ".", "", "PAUSED", False, 100, 15, "" , "", "", "") 
['id'] + pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") + assertEqual (pick2[0], -1) + assertEqual (self.getJob (job3) ['state'], "PAUSED") + assertEqual (self.getWorker ('worker2') ['state'], "WAITING") + + def test_start_job3(self): + self.startJob (job3) + assertEqual (self.getJob (job3)['state'], "WAITING") + + def test_worker2_pick_job(self): + pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") + assertEqual (pick2[0], job3) + assertEqual (self.getJob (job3) ['state'], "WORKING") + assertEqual (self.getWorker ('worker2') ['state'], "WORKING") + + def test_worker2_error_job(self): + self.endJob ("worker2", pick2[0], 1, "127.0.0.1") + assertEqual (self.getJob (job3) ['state'], "ERROR") + assertEqual (self.getWorker ('worker2') ['state'], "WAITING") + + def test_worker2_pick_job(sefl): + pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") + assertEqual (pick2[0], -1) + + def test_reset_job3(self): + self.resetJob (job3) + assertEqual (self.getJob (job3)['state'], "WAITING") + + def test_worker2_pick_job(self): + pick2 = self.pickJob ("worker2", 1, 1, 1, "127.0.0.1") + assertEqual (pick2[0], job3) + + def test_worker2_end_job(self): + self.endJob ("worker2", pick2[0], 0, "127.0.0.1") + assertEqual (self.getJob (job3) ['state'], "FINISHED") + assertEqual (self.getWorker ('worker2') ['state'], "WAITING") + + def test_worker1_pick_job(self): + job4 = self.newJob (0, "Test-1", "ls /", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] + job5 = self.newJob (0, "Test-2", "ls /", ".", "", "WAITING", False, 100, 12, "" , "", "", "") ['id'] + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assertEqual (pick1[0], job5) + + def test_delete_job4(self): + self.deleteJob (job4) + assertIsNone(self.getJob(job4)) + + def test_worker1_heartbeats(self): + h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') + assertIsNtNone (h1) + assertEqual(self.getWorker('worker1')['state'], "WORKING") + + def test_delete_job5(self): + self.deleteJob 
(job5) + assertIsNone(self.getJob(job5)) + + def test_worker1_heartbeats(self): + h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') + assertIsNone(h1) + assertEqual(self.getWorker('worker1')['state'], "WAITING") + + def test_pause_jobs(self): + job6 = self.newJob (0, "Test-1", "ls /", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] + self.pauseJob (job6) + assertEqual (self.getJob (job6) ['state'], "PAUSED") + assertEqual (self.getJob (job6) ['h_paused'], True) + + def test_worker1_pick_job(self): + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assertEqual (pick1[0], -1) + + def test_start_job6(self): + self.startJob (job6) + assertEqual (self.getJob (job6) ['state'], "WAITING") + assertEqual (self.getJob (job6) ['h_paused'], False) + + def test_worker1_pick_job(self): + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assertEqual (pick1[0], job6) + assertEqual (self.getJob (job6) ['state'], "WORKING") + + def test_worker1_heartbeats(self): + h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') + assertIsNotNone (h1) + + def test_pause_job6(self): + self.pauseJob (job6) + assertEqual (self.getJob (job6) ['state'], "PAUSED") + assertEqual (self.getJob (job6) ['h_paused'], True) + + def test_worker1_heartbeats(self): + h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') + assertIsNone(h1) + + def test_start_job6(self): + self.startJob (job6) + assertEqual (self.getJob (job6) ['state'], "WAITING") + assertEqual (self.getJob (job6) ['h_paused'], False) + + def test_worker1_pick_job(self): + self.stopWorker ("worker1") + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assertEqual (pick1[0], -1) + assertEqual (self.getWorker ('worker1') ['state'], "WAITING") + + def test_worker1_heartbeats(self): + h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') + assertIsNone(h1) + + def test_worker1_pick_job(self): + self.startWorker ("worker1") + pick1 = self.pickJob ("worker1", 1, 1, 1, 
"127.0.0.1") + assertEqual (pick1[0], job6) + assertEqual (self.getJob (job6) ['state'], "WORKING") + + def test_worker1_heartbeats(self): + h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') + assertIsNotNone(h1) + + def test_worker1_deleted(self): + self.deleteWorker ("worker1") + h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') + assertIsNotNone(h1) + + def test_worker1_stopped(self): + self.stopWorker ("worker1") + h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') + assertIsNone(h1) + + def test_delete_job6(self): + self.deleteJob (job6) + assertIsNone(self.getJob (job6)) + + def test_worker1_pick_job(self): + self.startWorker ("worker1") + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assertEqual(pick1[0], -1) + + def test_create_job7(self): + job7 = self.newJob (0, "Parent-1", "", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assertEqual(pick1[0], -1) + + def test_create_job8(self): + job8 = self.newJob (job7, "Child-1", "ls /", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assertEqual (pick1[0], job8) + + def test_worker1_heartbeats(self): + h1 = self.heartbeat ("worker1", pick1[0], 1, 1, 1, '127.0.0.1') + assertIsNotNone(h1) + + def test_worker1_finish_job_and_change_job7_priority(self): + self.endJob ("worker1", pick1[0], 0, "127.0.0.1") + job8prio = self.getJob (job8) ['h_priority'] + self.editJobs ({ job7: { "priority": 12 } }) + assertLess(job8prio, self.getJob(job8)['h_priority']) + + def test_worker1_pick_job12(self): + job9 = self.newJob (0, "Parent-2", "", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] + job10 = self.newJob (0, "Parent-3", "", ".", "", "WAITING", False, 100, 11, "" , "", "", "") ['id'] + job11 = self.newJob (job9, "Child-2", "ls /", ".", "", "WAITING", False, 100, 10, "" , "", "", "") ['id'] + job12 = self.newJob (job10, 
"Child-3", "ls /", ".", "", "WAITING", False, 100, 8, "" , "", "", "") ['id'] + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assert (pick1[0] == job12) + + def test_worker1_pick_job11(self): + self.endJob ("worker1", pick1[0], 0, "127.0.0.1") + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assert (pick1[0] == job11) + + def test_worker1_finish_job(self): + self.endJob ("worker1", pick1[0], 0, "127.0.0.1") + + + def test_intricate_dependencies(self): + # performs intricate dependencies testing + # like a group dependent on a paused job + # and job dependent on this group + job20 = self.newJob (0, "job20", "sleep 2", ".", "", "PAUSED", False, 0, 100, "" , "", "", "") ['id'] + job21 = self.newJob (0, "job21", "", ".", "", "WAITING", False, 0, 100, "" , "", "", "") ['id'] + self.setJobDependencies (job21, [ job20 ]) + job22 = self.newJob (job21, "job22", "sleep 2", ".", "", "WAITING", False, 0, 100, "" , "", "", "") ['id'] + job23 = self.newJob (0, "job23", "sleep 2", ".", "", "WAITING", False, 0, 150, "" , "", "", "") ['id'] + self.setJobDependencies (job23, [ job21 ]) + job24 = self.newJob (0, "job24", "sleep 2", ".", "", "WAITING", False, 0, 200, "" , "", "", "") ['id'] + self.setJobDependencies (job24, [ job20 ]) + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + # can't start any job, all are dependent on job20 which is paused + assertEqual(pick1[0], -1) + assertEqual(self.getJob(job20)['state'], 'PAUSED') + assertEqual(self.getJob(job21)['state'], 'PENDING') + assertEqual(self.getJob(job22)['state'], 'WAITING') + assertIsNotNone(self.getJob(job22)['h_paused']) + assertEqual(self.getJob(job23)['state'], 'PENDING') + assertEqual(self.getJob(job24)['state'], 'PENDING') + + def test_can_only_pick_jobs20(self): + self.startJob (job20) + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assertEqual(pick1[0], job20) + + def test_worker1_finish_job(self): + self.endJob("worker1", pick1[0], 0, "127.0.0.1") + 
assertEqual(self.getJob(job20)['state'], 'FINISHED') + assertEqual(self.getJob(job21)['state'], 'WAITING') + assertEqual(self.getJob(job22)['state'], 'WAITING') + assertNotNone(not self.getJob(job22) ['h_paused']) + assertEqual(self.getJob(job23)['state'], 'PENDING') + assertEqual(self.getJob(job24)['state'], 'WAITING') + + def test_worker1_pick_jobs24(self): + # pick job24 as it is the top priority job + pick1 = self.pickJob("worker1", 1, 1, 1, "127.0.0.1") + assertEqual(pick1[0], job24) + + def test_worker1_finish_job(self): + self.endJob ("worker1", pick1[0], 0, "127.0.0.1") + assertEqual(self.getJob(job20)['state'], 'FINISHED') + assertEqual(self.getJob(job21)['state'], 'WAITING') + assertEqual(self.getJob(job22)['state'], 'WAITING') + assertIsNone(self.getJob(job22)['h_paused']) + assertEqual(self.getJob(job23)['state'], 'PENDING') + assertEqual(self.getJob(job24)['state'], 'FINISHED') + + def test_pick_job22(self): + # pick job22 as job23 is still pending and job21 can't be picked as a group + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assertEqual(pick1[0], job22) + + def test_worker1_finish_job(self): + self.endJob ("worker1", pick1[0], 0, "127.0.0.1") + + assertEqual (self.getJob (job20) ['state'], 'FINISHED') + assertEqual (self.getJob (job21) ['state'], 'FINISHED') + assertEqual (self.getJob (job22) ['state'], 'FINISHED') + assertEqual (self.getJob (job23) ['state'], 'WAITING') + assertEqual (self.getJob (job24) ['state'], 'FINISHED') + + def test_worker1_pick_job23(self): + # and eventually pick job23 + pick1 = self.pickJob ("worker1", 1, 1, 1, "127.0.0.1") + assert (pick1[0] == job23) + def test_worker1_finish_jobs(self): + self.endJob ("worker1", pick1[0], 0, "127.0.0.1") + + assertEqual (self.getJob (job20) ['state'], 'FINISHED') + assertEqual (self.getJob (job21) ['state'], 'FINISHED') + assertEqual (self.getJob (job22) ['state'], 'FINISHED') + assertEqual (self.getJob (job23) ['state'], 'FINISHED') + assertEqual (self.getJob (job24) 
['state'], 'FINISHED') if __name__ == "__main__": - result = test_server_python_api() - if result.wasSuccessful(): - exit(0) - else: - exit(1) + result = test_server_python_api() + if result.wasSuccessful(): + exit(0) + else: + exit(1) # vim: tabstop=4 noexpandtab shiftwidth=4 softtabstop=4 textwidth=79 diff --git a/worker.py b/worker.py index d097d4a..964dd8e 100644 --- a/worker.py +++ b/worker.py @@ -13,11 +13,11 @@ import host_cpu, host_mem if sys.platform=="win32": - import _winreg - import win32serviceutil - import win32service - import win32event - import win32api + import _winreg + import win32serviceutil + import win32service + import win32event + import win32api # Options global serverUrl, debug, verbose, sleepTime, broadcastPort, gogogo, workers @@ -41,15 +41,15 @@ # Go to the script directory global coalitionDir if sys.platform=="win32": - import _winreg - # under windows, uses the registry setup by the installer - try: - hKey = _winreg.OpenKey (_winreg.HKEY_LOCAL_MACHINE, "SOFTWARE\\Mercenaries Engineering\\Coalition", 0, _winreg.KEY_READ) - coalitionDir, type = _winreg.QueryValueEx (hKey, "Installdir") - except OSError: - coalitionDir = "." + import _winreg + # under windows, uses the registry setup by the installer + try: + hKey = _winreg.OpenKey (_winreg.HKEY_LOCAL_MACHINE, "SOFTWARE\\Mercenaries Engineering\\Coalition", 0, _winreg.KEY_READ) + coalitionDir, type = _winreg.QueryValueEx (hKey, "Installdir") + except OSError: + coalitionDir = "." else: - coalitionDir = "." + coalitionDir = "." 
os.chdir (coalitionDir) # Read the config file @@ -57,31 +57,31 @@ config.read ("coalition.ini") def cfgInt (name, defvalue): - global config - if config.has_option('worker', name): - try: - return int (config.get('worker', name)) - except: - pass - return defvalue + global config + if config.has_option('worker', name): + try: + return int (config.get('worker', name)) + except: + pass + return defvalue def cfgBool (name, defvalue): - global config - if config.has_option('worker', name): - try: - return int (config.get('worker', name)) != 0 - except: - pass - return defvalue + global config + if config.has_option('worker', name): + try: + return int (config.get('worker', name)) != 0 + except: + pass + return defvalue def cfgStr (name, defvalue): - global config - if config.has_option('worker', name): - try: - return config.get('worker', name) - except: - pass - return defvalue + global config + if config.has_option('worker', name): + try: + return config.get('worker', name) + except: + pass + return defvalue serverUrl = cfgStr ('serverUrl', '') workers = cfgInt ('workers', 1) @@ -94,464 +94,464 @@ def cfgStr (name, defvalue): logfile = cfgStr ('logfile', './worker.log') def usage(): - print ("Usage: worker [OPTIONS] [SERVER_URL]") - print ("Start a Coalition worker using the server located at SERVER_URL.") - print ("If no SERVER_URL is specified, the worker will try to locate the server using a broadcast.\n") - print ("Options:") - print (" -h, --help\t\tShow this help") - print (" -v, --verbose\t\tIncrease verbosity") - print (" -d, --debug\t\tRun without the main try/catch") - print (" -u, --startup=COMMAND\t\tStartup command executed at worker startup") - #print (" -a, --affinity=AFFINITY\tAffinity words to jobs (default: \"\"") - print (" -n, --name=NAME\tWorker name (default: "+name+")") - print (" -s, --sleep=SLEEPTIME\tSleep time between two heart beats (default: "+str (sleepTime)+"s)") - print (" -w, --workers=WORKERS\t\tNumber of workers to run (default: 
1)") - print (" -c, --cpus=CPUS\t\tIndicated number of cpus per worker, determines the number of worker to execute (default: 0, all available cpus)") - print (" -i, --install\t\tInstall service (Windows only)") - print ("\nExample : worker -s 30 -v http://localhost:19211") + print ("Usage: worker [OPTIONS] [SERVER_URL]") + print ("Start a Coalition worker using the server located at SERVER_URL.") + print ("If no SERVER_URL is specified, the worker will try to locate the server using a broadcast.\n") + print ("Options:") + print (" -h, --help\t\tShow this help") + print (" -v, --verbose\t\tIncrease verbosity") + print (" -d, --debug\t\tRun without the main try/catch") + print (" -u, --startup=COMMAND\t\tStartup command executed at worker startup") + #print (" -a, --affinity=AFFINITY\tAffinity words to jobs (default: \"\"") + print (" -n, --name=NAME\tWorker name (default: "+name+")") + print (" -s, --sleep=SLEEPTIME\tSleep time between two heart beats (default: "+str (sleepTime)+"s)") + print (" -w, --workers=WORKERS\t\tNumber of workers to run (default: 1)") + print (" -c, --cpus=CPUS\t\tIndicated number of cpus per worker, determines the number of worker to execute (default: 0, all available cpus)") + print (" -i, --install\t\tInstall service (Windows only)") + print ("\nExample : worker -s 30 -v http://localhost:19211") if not service: - # Parse the options - try: - opts, args = getopt.getopt(sys.argv[1:], "a:c:dhin:s:u:vw:", ["affinity=", "cpus=", "debug", "help", "install", "name=", "sleep=", "startup=", "verbose", "workers="]) - if len(args) > 0: - serverUrl = args[0] - except getopt.GetoptError, err: - # print help information and exit: - print str(err) # will print something like "option -a not recognized" - usage() - sys.exit(2) - for o, a in opts: - if o in ("-a", "--affinity"): - affinity = a - elif o in ("-c", "--cpus"): - cpus = int (a) - elif o in ("-d", "--debug"): - debug = True - elif o in ("-h", "--help"): - usage() - sys.exit(2) - elif o in ("-i", 
"--install"): - install = True - elif o in ("-n", "--name"): - name = a - elif o in ("-s", "--sleep"): - sleepTime = float (a) - elif o in ("-u", "--startup"): - startup = a - elif o in ("-v", "--verbose"): - verbose = True - elif o in ("-w", "--workers"): - workers = int (a) - else: - assert False, "unhandled option " + o + # Parse the options + try: + opts, args = getopt.getopt(sys.argv[1:], "a:c:dhin:s:u:vw:", ["affinity=", "cpus=", "debug", "help", "install", "name=", "sleep=", "startup=", "verbose", "workers="]) + if len(args) > 0: + serverUrl = args[0] + except getopt.GetoptError, err: + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) + for o, a in opts: + if o in ("-a", "--affinity"): + affinity = a + elif o in ("-c", "--cpus"): + cpus = int (a) + elif o in ("-d", "--debug"): + debug = True + elif o in ("-h", "--help"): + usage() + sys.exit(2) + elif o in ("-i", "--install"): + install = True + elif o in ("-n", "--name"): + name = a + elif o in ("-s", "--sleep"): + sleepTime = float (a) + elif o in ("-u", "--startup"): + startup = a + elif o in ("-v", "--verbose"): + verbose = True + elif o in ("-w", "--workers"): + workers = int (a) + else: + assert False, "unhandled option " + o if not verbose or service: - outfile = open(logfile, 'a') - sys.stdout = outfile - sys.stderr = outfile + outfile = open(logfile, 'a') + sys.stdout = outfile + sys.stderr = outfile # Log for debugging def vprint (str): - if verbose: - print (str) - sys.stdout.flush() + if verbose: + print (str) + sys.stdout.flush() # Log for debugging def debugRaw (str): - if verbose: - print (str) - sys.stdout.flush() + if verbose: + print (str) + sys.stdout.flush() vprint ("--- Start ------------------------------------------------------------") # If 'cpus' option set, compute the number of workers out of the total number of cpus if cpus != None: - if platform.platform == "win32": - try: - totalcpus = int 
(os.getenv ("NUMBER_OF_PROCESSORS")) - cpus = min (totalcpus, cpus) - workers = max (1, totalcpus / cpus) - except: - pass - else: - pass + if platform.platform == "win32": + try: + totalcpus = int (os.getenv ("NUMBER_OF_PROCESSORS")) + cpus = min (totalcpus, cpus) + workers = max (1, totalcpus / cpus) + except: + pass + else: + pass vprint ("Running with " + str (workers) + " workers.") random.seed () def shuffleSleepTime (sleepTime): - return sleepTime * (1.0 + (random.random ()-0.5)*0.2) + return sleepTime * (1.0 + (random.random ()-0.5)*0.2) # Safe method to run a command on the server, if retry is true, the function won't return until the message is passed def workerRun (worker, func, retry): - global sleepTime, gogogo - while (gogogo): - serverConn = None - try: - serverConn = httplib.HTTPConnection (re.sub ('^http://', '', serverUrl)) - result = func (serverConn) - serverConn.close () - return result - except (socket.error,httplib.HTTPException),err: - print ("Error sending to the server : ", str (err)) - pass - if serverConn != None: - serverConn.close () - if not retry: - vprint ("Server down, continue...") - break - vprint ("No server") - if gogogo: - time.sleep (shuffleSleepTime (sleepTime)) + global sleepTime, gogogo + while (gogogo): + serverConn = None + try: + serverConn = httplib.HTTPConnection (re.sub ('^http://', '', serverUrl)) + result = func (serverConn) + serverConn.close () + return result + except (socket.error,httplib.HTTPException),err: + print ("Error sending to the server : ", str (err)) + pass + if serverConn != None: + serverConn.close () + if not retry: + vprint ("Server down, continue...") + break + vprint ("No server") + if gogogo: + time.sleep (shuffleSleepTime (sleepTime)) # A Singler worker class Worker: - def __init__ (self, name): - self.Name = name # The worker name - self.Working = False # The worker current state - self.PId = 0 # The worker current process pid - self.User = "" - self.ErrorCode = 0 # The process exit error 
code - self.LogLock = thread.allocate_lock() # Logs lock - self.Log = "" # Logs - self.HostCPU = host_cpu.HostCPU () - self.total_memory = host_mem.getTotalMem () - - - # LoadAvg - def workerGetLoadAvg (self): - usage = self.HostCPU.getUsage () - return usage - - def workerEvalEnv (self, _str, _env): - if platform.system () != 'Windows': - def _mapDrive (match): - return '$(' + match.group(1).upper () + '_DRIVE)' - _str = re.sub ('^([a-zA-Z]):', _mapDrive, _str) - def _getenv (match): - m = match.group(1) - # if _env exists, first try in _env - if _env: - try: - return _env[m] - except: - pass - result = os.getenv (m) - if result == None: - self.info ("ERROR : Environment variable not found : " + match.group(1)) - result = "" - return result - while re.search ('\$\(([^)]*)\)', _str): - _str = re.sub ('\$\(([^)]*)\)', _getenv, _str) - return _str - - # Add to the logs - def info (self, str): - self.LogLock.acquire() - try: - self.Log = self.Log + "* " + str + "\n"; - vprint (str) - finally: - self.LogLock.release() - - # Thread function to execute the job process - def _execProcess (self, cmd, dir, user, environment): - self.info ("START **********************************") - self.info ("WORKER : " + self.Name) - self.info ("DATE : " + datetime.datetime.today ().strftime("%d/%m/%y %H:%M")) - - # Special command ? 
- if runcommand != '': - cmd = string.replace(string.replace(string.replace(runcommand, '__user__', user), '__dir__', dir), '__cmd__', cmd) - else: - if dir != "" : - try: - # Linux, change the \\ for / - if sys.platform != "win32" : - dir = re.sub ("\\\\", "/", dir) - os.chdir (dir) - except OSError, err: - self.info ("ERROR : Can't change dir to " + dir + ": " + str (err)) - - # Run the job - self.info ("CMD : " + cmd) - - # Make sure - os.umask(002) - process = subprocess.Popen (cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=environment) - - # Get the pid - self.PId = int (process.pid) - self.User = user - while (1): - # Read some lines of logs - line = process.stdout.readline() - - # "" means EOF - if line == "": - break - - debugRaw (line) - self.LogLock.acquire() - try: - self.Log = self.Log + line - finally: - self.LogLock.release() - - # Get the error code of the job - self.ErrorCode = process.wait () - self.info ("EXIT : " + str(self.ErrorCode)) - self.info ("END ********\n") - - - def execProcess (self, cmd, dir, user, env): - global debug, sleepTime - if debug: - self._execProcess (cmd, dir, user, env) - else: - try: - self._execProcess (cmd, dir, user, env) - except: - self.ErrorCode = -1 - print ("Fatal error executing the job...") - time.sleep (shuffleSleepTime (sleepTime)) - # Signal to the main process the job is finished - self.Working = False - Event.set () - - ### To kill the current worker job - def killJob (self): - if self.PId != 0: - vprint ("kill " + str (self.PId)) - try: - self.killr (self.PId) - self.PId = 0 - except OSError as exc: - vprint ("kill failed") - vprint (exc) - pass - - ### To kill all child process - def killr (self, pid): - if sys.platform != "win32": - names = os.listdir ("/proc/") - for name in names: - try: - f = open ("/proc/" + name +"/stat","r") - line = f.readline() - words = string.split(line) - if words[3] == str (pid): - vprint ("Found in " + name) - self.killr (int 
(name)) - except IOError as exc: - #vprint (exc) - pass - try: - if sys.platform == "win32": - subprocess.Popen ("taskkill /F /T /PID %i"%pid, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - elif runcommand != '': - killcmd = "kill -9 "+ str (pid) - cmd = string.replace(string.replace(string.replace(runcommand, '__user__', self.User), '__dir__', '.'), '__cmd__', killcmd) - vprint ("Kill process with runcommand : "+cmd) - subprocess.Popen (cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - else: - vprint ("Kill process with os.kill") - os.kill (pid, signal.SIGKILL) - except OSError as exc: - vprint ("Can't kill the process %i"%pid) - vprint (exc) - except: - vprint ("Can't kill the process %i"%pid) - vprint (sys.exc_info ()[0]) - - # Flush the logs to the server - def heartbeat (self, jobId, retry): - vprint ("Flush logs (" + str (len (self.Log)) + " bytes)") - def func (serverConn): - result = True - - self.LogLock.acquire() - try: - params = urllib.urlencode ({ - 'hostname':self.Name, - 'jobId':jobId, - 'log':base64.b64encode (self.Log), - 'load':self.workerGetLoadAvg (), - 'free_memory':int(host_mem.getAvailableMem()/1024/1024), - 'total_memory':int(self.total_memory/1024/1024) - }) - serverConn.request ("POST", "/workers/heartbeat", params, Headers) - response = serverConn.getresponse() - result = response.read() - self.Log = "" - finally: - self.LogLock.release() - - if result == "false": - vprint ("Server ask to stop the job " + str (jobId)) - # Send the kill signal to the process - self.killJob () - workerRun (self, func, retry) - - # Worker main loop - def mainLoop (self): - global sleepTime - vprint ("Ask for a job") - # Function to ask a job to the server - def startFunc (serverConn): - params = urllib.urlencode ({ - 'hostname':self.Name, - 'load':self.workerGetLoadAvg (), - 'free_memory':int(host_mem.getAvailableMem()/1024/1024), - 'total_memory':int(self.total_memory/1024/1024) - 
}) - serverConn.request ("POST", "/workers/pickjob", params, Headers) - response = serverConn.getresponse() - result = response.read() - return eval (result) - - # Block until this message to handled by the server - jobId, cmd, dir, user, env = workerRun (self, startFunc, True) - - if jobId != -1: - self.Log = "" - - _env = None - if env: - # Duplicate environment to add overrides - _env = {} - try: - for key, value in os.environ.items (): - _env[key] = value; - except: - pass - try: - for k in env.split ("\\n"): - try: - key, value = k.split ("=", 1) - _env[key] = value - except: - pass - except: - _env = None - - _cmd = self.workerEvalEnv (cmd, _env) - _dir = self.workerEvalEnv (dir, _env) - - vprint ("Start job " + str (jobId) + " in " + _dir + " : " + _cmd) - vprint ("Job environment is " + str (_env)) - - # Reset the globals - self.Working = True - stop = False - self.PId = 0 - - # Launch a new thread to run the process - - # Set the working directory in the main thead - thread.start_new_thread (self.execProcess, (_cmd, _dir, user, _env)) - - # Flush the logs - while (self.Working): - self.heartbeat (jobId, False) - Event.clear () - Event.wait (shuffleSleepTime (sleepTime)) - - # Flush for real for the last time - self.heartbeat (jobId, True) - - vprint ("Finished job " + str (jobId) + " (code " + str (self.ErrorCode) + ") : " + _cmd) - - # Function to end the job - def endFunc (serverConn): - params = urllib.urlencode ({ - 'hostname':self.Name, - 'jobId':jobId, - 'errorCode':self.ErrorCode, - }) - serverConn.request ("POST", "/workers/endjob", params, Headers) - response = serverConn.getresponse() - response.read () - - # Block until this message to handled by the server - workerRun (self, endFunc, True) - else: - time.sleep (shuffleSleepTime (sleepTime)) + def __init__ (self, name): + self.Name = name # The worker name + self.Working = False # The worker current state + self.PId = 0 # The worker current process pid + self.User = "" + self.ErrorCode = 0 # The 
process exit error code + self.LogLock = thread.allocate_lock() # Logs lock + self.Log = "" # Logs + self.HostCPU = host_cpu.HostCPU () + self.total_memory = host_mem.getTotalMem () + + + # LoadAvg + def workerGetLoadAvg (self): + usage = self.HostCPU.getUsage () + return usage + + def workerEvalEnv (self, _str, _env): + if platform.system () != 'Windows': + def _mapDrive (match): + return '$(' + match.group(1).upper () + '_DRIVE)' + _str = re.sub ('^([a-zA-Z]):', _mapDrive, _str) + def _getenv (match): + m = match.group(1) + # if _env exists, first try in _env + if _env: + try: + return _env[m] + except: + pass + result = os.getenv (m) + if result == None: + self.info ("ERROR : Environment variable not found : " + match.group(1)) + result = "" + return result + while re.search ('\$\(([^)]*)\)', _str): + _str = re.sub ('\$\(([^)]*)\)', _getenv, _str) + return _str + + # Add to the logs + def info (self, str): + self.LogLock.acquire() + try: + self.Log = self.Log + "* " + str + "\n"; + vprint (str) + finally: + self.LogLock.release() + + # Thread function to execute the job process + def _execProcess (self, cmd, dir, user, environment): + self.info ("START **********************************") + self.info ("WORKER : " + self.Name) + self.info ("DATE : " + datetime.datetime.today ().strftime("%d/%m/%y %H:%M")) + + # Special command ? 
+ if runcommand != '': + cmd = string.replace(string.replace(string.replace(runcommand, '__user__', user), '__dir__', dir), '__cmd__', cmd) + else: + if dir != "" : + try: + # Linux, change the \\ for / + if sys.platform != "win32" : + dir = re.sub ("\\\\", "/", dir) + os.chdir (dir) + except OSError, err: + self.info ("ERROR : Can't change dir to " + dir + ": " + str (err)) + + # Run the job + self.info ("CMD : " + cmd) + + # Make sure + os.umask(002) + process = subprocess.Popen (cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=environment) + + # Get the pid + self.PId = int (process.pid) + self.User = user + while (1): + # Read some lines of logs + line = process.stdout.readline() + + # "" means EOF + if line == "": + break + + debugRaw (line) + self.LogLock.acquire() + try: + self.Log = self.Log + line + finally: + self.LogLock.release() + + # Get the error code of the job + self.ErrorCode = process.wait () + self.info ("EXIT : " + str(self.ErrorCode)) + self.info ("END ********\n") + + + def execProcess (self, cmd, dir, user, env): + global debug, sleepTime + if debug: + self._execProcess (cmd, dir, user, env) + else: + try: + self._execProcess (cmd, dir, user, env) + except: + self.ErrorCode = -1 + print ("Fatal error executing the job...") + time.sleep (shuffleSleepTime (sleepTime)) + # Signal to the main process the job is finished + self.Working = False + Event.set () + + ### To kill the current worker job + def killJob (self): + if self.PId != 0: + vprint ("kill " + str (self.PId)) + try: + self.killr (self.PId) + self.PId = 0 + except OSError as exc: + vprint ("kill failed") + vprint (exc) + pass + + ### To kill all child process + def killr (self, pid): + if sys.platform != "win32": + names = os.listdir ("/proc/") + for name in names: + try: + f = open ("/proc/" + name +"/stat","r") + line = f.readline() + words = string.split(line) + if words[3] == str (pid): + vprint ("Found in " + name) + self.killr (int 
(name)) + except IOError as exc: + #vprint (exc) + pass + try: + if sys.platform == "win32": + subprocess.Popen ("taskkill /F /T /PID %i"%pid, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + elif runcommand != '': + killcmd = "kill -9 "+ str (pid) + cmd = string.replace(string.replace(string.replace(runcommand, '__user__', self.User), '__dir__', '.'), '__cmd__', killcmd) + vprint ("Kill process with runcommand : "+cmd) + subprocess.Popen (cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + else: + vprint ("Kill process with os.kill") + os.kill (pid, signal.SIGKILL) + except OSError as exc: + vprint ("Can't kill the process %i"%pid) + vprint (exc) + except: + vprint ("Can't kill the process %i"%pid) + vprint (sys.exc_info ()[0]) + + # Flush the logs to the server + def heartbeat (self, jobId, retry): + vprint ("Flush logs (" + str (len (self.Log)) + " bytes)") + def func (serverConn): + result = True + + self.LogLock.acquire() + try: + params = urllib.urlencode ({ + 'hostname':self.Name, + 'jobId':jobId, + 'log':base64.b64encode (self.Log), + 'load':self.workerGetLoadAvg (), + 'free_memory':int(host_mem.getAvailableMem()/1024/1024), + 'total_memory':int(self.total_memory/1024/1024) + }) + serverConn.request ("POST", "/workers/heartbeat", params, Headers) + response = serverConn.getresponse() + result = response.read() + self.Log = "" + finally: + self.LogLock.release() + + if result == "false": + vprint ("Server ask to stop the job " + str (jobId)) + # Send the kill signal to the process + self.killJob () + workerRun (self, func, retry) + + # Worker main loop + def mainLoop (self): + global sleepTime + vprint ("Ask for a job") + # Function to ask a job to the server + def startFunc (serverConn): + params = urllib.urlencode ({ + 'hostname':self.Name, + 'load':self.workerGetLoadAvg (), + 'free_memory':int(host_mem.getAvailableMem()/1024/1024), + 'total_memory':int(self.total_memory/1024/1024) + 
}) + serverConn.request ("POST", "/workers/pickjob", params, Headers) + response = serverConn.getresponse() + result = response.read() + return eval (result) + + # Block until this message to handled by the server + jobId, cmd, dir, user, env = workerRun (self, startFunc, True) + + if jobId != -1: + self.Log = "" + + _env = None + if env: + # Duplicate environment to add overrides + _env = {} + try: + for key, value in os.environ.items (): + _env[key] = value; + except: + pass + try: + for k in env.split ("\\n"): + try: + key, value = k.split ("=", 1) + _env[key] = value + except: + pass + except: + _env = None + + _cmd = self.workerEvalEnv (cmd, _env) + _dir = self.workerEvalEnv (dir, _env) + + vprint ("Start job " + str (jobId) + " in " + _dir + " : " + _cmd) + vprint ("Job environment is " + str (_env)) + + # Reset the globals + self.Working = True + stop = False + self.PId = 0 + + # Launch a new thread to run the process + + # Set the working directory in the main thead + thread.start_new_thread (self.execProcess, (_cmd, _dir, user, _env)) + + # Flush the logs + while (self.Working): + self.heartbeat (jobId, False) + Event.clear () + Event.wait (shuffleSleepTime (sleepTime)) + + # Flush for real for the last time + self.heartbeat (jobId, True) + + vprint ("Finished job " + str (jobId) + " (code " + str (self.ErrorCode) + ") : " + _cmd) + + # Function to end the job + def endFunc (serverConn): + params = urllib.urlencode ({ + 'hostname':self.Name, + 'jobId':jobId, + 'errorCode':self.ErrorCode, + }) + serverConn.request ("POST", "/workers/endjob", params, Headers) + response = serverConn.getresponse() + response.read () + + # Block until this message to handled by the server + workerRun (self, endFunc, True) + else: + time.sleep (shuffleSleepTime (sleepTime)) def main (): - global name, serverUrl, sleepTime, broadcastPort, gogogo, workers, startup - - print ("Startup command is '" + str (startup) + "'") - if startup != "": - cmd = startup - if 
def main():
    """Worker entry point.

    Runs the optional startup command, locates the server by UDP broadcast
    when no URL is configured, spawns one thread per worker, then blocks the
    main thread until the global ``gogogo`` flag is cleared.
    """
    global name, serverUrl, sleepTime, broadcastPort, gogogo, workers, startup

    print("Startup command is '" + str(startup) + "'")
    if startup != "":
        cmd = startup
        if sys.platform == "win32":
            cmd = '"' + cmd + '"'
        process = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        errorCode = process.wait()
        print("Startup command exited with code " + str(errorCode))

    # If no server, look for it with a broadcast
    if serverUrl == "":
        from socket import SOL_SOCKET, SO_BROADCAST
        from socket import socket, AF_INET, SOCK_DGRAM, timeout

        s = socket(AF_INET, SOCK_DGRAM)
        s.setsockopt(SOL_SOCKET, SO_BROADCAST, True)
        s.bind(('0.0.0.0', 0))
        s.settimeout(1)
        while gogogo:
            try:
                vprint("Broadcast port " + str(broadcastPort))
                s.sendto("coalition", ('255.255.255.255', broadcastPort))
                data, addr = s.recvfrom(1024)
                # the server answers "roxor" to the "coalition" probe
                if data == "roxor":
                    serverUrl = "http://" + addr[0] + ":" + str(broadcastPort)
                    print("Server found at " + serverUrl)
                    vprint("Found : " + serverUrl)
                    break
            except timeout:
                pass
        s.close()

    # Strip trailing slashes from the server URL
    while serverUrl[-1] == '/':
        serverUrl = serverUrl[:-1]

    print("Working...")

    def threadfunc(worker):
        # Per-worker loop: keep asking for jobs until asked to quit.
        global debug, sleepTime, gogogo
        while gogogo:
            if debug:
                worker.mainLoop()
            else:
                try:
                    worker.mainLoop()
                except:
                    # NOTE(review): broad catch keeps the worker alive on any
                    # error; the failure itself is only visible in the logs.
                    print("Fatal error, retry...")
                    if gogogo:
                        time.sleep(shuffleSleepTime(sleepTime))
        vprint("WORKER " + worker.Name + " is kindly asked to quit.")
        # kill any job in process
        worker.killJob()

    # start each thread
    if workers == 1:
        # No suffix if one worker
        worker = Worker(name)
        thread.start_new_thread(threadfunc, (worker,))
    else:
        for k in range(workers):
            worker = Worker(name + "-" + str(k + 1))
            thread.start_new_thread(threadfunc, (worker,))

    # and let the main thread wait
    while gogogo:
        time.sleep(shuffleSleepTime(sleepTime))


if not service:
    main()
class WindowsService(win32serviceutil.ServiceFramework):
    """Run the coalition worker as a Windows service."""

    _svc_name_ = "CoalitionWorker"
    _svc_display_name_ = "Coalition Worker"

    def __init__(self, args):
        win32serviceutil.ServiceFramework.__init__(self, args)
        # event signalled by the service manager when a stop is requested
        self.hWaitStop = win32event.CreateEvent(None, 0, 0, None)

    def SvcStop(self):
        """Ask the worker loops to terminate and signal the stop event."""
        global gogogo
        gogogo = False
        self.ReportServiceStatus(win32service.SERVICE_STOP_PENDING)
        win32event.SetEvent(self.hWaitStop)

    def SvcDoRun(self):
        """Service body: check for a pending stop, then run the worker."""
        self.CheckForQuit()
        main()

    def CheckForQuit(self):
        """Clear gogogo if the stop event is already set (10 ms poll)."""
        global gogogo
        retval = win32event.WaitForSingleObject(self.hWaitStop, 10)
        if retval != win32event.WAIT_TIMEOUT:
            # Received Quit from Win32
            gogogo = False


win32serviceutil.HandleCommandLine(WindowsService)