
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  from __future__ import absolute_import 
  15  from __future__ import print_function 
  16  import subprocess 
  17  import logging 
  18  import os 
  19  import time 
  20  import re 
  21  import glob 
  22  import inspect 
  23  import sys 
  24  import six 
  25  from six.moves import range 
  26  from six.moves import input 
  27   
  28  logger = logging.getLogger('madgraph.cluster')  
  29   
  30  try: 
  31      from madgraph import MadGraph5Error 
  32      import madgraph.various.misc as misc 
  33  except Exception as error: 
  34      if __debug__: 
  35          print(str(error)) 
  36      from internal import MadGraph5Error 
  37      import internal.misc as misc 
  38   
  39  pjoin = os.path.join 
  40   
  41 -class ClusterManagmentError(MadGraph5Error): 
  42      pass 
  43   
  44 -class NotImplemented(MadGraph5Error): 
  45      pass 
  46   
  47   
  48  multiple_try = misc.multiple_try 
  49  pjoin = os.path.join 
  50   
  51   
  52 -def check_interupt(error=KeyboardInterrupt): 
  53   
  54      def deco_interupt(f): 
  55          def deco_f_interupt(self, *args, **opt): 
  56              try: 
  57                  return f(self, *args, **opt) 
  58              except error: 
  59                  try: 
  60                      self.remove(*args, **opt) 
  61                  except Exception: 
  62                      pass 
  63                  raise error 
  64          return deco_f_interupt 
  65      return deco_interupt 
  66   
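# Illustrative sketch (not part of the module, hypothetical class): any method
# decorated with check_interupt() calls self.remove() to cancel the submitted
# jobs before re-raising a KeyboardInterrupt, so a Ctrl-C during a long wait
# does not leave jobs running on the scheduler.
class _ToyInterruptible(object):
    def remove(self, *args, **opts):
        print('cancelling submitted jobs')

    @check_interupt()
    def wait_for_jobs(self):
        while True:
            time.sleep(1)   # a Ctrl-C here triggers remove(), then re-raises
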
  67 -def store_input(arg=''): 
  68   
  69      def deco_store(f): 
  70          def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
  71                           input_files=[], output_files=[], required_output=[], nb_submit=0): 
  72              frame = inspect.currentframe() 
  73              args, _, _, values = inspect.getargvalues(frame) 
  74              args = dict([(i, values[i]) for i in args if i != 'self']) 
  75              id = f(self, **args) 
  76              if self.nb_retry > 0: 
  77                  self.retry_args[id] = args 
  78              return id 
  79          return deco_f_store 
  80      return deco_store 
  81   
  82 -def need_transfer(options): 
  83      """ This function checks whether the transfer/compression of input files 
  84      is necessary for the given running options. """ 
  85   
  86      if options['run_mode'] != 1 and options['cluster_temp_path'] is None: 
  87          return False 
  88      else: 
  89          return True 
  90   
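# Illustrative sketch (not part of the module): need_transfer() only returns
# False for a local run on a shared disk; a cluster run (run_mode == 1 is
# assumed to be the cluster mode, as elsewhere in the madgraph options) or any
# run with a scratch directory requires shipping the input files.  The option
# dictionaries below are hypothetical.
def _example_need_transfer():
    # local multicore run, shared disk: nothing to transfer
    assert not need_transfer({'run_mode': 2, 'cluster_temp_path': None})
    # cluster run, or any run using a scratch directory: transfer needed
    assert need_transfer({'run_mode': 1, 'cluster_temp_path': None})
    assert need_transfer({'run_mode': 0, 'cluster_temp_path': '/scratch'})
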
91 -class Cluster(object):
92 """Basic Class for all cluster type submission""" 93 name = 'mother class' 94 identifier_length = 14 95
96 - def __init__(self,*args, **opts):
97 """Init the cluster""" 98 99 self.submitted = 0 100 self.submitted_ids = [] 101 self.finish = 0 102 self.submitted_dirs = [] #HTCaaS 103 self.submitted_exes = [] #HTCaaS 104 self.submitted_args = [] #HTCaaS 105 106 if 'cluster_queue' in opts: 107 self.cluster_queue = opts['cluster_queue'] 108 else: 109 self.cluster_queue = 'madgraph' 110 if 'cluster_temp_path' in opts: 111 self.temp_dir = opts['cluster_temp_path'] 112 else: 113 self.temp_dir = None 114 self.options = {'cluster_status_update': (600, 30)} 115 for key,value in opts.items(): 116 self.options[key] = value 117 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 118 self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300 119 self.options = dict(opts) 120 self.retry_args = {} 121 # controlling jobs in controlled type submision 122 self.packet = {} 123 self.id_to_packet = {}
124
125 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 126 log=None, required_output=[], nb_submit=0):
127 """How to make one submission. Return status id on the cluster.""" 128 raise NotImplemented('No implementation of how to submit a job to cluster \'%s\'' % self.name)
129 130 131 @store_input()
132 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 133 log=None, input_files=[], output_files=[], required_output=[], 134 nb_submit=0):
135 """How to make one submission. Return status id on the cluster. 136 NO SHARE DISK""" 137 138 if cwd is None: 139 cwd = os.getcwd() 140 if not os.path.exists(prog): 141 prog = os.path.join(cwd, prog) 142 143 if not required_output and output_files: 144 required_output = output_files 145 146 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 147 (input_files == [] == output_files): 148 149 return self.submit(prog, argument, cwd, stdout, stderr, log, 150 required_output=required_output, nb_submit=nb_submit) 151 152 if not input_files and not output_files: 153 # not input/output so not using submit2 154 return self.submit(prog, argument, cwd, stdout, stderr, log, 155 required_output=required_output, nb_submit=nb_submit) 156 157 if cwd is None: 158 cwd = os.getcwd() 159 if not os.path.exists(prog): 160 prog = os.path.join(cwd, prog) 161 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 162 163 text = """#!/bin/bash 164 MYTMP=%(tmpdir)s/run$%(job_id)s 165 MYPWD=%(cwd)s 166 mkdir -p $MYTMP 167 cd $MYPWD 168 input_files=( %(input_files)s ) 169 for i in ${input_files[@]} 170 do 171 cp -R -L $i $MYTMP 172 done 173 cd $MYTMP 174 echo '%(arguments)s' > arguments 175 chmod +x ./%(script)s 176 %(program)s ./%(script)s %(arguments)s 177 exit=$? 178 output_files=( %(output_files)s ) 179 for i in ${output_files[@]} 180 do 181 cp -r $MYTMP/$i $MYPWD 182 done 183 # if [ "$exit" -eq "0" ] 184 # then 185 rm -rf $MYTMP 186 # fi 187 """ 188 189 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 190 'cwd': cwd, 'job_id': self.job_id, 191 'input_files': ' '.join(input_files + [prog]), 192 'output_files': ' '.join(output_files), 193 'arguments': ' '.join([str(a) for a in argument]), 194 'program': ' ' if '.py' in prog else 'bash'} 195 196 # writing a new script for the submission 197 new_prog = pjoin(cwd, temp_file_name) 198 open(new_prog, 'w').write(text % dico) 199 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 200 201 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 202 required_output=required_output, nb_submit=nb_submit)
203 204
205 - def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 206 log=None, input_files=[], output_files=[], required_output=[], 207 nb_submit=0, packet_member=None):
208 """This function wrap the cluster submition with cluster independant 209 method should not be overwritten (but for DAG type submission)""" 210 211 id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files, 212 output_files, required_output, nb_submit) 213 214 215 if not packet_member: 216 return id 217 else: 218 if isinstance(packet_member, Packet): 219 self.id_to_packet[id] = packet_member 220 packet_member.put(id) 221 if packet_member.tag not in self.packet: 222 self.packet[packet_member.tag] = packet_member 223 else: 224 if packet_member in self.packet: 225 packet = self.packet[packet_member] 226 packet.put(id) 227 self.id_to_packet[id] = packet 228 return id
229
230 - def control(self, me_dir=None):
231 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 232 if not self.submitted_ids: 233 raise NotImplemented('No implementation of how to control the job status to cluster \'%s\'' % self.name) 234 idle, run, fail = 0, 0, 0 235 for pid in self.submitted_ids[:]: 236 status = self.control_one_job(id) 237 if status == 'I': 238 idle += 1 239 elif status == 'R': 240 run += 1 241 elif status == 'F': 242 self.finish +=1 243 self.submitted_ids.remove(pid) 244 else: 245 fail += 1 246 247 return idle, run, self.finish, fail
248
249 - def control_one_job(self, pid):
250 """ control the status of a single job with it's cluster id """ 251 raise NotImplemented('No implementation of how to control the job status to cluster \'%s\'' % self.name)
252
253 - def get_jobs_identifier(self, path, second_path=None):
254 """get a unique run_name for all the jobs helps to identify the runs 255 in the controller for some cluster.""" 256 257 if second_path: 258 path = os.path.realpath(pjoin(path, second_path)) 259 elif not os.path.exists(path): 260 return path # job already done 261 262 if 'SubProcesses' in path: 263 target = path.rsplit('/SubProcesses',1)[0] 264 elif 'MCatNLO' in path: 265 target = path.rsplit('/MCatNLO',1)[0] 266 elif 'PY8_parallelization' in path: 267 target = path.rsplit('/PY8_parallelization',1)[0] 268 elif second_path: 269 target=path 270 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.") 271 else: 272 target = path 273 274 if target.endswith('/'): 275 target = target[:-1] 276 277 target = misc.digest(target.encode())[-self.identifier_length:] 278 if not target[0].isalpha(): 279 target = 'a' + target[1:] 280 281 return target
282 283 284 @check_interupt()
285 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
286 """Wait that all job are finish. 287 if minimal_job set, then return if idle + run is lower than that number""" 288 289 290 mode = 1 # 0 is long waiting/ 1 is short waiting 291 nb_iter = 0 292 nb_short = 0 293 change_at = 5 # number of iteration from which we wait longer between update. 294 295 if update_first: 296 idle, run, finish, fail = self.control(me_dir) 297 update_first(idle, run, finish) 298 299 #usefull shortcut for readibility 300 longtime, shorttime = self.options['cluster_status_update'] 301 302 nb_job = 0 303 304 if self.options['cluster_type'] == 'htcaas2': 305 me_dir = self.metasubmit(self) 306 307 while 1: 308 old_mode = mode 309 nb_iter += 1 310 idle, run, finish, fail = self.control(me_dir) 311 if nb_job: 312 if idle + run + finish + fail != nb_job: 313 nb_job = idle + run + finish + fail 314 nb_iter = 1 # since some packet finish prevent to pass in long waiting mode 315 else: 316 nb_job = idle + run + finish + fail 317 if fail: 318 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 319 if idle + run == 0: 320 #time.sleep(20) #security to ensure that the file are really written on the disk 321 logger.info('All jobs finished') 322 fct(idle, run, finish) 323 break 324 if idle + run < minimal_job: 325 return 326 fct(idle, run, finish) 327 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 328 if nb_iter < change_at: 329 mode = 1 330 elif idle < run: 331 if old_mode == 0: 332 if nb_short: 333 mode = 0 #we already be back from short to long so stay in long 334 #check if we need to go back to short mode 335 elif idle: 336 if nb_iter > change_at + int(longtime)//shorttime: 337 mode = 0 #stay in long waiting mode 338 else: 339 mode = 1 # pass in short waiting mode 340 nb_short =0 341 else: 342 mode = 1 # pass in short waiting mode 343 nb_short = 0 344 elif old_mode == 1: 345 nb_short +=1 346 if nb_short > 3* max(change_at, int(longtime)//shorttime): 347 mode = 0 #go back in slow waiting 348 else: 349 mode = 0 350 351 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 352 if old_mode > mode: 353 logger.info('''Start to wait %ss between checking status. 354 Note that you can change this time in the configuration file. 355 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 356 357 #now Waiting! 358 if mode == 0: 359 try: 360 time.sleep(self.options['cluster_status_update'][0]) 361 except KeyboardInterrupt: 362 logger.info('start to update the status') 363 nb_iter = min(0, change_at -2) 364 nb_short = 0 365 else: 366 time.sleep(self.options['cluster_status_update'][1]) 367 368 369 self.submitted = 0 370 self.submitted_ids = []
371
372 - def check_termination(self, job_id):
373 """Check the termination of the jobs with job_id and relaunch it if needed.""" 374 375 376 if job_id not in self.retry_args: 377 if job_id in self.id_to_packet: 378 nb_in_packet = self.id_to_packet[job_id].remove_one() 379 if nb_in_packet == 0: 380 # packet done run the associate function 381 packet = self.id_to_packet[job_id] 382 # fully ensure that the packet is finished (thread safe) 383 packet.queue.join() 384 #running the function 385 packet.fct(*packet.args) 386 del self.id_to_packet[job_id] 387 return 'resubmit' 388 else: 389 return True 390 391 args = self.retry_args[job_id] 392 if 'time_check' in args: 393 time_check = args['time_check'] 394 else: 395 time_check = 0 396 397 for path in args['required_output']: 398 if args['cwd']: 399 path = pjoin(args['cwd'], path) 400 # check that file exists and is not empty. 401 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 402 break 403 else: 404 # all requested output are present 405 if time_check > 0: 406 logger.info('Job %s Finally found the missing output.' % (job_id)) 407 del self.retry_args[job_id] 408 self.submitted_ids.remove(job_id) 409 # check if the job_id is in a packet 410 if job_id in self.id_to_packet: 411 nb_in_packet = self.id_to_packet[job_id].remove_one() 412 if nb_in_packet == 0: 413 # packet done run the associate function 414 packet = self.id_to_packet[job_id] 415 # fully ensure that the packet is finished (thread safe) 416 packet.queue.join() 417 #running the function 418 packet.fct(*packet.args) 419 del self.id_to_packet[job_id] 420 return 'resubmit' 421 422 return 'done' 423 424 if time_check == 0: 425 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 426 args['time_check'] = time.time() 427 return 'wait' 428 elif self.cluster_retry_wait > time.time() - time_check: 429 return 'wait' 430 431 #jobs failed to be completed even after waiting time!! 432 if self.nb_retry < 0: 433 logger.critical('''Fail to run correctly job %s. 434 with option: %s 435 file missing: %s''' % (job_id, args, path)) 436 input('press enter to continue.') 437 elif self.nb_retry == 0: 438 logger.critical('''Fail to run correctly job %s. 439 with option: %s 440 file missing: %s. 441 Stopping all runs.''' % (job_id, args, path)) 442 self.remove() 443 elif args['nb_submit'] >= self.nb_retry: 444 logger.critical('''Fail to run correctly job %s. 445 with option: %s 446 file missing: %s 447 Fails %s times 448 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 449 self.remove() 450 else: 451 args['nb_submit'] += 1 452 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 453 del self.retry_args[job_id] 454 self.submitted_ids.remove(job_id) 455 if 'time_check' in args: 456 del args['time_check'] 457 if job_id in self.id_to_packet: 458 self.id_to_packet[job_id].remove_one() 459 args['packet_member'] = self.id_to_packet[job_id] 460 del self.id_to_packet[job_id] 461 self.cluster_submit(**args) 462 else: 463 self.submit2(**args) 464 return 'resubmit' 465 return 'done'
466 467 @check_interupt()
468 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 469 stderr=None, log=None, required_output=[], nb_submit=0, 470 input_files=[], output_files=[]):
471 """launch one job on the cluster and wait for it""" 472 473 special_output = False # tag for concatenate the error with the output. 474 if stderr == -2 and stdout: 475 #We are suppose to send the output to stdout 476 special_output = True 477 stderr = stdout + '.err' 478 479 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 480 required_output=required_output, input_files=input_files, 481 output_files=output_files) 482 483 if self.options['cluster_type']=='htcaas2': 484 if self.submitted == self.submitted_ids[-1]: 485 id = self.metasubmit(self) 486 487 frame = inspect.currentframe() 488 args, _, _, values = inspect.getargvalues(frame) 489 args = dict([(i, values[i]) for i in args if i != 'self']) 490 self.retry_args[id] = args 491 492 nb_wait=0 493 while 1: 494 nb_wait+=1 495 status = self.control_one_job(id) 496 if not status in ['R','I']: 497 status = self.check_termination(id) 498 if status in ['wait']: 499 time.sleep(30) 500 continue 501 elif status in ['resubmit']: 502 id = self.submitted_ids[0] 503 time.sleep(30) 504 continue 505 #really stop! 506 time.sleep(30) #security to ensure that the file are really written on the disk 507 break 508 time.sleep(self.options['cluster_status_update'][1]) 509 510 if required_output: 511 status = self.check_termination(id) 512 if status == 'wait': 513 run += 1 514 elif status == 'resubmit': 515 idle += 1 516 517 518 if special_output: 519 # combine the stdout and the stderr 520 #wait up to 50 s to see if those files exists 521 for i in range(5): 522 if os.path.exists(stdout): 523 if not os.path.exists(stderr): 524 time.sleep(5) 525 if os.path.exists(stderr): 526 err_text = open(stderr).read() 527 if not err_text: 528 return 529 logger.warning(err_text) 530 text = open(stdout).read() 531 open(stdout,'w').write(text + err_text) 532 else: 533 return 534 time.sleep(10)
535
536 - def remove(self, *args, **opts):
537 """ """ 538 logger.warning("""This cluster didn't support job removal, 539 the jobs are still running on the cluster.""")
540 541 @store_input()
542 - def metasubmit(self, me_dir):
543 logger.warning("""This cluster didn't support metajob submit.""") 544 return 0
545
546 - def modify_interface(self, run_interface):
547 """routine which allow to modify the run_card/mg5cmd object to change the 548 default behavior of the runs. 549 This is called at the time of the compilation of the run_card. 550 Note that this function can be called multiple times by run. 551 """ 552 #run_card = run_interface.run_card 553 return
554
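# Illustrative sketch (not part of the module): a new scheduler backend only
# has to subclass Cluster and provide submit(), control_one_job() and remove();
# control(), wait() and check_termination() from the base class then work
# unchanged.  The 'qtoy-*' commands and their output format are hypothetical.
class _ToyCluster(Cluster):
    """Skeleton backend for a fictitious 'qtoy' scheduler."""
    name = 'toy'
    job_id = 'TOY_JOBID'

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        out = misc.Popen(['qtoy-submit', prog] + [str(a) for a in argument],
                         cwd=cwd, stdout=subprocess.PIPE).communicate()[0]
        id = out.decode().strip()          # assume qtoy-submit prints the job id
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        out = misc.Popen(['qtoy-status', str(id)],
                         stdout=subprocess.PIPE).communicate()[0]
        # map the (hypothetical) scheduler states onto I(dle)/R(unning)/F(inished)
        return {'queued': 'I', 'running': 'R'}.get(out.decode().strip(), 'F')

    @multiple_try()
    def remove(self, *args, **opts):
        if self.submitted_ids:
            misc.Popen(['qtoy-cancel'] + self.submitted_ids)
            self.submitted_ids = []
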
555 -class Packet(object):
556 """ an object for handling packet of job, it is designed to be thread safe 557 """ 558
559 - def __init__(self, name, fct, args, opts={}):
560 import six.moves.queue 561 import threading 562 self.queue = six.moves.queue.Queue() 563 self.tag = name 564 self.fct = fct 565 self.args = args 566 self.opts = opts 567 self.done = threading.Event()
568
569 - def put(self, *args, **opts):
570 self.queue.put(*args, **opts)
571 572 append = put 573
574 - def remove_one(self):
575 self.queue.get(True) 576 self.queue.task_done() 577 return self.queue.qsize()
578
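# Illustrative sketch (not part of the module): Packet groups several cluster
# jobs and fires a callback once the last one has terminated.  Submitting with
# the same packet_member registers every job id in the packet; when
# check_termination() has removed the final id, packet.fct(*packet.args) is
# executed.  The cluster instance, executable name and run directories below
# are hypothetical.
def _example_packet_submission(my_cluster):
    def combine_channel(channel):
        logger.info('all jobs for channel %s are done', channel)

    packet = Packet('combine_G1', combine_channel, ('G1',))
    for run_dir in ('SubProcesses/P1_gg_ttx/G1', 'SubProcesses/P1_gg_ttx/G2'):
        my_cluster.cluster_submit('ajob1', cwd=run_dir, packet_member=packet)
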
579 -class MultiCore(Cluster):
580 """class for dealing with the submission in multiple node""" 581 582 job_id = "$" 583
584 - def __init__(self, *args, **opt):
585 """Init the cluster """ 586 587 588 super(MultiCore, self).__init__(self, *args, **opt) 589 590 import six.moves.queue 591 import threading 592 import six.moves._thread 593 self.queue = six.moves.queue.Queue() # list of job to do 594 self.done = six.moves.queue.Queue() # list of job finisned 595 self.submitted = six.moves.queue.Queue() # one entry by job submitted 596 self.stoprequest = threading.Event() #flag to ensure everything to close 597 self.demons = [] 598 self.nb_done =0 599 if 'nb_core' in opt: 600 self.nb_core = opt['nb_core'] 601 elif isinstance(args[0],int): 602 self.nb_core = args[0] 603 else: 604 self.nb_core = 1 605 self.update_fct = None 606 607 self.lock = threading.Event() # allow nice lock of the main thread 608 self.pids = six.moves.queue.Queue() # allow to clean jobs submit via subprocess 609 self.done_pid = [] # list of job finisned 610 self.done_pid_queue = six.moves.queue.Queue() 611 self.fail_msg = None 612 613 # starting the worker node 614 for _ in range(self.nb_core): 615 self.start_demon()
616 617
618 - def start_demon(self):
619 import threading 620 t = threading.Thread(target=self.worker) 621 t.daemon = True 622 t.start() 623 self.demons.append(t)
624 625
626 - def worker(self):
627 import six.moves.queue 628 import six.moves._thread 629 while not self.stoprequest.isSet(): 630 try: 631 args = self.queue.get() 632 tag, exe, arg, opt = args 633 try: 634 # check for executable case 635 if isinstance(exe,str): 636 if os.path.exists(exe) and not exe.startswith('/'): 637 exe = './' + exe 638 if isinstance(opt['stdout'],str): 639 opt['stdout'] = open(opt['stdout'],'w') 640 if opt['stderr'] == None: 641 opt['stderr'] = subprocess.STDOUT 642 if arg: 643 proc = misc.Popen([exe] + arg, **opt) 644 else: 645 proc = misc.Popen(exe, **opt) 646 pid = proc.pid 647 self.pids.put(pid) 648 proc.wait() 649 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet(): 650 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 651 (' '.join([exe]+arg), proc.returncode) 652 logger.warning(fail_msg) 653 self.stoprequest.set() 654 self.remove(fail_msg) 655 # handle the case when this is a python function. Note that 656 # this use Thread so they are NO built-in parralelization this is 657 # going to work on a single core! (but this is fine for IO intensive 658 # function. for CPU intensive fct this will slow down the computation 659 else: 660 pid = tag 661 self.pids.put(pid) 662 # the function should return 0 if everything is fine 663 # the error message otherwise 664 returncode = exe(*arg, **opt) 665 if returncode != 0: 666 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode) 667 self.stoprequest.set() 668 self.remove("fct %s does not return 0:\n %s" % (exe, returncode)) 669 except Exception as error: 670 self.fail_msg = sys.exc_info() 671 logger.warning(str(error)) 672 self.stoprequest.set() 673 self.remove(error) 674 675 if __debug__: 676 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]) 677 678 self.queue.task_done() 679 self.done.put(tag) 680 self.done_pid_queue.put(pid) 681 #release the mother to print the status on the screen 682 try: 683 self.lock.set() 684 except six.moves._thread.error: 685 continue 686 except six.moves.queue.Empty: 687 continue
688 689 690 691
692 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 693 log=None, required_output=[], nb_submit=0):
694 """submit a job on multicore machine""" 695 696 tag = (prog, tuple(argument), cwd, nb_submit) 697 if isinstance(prog, str): 698 699 opt = {'cwd': cwd, 700 'stdout':stdout, 701 'stderr': stderr} 702 703 self.queue.put((tag, prog, argument, opt)) 704 self.submitted.put(1) 705 return tag 706 else: 707 # python function 708 self.queue.put((tag, prog, argument, {})) 709 self.submitted.put(1) 710 return tag
711
712 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 713 stderr=None, log=None, **opts):
714 """launch one job and wait for it""" 715 if isinstance(stdout, str): 716 stdout = open(stdout, 'w') 717 if isinstance(stderr, str): 718 stdout = open(stderr, 'w') 719 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
720
721 - def remove(self, error=None):
722 """Ensure that all thread are killed""" 723 724 # ensure the worker to stop 725 self.stoprequest.set() 726 if error and not self.fail_msg: 727 self.fail_msg = error 728 729 # cleaning the queue done_pid_queue and move them to done_pid 730 while not self.done_pid_queue.empty(): 731 pid = self.done_pid_queue.get() 732 self.done_pid.append(pid) 733 # self.done_pid_queue.task_done() 734 735 while not self.pids.empty(): 736 pid = self.pids.get() 737 self.pids.task_done() 738 if isinstance(pid, tuple): 739 continue 740 if pid in self.done_pid: 741 continue 742 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 743 % {'pid':pid} ) 744 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
745 746
747 - def wait(self, me_dir, update_status, update_first=None):
748 """Waiting that all the jobs are done. This function also control that 749 the submission by packet are handle correctly (i.e. submit the function)""" 750 751 import six.moves.queue 752 import threading 753 754 try: # to catch KeyBoardInterupt to see which kind of error to display 755 last_status = (0, 0, 0) 756 sleep_time = 1 757 use_lock = True 758 first = True 759 while True: 760 force_one_more_loop = False # some security 761 762 # Loop over the job tagged as done to check if some packet of jobs 763 # are finished in case, put the associate function in the queue 764 while self.done.qsize(): 765 try: 766 tag = self.done.get(True, 1) 767 except six.moves.queue.Empty: 768 pass 769 else: 770 if self.id_to_packet and tuple(tag) in self.id_to_packet: 771 packet = self.id_to_packet[tuple(tag)] 772 remaining = packet.remove_one() 773 if remaining == 0: 774 # fully ensure that the packet is finished (thread safe) 775 packet.queue.join() 776 self.submit(packet.fct, packet.args) 777 force_one_more_loop = True 778 self.nb_done += 1 779 self.done.task_done() 780 781 # Get from the various queue the Idle/Done/Running information 782 # Those variable should be thread safe but approximate. 783 Idle = self.queue.qsize() 784 Done = self.nb_done + self.done.qsize() 785 Running = max(0, self.submitted.qsize() - Idle - Done) 786 787 if Idle + Running <= 0 and not force_one_more_loop: 788 update_status(Idle, Running, Done) 789 # Going the quit since everything is done 790 # Fully Ensure that everything is indeed done. 791 self.queue.join() 792 break 793 794 if (Idle, Running, Done) != last_status: 795 if first and update_first: 796 update_first(Idle, Running, Done) 797 first = False 798 else: 799 update_status(Idle, Running, Done) 800 last_status = (Idle, Running, Done) 801 802 # cleaning the queue done_pid_queue and move them to done_pid 803 while not self.done_pid_queue.empty(): 804 pid = self.done_pid_queue.get() 805 self.done_pid.append(pid) 806 self.done_pid_queue.task_done() 807 808 809 # Define how to wait for the next iteration 810 if use_lock: 811 # simply wait that a worker release the lock 812 use_lock = self.lock.wait(300) 813 self.lock.clear() 814 if not use_lock and Idle > 0: 815 use_lock = True 816 else: 817 # to be sure that we will never fully lock at the end pass to 818 # a simple time.sleep() 819 time.sleep(sleep_time) 820 sleep_time = min(sleep_time + 2, 180) 821 if update_first: 822 update_first(Idle, Running, Done) 823 824 if self.stoprequest.isSet(): 825 if isinstance(self.fail_msg, Exception): 826 raise self.fail_msg 827 elif isinstance(self.fail_msg, str): 828 raise Exception(self.fail_msg) 829 else: 830 misc.sprint(self.fail_msg) 831 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]) 832 # reset variable for next submission 833 try: 834 self.lock.clear() 835 except Exception: 836 pass 837 self.done = six.moves.queue.Queue() 838 self.done_pid = [] 839 self.done_pid_queue = six.moves.queue.Queue() 840 self.nb_done = 0 841 self.submitted = six.moves.queue.Queue() 842 self.pids = six.moves.queue.Queue() 843 self.stoprequest.clear() 844 845 except KeyboardInterrupt: 846 # if one of the node fails -> return that error 847 if isinstance(self.fail_msg, Exception): 848 raise self.fail_msg 849 elif isinstance(self.fail_msg, str): 850 raise Exception(self.fail_msg) 851 elif self.fail_msg: 852 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]) 853 # else return orignal error 854 raise
855
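# Illustrative sketch (not part of the module): typical use of the MultiCore
# pool.  Two shell commands are queued on two worker threads and wait() blocks
# until both have finished, reporting progress through the callback.  The
# commands and the callback are hypothetical.
def _example_multicore_run():
    pool = MultiCore(2)                      # two worker threads
    pool.submit('/bin/echo', ['first job'])
    pool.submit('/bin/echo', ['second job'])
    pool.wait(None, lambda idle, running, done:
              logger.info('%s idle, %s running, %s done', idle, running, done))
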
856 -class CondorCluster(Cluster):
857 """Basic class for dealing with cluster submission""" 858 859 name = 'condor' 860 job_id = 'CONDOR_ID' 861 862 863 864 @multiple_try()
865 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 866 required_output=[], nb_submit=0):
867 """Submit a job prog to a Condor cluster""" 868 869 text = """Executable = %(prog)s 870 output = %(stdout)s 871 error = %(stderr)s 872 log = %(log)s 873 %(argument)s 874 environment = CONDOR_ID=$(Cluster).$(Process) 875 Universe = vanilla 876 notification = Error 877 Initialdir = %(cwd)s 878 %(requirement)s 879 getenv=True 880 queue 1 881 """ 882 883 if self.cluster_queue not in ['None', None]: 884 requirement = 'Requirements = %s=?=True' % self.cluster_queue 885 else: 886 requirement = '' 887 888 if cwd is None: 889 cwd = os.getcwd() 890 if stdout is None: 891 stdout = '/dev/null' 892 if stderr is None: 893 stderr = '/dev/null' 894 if log is None: 895 log = '/dev/null' 896 if not os.path.exists(prog): 897 prog = os.path.join(cwd, prog) 898 if argument: 899 argument = 'Arguments = %s' % ' '.join(argument) 900 else: 901 argument = '' 902 903 904 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 905 'stderr': stderr,'log': log,'argument': argument, 906 'requirement': requirement} 907 908 #open('submit_condor','w').write(text % dico) 909 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE, 910 stdin=subprocess.PIPE) 911 output, _ = a.communicate((text % dico).encode()) 912 #output = a.stdout.read() 913 #Submitting job(s). 914 #Logging submit event(s). 915 #1 job(s) submitted to cluster 2253622. 916 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 917 output = output.decode() 918 try: 919 id = pat.search(output).groups()[0] 920 except: 921 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 922 % output) 923 self.submitted += 1 924 self.submitted_ids.append(id) 925 return id
926 927 @store_input() 928 @multiple_try()
929 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 930 log=None, input_files=[], output_files=[], required_output=[], 931 nb_submit=0):
932 """Submit the job on the cluster NO SHARE DISK 933 input/output file should be give relative to cwd 934 """ 935 936 if not required_output and output_files: 937 required_output = output_files 938 939 if (input_files == [] == output_files): 940 return self.submit(prog, argument, cwd, stdout, stderr, log, 941 required_output=required_output, nb_submit=nb_submit) 942 943 text = """Executable = %(prog)s 944 output = %(stdout)s 945 error = %(stderr)s 946 log = %(log)s 947 %(argument)s 948 should_transfer_files = YES 949 when_to_transfer_output = ON_EXIT 950 transfer_input_files = %(input_files)s 951 %(output_files)s 952 Universe = vanilla 953 notification = Error 954 Initialdir = %(cwd)s 955 %(requirement)s 956 getenv=True 957 queue 1 958 """ 959 960 if self.cluster_queue not in ['None', None]: 961 requirement = 'Requirements = %s=?=True' % self.cluster_queue 962 else: 963 requirement = '' 964 965 if cwd is None: 966 cwd = os.getcwd() 967 if stdout is None: 968 stdout = '/dev/null' 969 if stderr is None: 970 stderr = '/dev/null' 971 if log is None: 972 log = '/dev/null' 973 if not os.path.exists(prog): 974 prog = os.path.join(cwd, prog) 975 if argument: 976 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 977 else: 978 argument = '' 979 # input/output file treatment 980 if input_files: 981 input_files = ','.join(input_files) 982 else: 983 input_files = '' 984 if output_files: 985 output_files = 'transfer_output_files = %s' % ','.join(output_files) 986 else: 987 output_files = '' 988 989 990 991 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 992 'stderr': stderr,'log': log,'argument': argument, 993 'requirement': requirement, 'input_files':input_files, 994 'output_files':output_files} 995 996 #open('submit_condor','w').write(text % dico) 997 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE, 998 stdin=subprocess.PIPE) 999 output, _ = a.communicate((text % dico).encode()) 1000 #output = a.stdout.read() 1001 #Submitting job(s). 1002 #Logging submit event(s). 1003 #1 job(s) submitted to cluster 2253622. 1004 output = output.decode() 1005 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 1006 try: 1007 id = pat.search(output).groups()[0] 1008 except: 1009 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1010 % output) 1011 self.submitted += 1 1012 self.submitted_ids.append(id) 1013 return id
1014 1015 1016 1017 1018 1019 @multiple_try(nb_try=10, sleep=10)
1020 - def control_one_job(self, id):
1021 """ control the status of a single job with it's cluster id """ 1022 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 1023 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1024 stderr=subprocess.PIPE) 1025 1026 error = status.stderr.read().decode() 1027 if status.returncode or error: 1028 raise ClusterManagmentError('condor_q returns error: %s' % error) 1029 1030 return status.stdout.readline().decode().strip()
1031 1032 jobstatus = {'0':'U', '1':'I','2':'R','3':'X','4':'C','5':'H','6':'E'} 1033 @check_interupt() 1034 @multiple_try(nb_try=10, sleep=10)
1035 - def control(self, me_dir):
1036 """ control the status of a single job with it's cluster id """ 1037 1038 if not self.submitted_ids: 1039 return 0, 0, 0, 0 1040 1041 packet = 15000 1042 idle, run, fail = 0, 0, 0 1043 ongoing = [] 1044 for i in range(1+(len(self.submitted_ids)-1)//packet): 1045 start = i * packet 1046 stop = (i+1) * packet 1047 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 1048 " -format \"%d \" ClusterId " + \ 1049 " -format \"%d\\n\" JobStatus " 1050 1051 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1052 stderr=subprocess.PIPE) 1053 error = status.stderr.read().decode() 1054 if status.returncode or error: 1055 raise ClusterManagmentError('condor_q returns error: %s' % error) 1056 1057 for line in status.stdout: 1058 id, status = line.decode().strip().split() 1059 status = self.jobstatus[status] 1060 ongoing.append(id) 1061 if status in ['I','U']: 1062 idle += 1 1063 elif status == 'R': 1064 run += 1 1065 elif status != 'C': 1066 fail += 1 1067 1068 for id in list(self.submitted_ids): 1069 if id not in ongoing: 1070 status = self.check_termination(id) 1071 if status == 'wait': 1072 run += 1 1073 elif status == 'resubmit': 1074 idle += 1 1075 1076 return idle, run, self.submitted - (idle+run+fail), fail
1077 1078 @multiple_try()
1079 - def remove(self, *args, **opts):
1080 """Clean the jobson the cluster""" 1081 1082 if not self.submitted_ids: 1083 return 1084 cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 1085 1086 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1087 self.submitted_ids = []
1088
1089 -class PBSCluster(Cluster):
1090 """Basic class for dealing with cluster submission""" 1091 1092 name = 'pbs' 1093 job_id = 'PBS_JOBID' 1094 idle_tag = ['Q'] 1095 running_tag = ['T','E','R'] 1096 complete_tag = ['C'] 1097 1098 maximum_submited_jobs = 2500 1099 1100 @multiple_try()
1101 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1102 required_output=[], nb_submit=0):
1103 """Submit a job prog to a PBS cluster""" 1104 1105 me_dir = self.get_jobs_identifier(cwd, prog) 1106 1107 if len(self.submitted_ids) > self.maximum_submited_jobs: 1108 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1109 self.wait(me_dir, fct, self.maximum_submited_jobs) 1110 1111 1112 text = "" 1113 if cwd is None: 1114 cwd = os.getcwd() 1115 else: 1116 text = " cd %s;" % cwd 1117 if stdout is None: 1118 stdout = '/dev/null' 1119 if stderr is None: 1120 stderr = '/dev/null' 1121 elif stderr == -2: # -2 is subprocess.STDOUT 1122 stderr = stdout 1123 if log is None: 1124 log = '/dev/null' 1125 1126 if not os.path.isabs(prog): 1127 text += "./%s" % prog 1128 else: 1129 text+= prog 1130 1131 if argument: 1132 text += ' ' + ' '.join(argument) 1133 1134 command = ['qsub','-o', stdout, 1135 '-N', me_dir, 1136 '-e', stderr, 1137 '-V'] 1138 1139 if self.cluster_queue and self.cluster_queue != 'None': 1140 command.extend(['-q', self.cluster_queue]) 1141 1142 a = misc.Popen(command, stdout=subprocess.PIPE, 1143 stderr=subprocess.STDOUT, 1144 stdin=subprocess.PIPE, cwd=cwd) 1145 1146 output = a.communicate(text.encode())[0].decode() 1147 id = output.split('.')[0] 1148 if not id.isdigit() or a.returncode !=0: 1149 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1150 % output) 1151 1152 self.submitted += 1 1153 self.submitted_ids.append(id) 1154 return id
1155 1156 @multiple_try()
1157 - def control_one_job(self, id):
1158 """ control the status of a single job with it's cluster id """ 1159 cmd = 'qstat '+str(id) 1160 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1161 stderr=subprocess.STDOUT) 1162 1163 for line in status.stdout: 1164 line = line.decode().strip() 1165 if 'cannot connect to server' in line or 'cannot read reply' in line: 1166 raise ClusterManagmentError('server disconnected') 1167 if 'Unknown' in line: 1168 return 'F' 1169 elif line.startswith(str(id)): 1170 jobstatus = line.split()[4] 1171 else: 1172 jobstatus="" 1173 1174 if status.returncode != 0 and status.returncode is not None: 1175 raise ClusterManagmentError('server fails in someway (errorcode %s)' % status.returncode) 1176 if jobstatus in self.idle_tag: 1177 return 'I' 1178 elif jobstatus in self.running_tag: 1179 return 'R' 1180 return 'F'
1181 1182 1183 @multiple_try()
1184 - def control(self, me_dir):
1185 """ control the status of a single job with it's cluster id """ 1186 cmd = "qstat" 1187 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1188 1189 me_dir = self.get_jobs_identifier(me_dir) 1190 1191 ongoing = [] 1192 1193 idle, run, fail = 0, 0, 0 1194 for line in status.stdout: 1195 line = line.decode() 1196 if 'cannot connect to server' in line or 'cannot read reply' in line: 1197 raise ClusterManagmentError('server disconnected') 1198 if me_dir in line: 1199 ongoing.append(line.split()[0].split('.')[0]) 1200 status2 = line.split()[4] 1201 if status2 in self.idle_tag: 1202 idle += 1 1203 elif status2 in self.running_tag: 1204 run += 1 1205 elif status2 in self.complete_tag: 1206 if not self.check_termination(line.split()[0].split('.')[0]): 1207 idle += 1 1208 else: 1209 fail += 1 1210 1211 if status.returncode != 0 and status.returncode is not None: 1212 raise ClusterManagmentError('server fails in someway (errorcode %s)' % status.returncode) 1213 1214 for id in list(self.submitted_ids): 1215 if id not in ongoing: 1216 status2 = self.check_termination(id) 1217 if status2 == 'wait': 1218 run += 1 1219 elif status2 == 'resubmit': 1220 idle += 1 1221 1222 return idle, run, self.submitted - (idle+run+fail), fail
1223 1224 @multiple_try()
1225 - def remove(self, *args, **opts):
1226 """Clean the jobs on the cluster""" 1227 1228 if not self.submitted_ids: 1229 return 1230 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1231 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1232 self.submitted_ids = []
1233
1234 1235 -class SGECluster(Cluster):
1236 """Basic class for dealing with cluster submission""" 1237 # Class written by Arian Abrahantes. 1238 1239 name = 'sge' 1240 job_id = 'JOB_ID' 1241 idle_tag = ['qw', 'hqw','hRqw','w'] 1242 running_tag = ['r','t','Rr','Rt'] 1243 identifier_length = 10 1244
1245 - def def_get_path(self,location):
1246 """replace string for path issues""" 1247 location = os.path.realpath(location) 1248 homePath = os.getenv("HOME") 1249 if homePath: 1250 location = location.replace(homePath,'$HOME') 1251 return location
1252 1253 @multiple_try()
1254 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1255 required_output=[], nb_submit=0):
1256 """Submit a job prog to an SGE cluster""" 1257 1258 me_dir = self.get_jobs_identifier(cwd, prog) 1259 1260 1261 if cwd is None: 1262 #cwd = os.getcwd() 1263 cwd = self.def_get_path(os.getcwd()) 1264 cwd1 = self.def_get_path(cwd) 1265 text = " cd %s;" % cwd1 1266 if stdout is None: 1267 stdout = '/dev/null' 1268 else: 1269 stdout = self.def_get_path(stdout) 1270 if stderr is None: 1271 stderr = '/dev/null' 1272 elif stderr == -2: # -2 is subprocess.STDOUT 1273 stderr = stdout 1274 else: 1275 stderr = self.def_get_path(stderr) 1276 1277 if log is None: 1278 log = '/dev/null' 1279 else: 1280 log = self.def_get_path(log) 1281 1282 text += prog 1283 if argument: 1284 text += ' ' + ' '.join(argument) 1285 1286 #if anything slips through argument 1287 #print "!=== inteded change ",text.replace('/srv/nfs','') 1288 #text = text.replace('/srv/nfs','') 1289 homePath = os.getenv("HOME") 1290 if homePath: 1291 text = text.replace(homePath,'$HOME') 1292 1293 logger.debug("!=== input %s" % text) 1294 logger.debug("!=== output %s" % stdout) 1295 logger.debug("!=== error %s" % stderr) 1296 logger.debug("!=== logs %s" % log) 1297 1298 command = ['qsub','-o', stdout, 1299 '-N', me_dir, 1300 '-e', stderr, 1301 '-V'] 1302 1303 if self.cluster_queue and self.cluster_queue != 'None': 1304 command.extend(['-q', self.cluster_queue]) 1305 1306 a = misc.Popen(command, stdout=subprocess.PIPE, 1307 stderr=subprocess.STDOUT, 1308 stdin=subprocess.PIPE, cwd=cwd) 1309 1310 output = a.communicate(text.encode())[0].decode() 1311 id = output.split(' ')[2] 1312 if not id.isdigit(): 1313 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1314 % output) 1315 self.submitted += 1 1316 self.submitted_ids.append(id) 1317 logger.debug(output) 1318 1319 return id
1320 1321 @multiple_try()
1322 - def control_one_job(self, id):
1323 """ control the status of a single job with it's cluster id """ 1324 #cmd = 'qstat '+str(id) 1325 cmd = 'qstat ' 1326 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1327 for line in status.stdout: 1328 line = line.decode() 1329 #print "!==",line 1330 #line = line.strip() 1331 #if 'Unknown' in line: 1332 # return 'F' 1333 #elif line.startswith(str(id)): 1334 # status = line.split()[4] 1335 if str(id) in line: 1336 status = line.split()[4] 1337 #print "!=status", status 1338 if status in self.idle_tag: 1339 return 'I' 1340 elif status in self.running_tag: 1341 return 'R' 1342 return 'F'
1343 1344 @multiple_try()
1345 - def control(self, me_dir):
1346 """ control the status of a single job with it's cluster id """ 1347 cmd = "qstat " 1348 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1349 1350 me_dir = self.get_jobs_identifier(me_dir) 1351 1352 finished = list(self.submitted_ids) 1353 1354 idle, run, fail = 0, 0, 0 1355 for line in status.stdout: 1356 line = line.decode() 1357 if me_dir in line: 1358 id,_,_,_,status = line.split()[:5] 1359 if status in self.idle_tag: 1360 idle += 1 1361 finished.remove(id) 1362 elif status in self.running_tag: 1363 run += 1 1364 finished.remove(id) 1365 else: 1366 logger.debug(line) 1367 fail += 1 1368 finished.remove(id) 1369 1370 for id in finished: 1371 self.check_termination(id) 1372 1373 return idle, run, self.submitted - (idle+run+fail), fail
1374 1375 1376 1377 @multiple_try()
1378 - def remove(self, *args, **opts):
1379 """Clean the jobs on the cluster""" 1380 1381 if not self.submitted_ids: 1382 return 1383 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1384 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1385 self.submitted_ids = []
1386
1387 1388 -class LSFCluster(Cluster):
1389 """Basic class for dealing with cluster submission""" 1390 1391 name = 'lsf' 1392 job_id = 'LSB_JOBID' 1393 1394 @multiple_try()
1395 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1396 required_output=[], nb_submit=0):
1397 """Submit the job prog to an LSF cluster""" 1398 1399 1400 me_dir = self.get_jobs_identifier(cwd, prog) 1401 1402 text = "" 1403 command = ['bsub', '-C0', '-J', me_dir] 1404 if cwd is None: 1405 cwd = os.getcwd() 1406 else: 1407 text = " cd %s;" % cwd 1408 if stdout and isinstance(stdout, str): 1409 command.extend(['-o', stdout]) 1410 if stderr and isinstance(stdout, str): 1411 command.extend(['-e', stderr]) 1412 elif stderr == -2: # -2 is subprocess.STDOUT 1413 pass 1414 if log is None: 1415 log = '/dev/null' 1416 1417 text += prog 1418 if argument: 1419 text += ' ' + ' '.join(argument) 1420 1421 if self.cluster_queue and self.cluster_queue != 'None': 1422 command.extend(['-q', self.cluster_queue]) 1423 1424 a = misc.Popen(command, stdout=subprocess.PIPE, 1425 stderr=subprocess.STDOUT, 1426 stdin=subprocess.PIPE, cwd=cwd) 1427 1428 output = a.communicate(text.encode())[0].decode() 1429 #Job <nnnn> is submitted to default queue <normal>. 1430 try: 1431 id = output.split('>',1)[0].split('<')[1] 1432 except: 1433 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1434 % output) 1435 if not id.isdigit(): 1436 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1437 % output) 1438 self.submitted += 1 1439 self.submitted_ids.append(id) 1440 return id
1441 1442 1443 @multiple_try()
1444 - def control_one_job(self, id):
1445 """ control the status of a single job with it's cluster id """ 1446 1447 cmd = 'bjobs '+str(id) 1448 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1449 1450 for line in status.stdout: 1451 line = line.decode().strip().upper() 1452 if 'JOBID' in line: 1453 continue 1454 elif str(id) not in line: 1455 continue 1456 status = line.split()[2] 1457 if status == 'RUN': 1458 return 'R' 1459 elif status == 'PEND': 1460 return 'I' 1461 elif status == 'DONE': 1462 return 'F' 1463 else: 1464 return 'H' 1465 return 'F'
1466 1467 @multiple_try()
1468 - def control(self, me_dir):
1469 """ control the status of a single job with it's cluster id """ 1470 1471 if not self.submitted_ids: 1472 return 0, 0, 0, 0 1473 1474 cmd = "bjobs " + ' '.join(self.submitted_ids) 1475 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1476 1477 jobstatus = {} 1478 for line in status.stdout: 1479 line = line.decode().strip() 1480 if 'JOBID' in line: 1481 continue 1482 splitline = line.split() 1483 id = splitline[0] 1484 if id not in self.submitted_ids: 1485 continue 1486 jobstatus[id] = splitline[2] 1487 1488 idle, run, fail = 0, 0, 0 1489 for id in self.submitted_ids[:]: 1490 if id in jobstatus: 1491 status = jobstatus[id] 1492 else: 1493 status = 'MISSING' 1494 if status == 'RUN': 1495 run += 1 1496 elif status == 'PEND': 1497 idle += 1 1498 else: 1499 status = self.check_termination(id) 1500 if status == 'wait': 1501 run += 1 1502 elif status == 'resubmit': 1503 idle += 1 1504 1505 return idle, run, self.submitted - (idle+run+fail), fail
1506 1507 @multiple_try()
1508 - def remove(self, *args,**opts):
1509 """Clean the jobs on the cluster""" 1510 1511 if not self.submitted_ids: 1512 return 1513 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1514 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1515 self.submitted_ids = []
1516
1517 -class GECluster(Cluster):
1518 """Class for dealing with cluster submission on a GE cluster""" 1519 1520 name = 'ge' 1521 job_id = 'JOB_ID' 1522 idle_tag = ['qw'] 1523 running_tag = ['r'] 1524 1525 @multiple_try()
1526 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1527 required_output=[], nb_submit=0):
1528 """Submit a job prog to a GE cluster""" 1529 1530 text = "" 1531 if cwd is None: 1532 cwd = os.getcwd() 1533 else: 1534 text = " cd %s; bash " % cwd 1535 if stdout is None: 1536 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1537 if stderr is None: 1538 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1539 elif stderr == -2: # -2 is subprocess.STDOUT 1540 stderr = stdout 1541 if log is None: 1542 log = '/dev/null' 1543 1544 text += prog 1545 if argument: 1546 text += ' ' + ' '.join(argument) 1547 text += '\n' 1548 tmp_submit = os.path.join(cwd, 'tmp_submit') 1549 open(tmp_submit,'w').write(text) 1550 1551 a = misc.Popen(['qsub','-o', stdout, 1552 '-e', stderr, 1553 tmp_submit], 1554 stdout=subprocess.PIPE, 1555 stderr=subprocess.STDOUT, 1556 stdin=subprocess.PIPE, cwd=cwd) 1557 1558 output = a.communicate()[0].decode() 1559 #Your job 874511 ("test.sh") has been submitted 1560 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1561 try: 1562 id = pat.search(output).groups()[0] 1563 except: 1564 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1565 % output) 1566 self.submitted += 1 1567 self.submitted_ids.append(id) 1568 return id
1569 1570 @multiple_try()
1571 - def control_one_job(self, id):
1572 """ control the status of a single job with it's cluster id """ 1573 cmd = 'qstat | grep '+str(id) 1574 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1575 if not status: 1576 return 'F' 1577 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1578 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1579 stat = '' 1580 for line in status.stdout.read().decode().split('\n'): 1581 if not line: 1582 continue 1583 line = line.strip() 1584 try: 1585 groups = pat.search(line).groups() 1586 except: 1587 raise ClusterManagmentError('bad syntax for stat: \n\"%s\"' % line) 1588 if groups[0] != id: continue 1589 stat = groups[1] 1590 if not stat: 1591 return 'F' 1592 if stat in self.idle_tag: 1593 return 'I' 1594 if stat in self.running_tag: 1595 return 'R'
1596 1597 @multiple_try()
1598 - def control(self, me_dir=None):
1599 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1600 if not self.submitted_ids: 1601 return 0, 0, 0, 0 1602 idle, run, fail = 0, 0, 0 1603 ongoing = [] 1604 for statusflag in ['p', 'r', 'sh']: 1605 cmd = 'qstat -s %s' % statusflag 1606 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1607 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1608 pat = re.compile("^(\d+)") 1609 for line in status.stdout.read().decode().split('\n'): 1610 line = line.strip() 1611 try: 1612 id = pat.search(line).groups()[0] 1613 except Exception: 1614 pass 1615 else: 1616 if id not in self.submitted_ids: 1617 continue 1618 ongoing.append(id) 1619 if statusflag == 'p': 1620 idle += 1 1621 if statusflag == 'r': 1622 run += 1 1623 if statusflag == 'sh': 1624 fail += 1 1625 for id in list(self.submitted_ids): 1626 if id not in ongoing: 1627 self.check_termination(id) 1628 #self.submitted_ids = ongoing 1629 1630 return idle, run, self.submitted - idle - run - fail, fail
1631 1632 @multiple_try()
1633 - def remove(self, *args, **opts):
1634 """Clean the jobs on the cluster""" 1635 1636 if not self.submitted_ids: 1637 return 1638 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1639 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1640 self.submitted_ids = []
1641
1642 -def asyncrone_launch(exe, cwd=None, stdout=None, argument = [], **opt):
1643 """start a computation and not wait for it to finish. 1644 this fonction returns a lock which is locked as long as the job is 1645 running.""" 1646 1647 mc = MultiCore(1) 1648 mc.submit(exe, argument, cwd, stdout, **opt) 1649 mc.need_waiting = True 1650 return mc.lock
1651
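# Illustrative sketch (not part of the module): asyncrone_launch() starts the
# job on a one-core MultiCore pool and returns its lock (a threading.Event),
# which is set once the job has finished; the caller can keep working and block
# on it later.  The command is hypothetical.
def _example_async_launch():
    lock = asyncrone_launch('/bin/sleep', argument=['2'])
    # ... do other work while the job runs in the background ...
    lock.wait()          # blocks until the background job has completed
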
1652 1653 -class SLURMCluster(Cluster):
1654 """Basic class for dealing with cluster submission""" 1655 1656 name = 'slurm' 1657 job_id = 'SLURM_JOBID' 1658 idle_tag = ['Q','PD','S','CF'] 1659 running_tag = ['R', 'CG'] 1660 complete_tag = ['C'] 1661 identifier_length = 8 1662 1663 @multiple_try()
1664 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1665 required_output=[], nb_submit=0):
1666 """Submit a job prog to a SLURM cluster""" 1667 1668 me_dir = self.get_jobs_identifier(cwd, prog) 1669 1670 1671 if cwd is None: 1672 cwd = os.getcwd() 1673 if stdout is None: 1674 stdout = '/dev/null' 1675 if stderr is None: 1676 stderr = '/dev/null' 1677 elif stderr == -2: # -2 is subprocess.STDOUT 1678 stderr = stdout 1679 if log is None: 1680 log = '/dev/null' 1681 1682 command = ['sbatch', '-o', stdout, 1683 '-J', me_dir, 1684 '-e', stderr, prog] + argument 1685 1686 if self.cluster_queue and self.cluster_queue != 'None': 1687 command.insert(1, '-p') 1688 command.insert(2, self.cluster_queue) 1689 1690 a = misc.Popen(command, stdout=subprocess.PIPE, 1691 stderr=subprocess.STDOUT, 1692 stdin=subprocess.PIPE, cwd=cwd) 1693 1694 output = a.communicate() 1695 output_arr = output[0].decode().split(' ') 1696 id = output_arr[3].rstrip() 1697 1698 if not id.isdigit(): 1699 id = re.findall('Submitted batch job ([\d\.]+)', ' '.join(output_arr)) 1700 1701 if not id or len(id)>1: 1702 raise ClusterManagmentError( 'fail to submit to the cluster: \n%s' \ 1703 % ('stdout: %s\nstderr %s' %(output[0],output[1]))) 1704 id = id[0] 1705 1706 1707 self.submitted += 1 1708 self.submitted_ids.append(id) 1709 return id
1710 1711 @multiple_try()
1712 - def control_one_job(self, id):
1713 """ control the status of a single job with it's cluster id """ 1714 cmd = 'squeue j'+str(id) 1715 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1716 stderr=open(os.devnull,'w')) 1717 1718 for line in status.stdout: 1719 line = line.decode().strip() 1720 if 'Invalid' in line: 1721 return 'F' 1722 elif line.startswith(str(id)): 1723 status = line.split()[4] 1724 if status in self.idle_tag: 1725 return 'I' 1726 elif status in self.running_tag: 1727 return 'R' 1728 return 'F'
1729 1730 @multiple_try()
1731 - def control(self, me_dir):
1732 """ control the status of a single job with it's cluster id """ 1733 cmd = "squeue" 1734 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE) 1735 1736 me_dir = self.get_jobs_identifier(me_dir) 1737 1738 idle, run, fail = 0, 0, 0 1739 ongoing=[] 1740 for line in pstatus.stdout: 1741 line = line.decode() 1742 if me_dir in line: 1743 id, _, _,_ , status,_ = line.split(None,5) 1744 ongoing.append(id) 1745 if status in self.idle_tag: 1746 idle += 1 1747 elif status in self.running_tag: 1748 run += 1 1749 elif status in self.complete_tag: 1750 status = self.check_termination(id) 1751 if status == 'wait': 1752 run += 1 1753 elif status == 'resubmit': 1754 idle += 1 1755 else: 1756 fail += 1 1757 1758 #control other finished job 1759 for id in list(self.submitted_ids): 1760 if id not in ongoing: 1761 status = self.check_termination(id) 1762 if status == 'wait': 1763 run += 1 1764 elif status == 'resubmit': 1765 idle += 1 1766 1767 1768 return idle, run, self.submitted - (idle+run+fail), fail
1769 1770 @multiple_try()
1771 - def remove(self, *args, **opts):
1772 """Clean the jobs on the cluster""" 1773 1774 if not self.submitted_ids: 1775 return 1776 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1777 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1778 self.submitted_ids = []
1779
1780 -class HTCaaSCluster(Cluster):
1781 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1782 1783 name= 'htcaas' 1784 job_id = 'HTCAAS_JOBID' 1785 idle_tag = ['waiting'] 1786 running_tag = ['preparing','running'] 1787 complete_tag = ['done'] 1788 1789 @store_input() 1790 @multiple_try()
1791 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1792 log=None, input_files=[], output_files=[], required_output=[], 1793 nb_submit=0):
1794 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1795 input/output file should be given as relative to CWd 1796 """ 1797 # To make workspace name(temp) 1798 cur_usr = os.getenv('USER') 1799 1800 if cwd is None: 1801 cwd = os.getcwd() 1802 1803 cwd_cp = cwd.rsplit("/",2) 1804 1805 if not stdout is None: 1806 print("stdout: %s" % stdout) 1807 1808 if not os.path.exists(prog): 1809 prog = os.path.join(cwd, prog) 1810 1811 if not required_output and output_files: 1812 required_output = output_files 1813 1814 logger.debug(prog) 1815 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1816 cwd_arg = cwd+"/arguments" 1817 temp = ' '.join([str(a) for a in argument]) 1818 arg_cmd="echo '"+temp+"' > " + cwd_arg 1819 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1820 if argument : 1821 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1822 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1823 id = a.stdout.read().strip() 1824 1825 else: 1826 cwd_arg = cwd+"/arguments" 1827 temp = ' '.join([str(a) for a in argument]) 1828 temp_file_name = "sub." + os.path.basename(prog) 1829 text = """#!/bin/bash 1830 MYPWD=%(cwd)s 1831 cd $MYPWD 1832 input_files=(%(input_files)s ) 1833 for i in ${input_files[@]} 1834 do 1835 chmod -f +x $i 1836 done 1837 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1838 """ 1839 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1840 'arguments': ' '.join([str(a) for a in argument]), 1841 'program': ' ' if '.py' in prog else 'bash'} 1842 1843 # writing a new script for the submission 1844 new_prog = pjoin(cwd, temp_file_name) 1845 open(new_prog, 'w').write(text % dico) 1846 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1847 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1848 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1849 id = a.stdout.read().strip() 1850 logger.debug(id) 1851 1852 nb_try=0 1853 nb_limit=5 1854 if not id.isdigit() : 1855 print("[ID is not digit]:" + id) 1856 1857 while not id.isdigit() : 1858 nb_try+=1 1859 print("[fail_retry]:"+ nb_try) 1860 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1861 id = a.stdout.read().strip() 1862 if nb_try > nb_limit : 1863 raise ClusterManagementError('fail to submit to the HTCaaS cluster: \n %s' % id) 1864 break 1865 1866 self.submitted += 1 1867 self.submitted_ids.append(id) 1868 1869 return id
1870 1871 @multiple_try(nb_try=10, sleep=5)
1872 - def control_one_job(self, id):
1873 """ control the status of a single job with it's cluster id """ 1874 1875 if id == 0 : 1876 status_out ='C' 1877 else : 1878 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1879 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1880 stderr=subprocess.PIPE) 1881 error = status.stderr.read().decode() 1882 if status.returncode or error: 1883 raise ClusterManagmentError('htcaas-job-submit returns error: %s' % error) 1884 status_out= status.stdout.read().decode().strip() 1885 status_out= status_out.split(":",1)[1] 1886 if status_out == 'waiting': 1887 status_out='I' 1888 elif status_out == 'preparing' or status_out == 'running': 1889 status_out = 'R' 1890 elif status_out != 'done': 1891 status_out = 'F' 1892 elif status_out == 'done': 1893 status_out = 'C' 1894 1895 return status_out
1896 1897 @multiple_try()
1898 - def control(self, me_dir):
1899 """ control the status of a single job with it's cluster id """ 1900 if not self.submitted_ids: 1901 logger.debug("self.submitted_ids not exists") 1902 return 0, 0, 0, 0 1903 1904 ongoing = [] 1905 idle, run, fail = 0, 0, 0 1906 1907 start = self.submitted_ids[0] 1908 end = self.submitted_ids[-1] 1909 1910 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)#+" -ac" 1911 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1912 1913 for line in status.stdout: 1914 #ongoing.append(line.split()[0].strip()) 1915 status2 = line.decode().split()[-1] 1916 if status2 != 'null' or line.split()[0].strip() != '0': 1917 ongoing.append(line.split()[0].strip()) 1918 logger.debug("["+line.split()[0].strip()+"]"+status2) 1919 if status2 != 'null' or line.split()[0].strip() != '0': 1920 idle += 1 1921 elif status2 in self.idle_tag: 1922 idle += 1 1923 elif status2 in self.running_tag: 1924 run += 1 1925 elif status2 in self.complete_tag: 1926 if not self.check_termination(line.split()[0]): 1927 idle +=1 1928 else: 1929 fail += 1 1930 1931 return idle, run, self.submitted - (idle+run+fail), fail
1932 1933 @multiple_try()
1934 - def remove(self, *args, **opts):
1935 """Clean the jobson the cluster""" 1936 1937 if not self.submitted_ids: 1938 return 1939 for i in range(len(self.submitted_ids)): 1940 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i] 1941 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1942
1943 -class HTCaaS2Cluster(Cluster):
1944 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """ 1945 1946 name= 'htcaas2' 1947 job_id = 'HTCAAS2_JOBID' 1948 idle_tag = ['waiting'] 1949 running_tag = ['preparing','running'] 1950 complete_tag = ['done'] 1951 1952 @store_input() 1953 @multiple_try()
1954 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1955 log=None, input_files=[], output_files=[], required_output=[], 1956 nb_submit=0):
1957 1958 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1959 input/output file should be given as relative to CWD 1960 """ 1961 if cwd is None: 1962 cwd = os.getcwd() 1963 1964 if not os.path.exists(prog): 1965 prog = os.path.join(cwd, prog) 1966 1967 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1968 if cwd or prog : 1969 self.submitted_dirs.append(cwd) 1970 self.submitted_exes.append(prog) 1971 else: 1972 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog)) 1973 1974 if argument : 1975 self.submitted_args.append('='.join([str(a) for a in argument])) 1976 1977 if cwd or prog : 1978 self.submitted += 1 1979 id = self.submitted 1980 self.submitted_ids.append(id) 1981 else: 1982 logger.debug("cwd and prog are not exist! ") 1983 id = 0 1984 1985 else: 1986 temp_file_name = "sub."+ os.path.basename(prog) 1987 text = """#!/bin/bash 1988 MYPWD=%(cwd)s 1989 cd $MYPWD 1990 input_files=(%(input_files)s ) 1991 for i in ${input_files[@]} 1992 do 1993 chmod -f +x $i 1994 done 1995 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1996 """ 1997 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1998 'arguments': ' '.join([str(a) for a in argument]), 1999 'program': ' ' if '.py' in prog else 'bash'} 2000 # writing a new script for the submission 2001 new_prog = pjoin(cwd, temp_file_name) 2002 open(new_prog, 'w').write(text % dico) 2003 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 2004 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog] 2005 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 2006 id = a.stdout.read().strip() 2007 logger.debug("[mode2]-["+str(id)+"]") 2008 if cwd and prog : 2009 self.submitted += 1 2010 self.submitted_ids.append(id) 2011 else: 2012 logger.debug("cwd and prog are not exist! ") 2013 id = 0 2014 2015 return id
2016 2017 @multiple_try()
2018 - def metasubmit(self, me_dir=None):
2019          if self.submitted > 1100 and self.submitted == len(self.submitted_ids): 
2020              tmp_leng = len(self.submitted_ids) // 2 
2021              tmp_dirs1 = self.submitted_dirs[0:tmp_leng] 
2022              tmp_dirs2 = self.submitted_dirs[tmp_leng:] 
2023              tmp_exes1 = self.submitted_exes[0:tmp_leng] 
2024              tmp_exes2 = self.submitted_exes[tmp_leng:] 
2025              command1 = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in tmp_dirs1 if a and a != ' ']), 
2026                          '-e', ":".join([str(a) for a in tmp_exes1 if a and a != ' '])] 
2027              command2 = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in tmp_dirs2 if a and a != ' ']), 
2028                          '-e', ":".join([str(a) for a in tmp_exes2 if a and a != ' '])] 
2029              if len(self.submitted_args) > 0: 
2030                  tmp_args1 = self.submitted_args[0:tmp_leng] 
2031                  tmp_args2 = self.submitted_args[tmp_leng:] 
2032                  command1.extend(['-a', ':'.join([str(a) for a in tmp_args1])]) 
2033                  command2.extend(['-a', ':'.join([str(a) for a in tmp_args2])]) 
2034              result1 = misc.Popen(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) 
2035              result2 = misc.Popen(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) 
2036              me_dir = result1.stdout.read().decode().strip() + "//" + result2.stdout.read().decode().strip() 
2037  
2038          elif self.submitted > 0 and self.submitted == self.submitted_ids[-1]: 
2039              command = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in self.submitted_dirs if a and a != ' ']), 
2040                         '-e', ":".join([str(a) for a in self.submitted_exes if a and a != ' '])] 
2041              if len(self.submitted_args) > 0: 
2042                  command.extend(['-a', ':'.join([str(a) for a in self.submitted_args])]) 
2043              if self.submitted_dirs[0] or self.submitted_exes[0]: 
2044                  result = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) 
2045                  me_dir = result.stdout.read().decode().strip() 
2046                  self.submitted_ids[0] = me_dir 
2047              else: 
2048                  me_dir = self.submitted_ids[-1] 
2049          elif self.submitted > 0 and self.submitted != self.submitted_ids[-1]: 
2050              me_dir = self.submitted_ids[0] 
2051          else: 
2052              me_dir = -1 
2053  
2054          logger.debug("[" + str(me_dir) + "]") 
2055  
2056          self.submitted_dirs = [] 
2057          self.submitted_exes = [] 
2058          self.submitted_args = [] 
2059  
2060          return me_dir
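A standalone sketch of how metasubmit() merges the recorded directories, executables and arguments into a single htcaas-mgjob-submit command line; the paths and job names below are hypothetical and the command is only printed, not executed:

submitted_dirs = ['/tmp/run_01/G1', '/tmp/run_01/G2']
submitted_exes = ['ajob1', 'ajob2']
submitted_args = ['0=1', '0=2']

command = ['htcaas-mgjob-submit',
           '-d', ':'.join(str(d) for d in submitted_dirs if d and d != ' '),
           '-e', ':'.join(str(e) for e in submitted_exes if e and e != ' ')]
if submitted_args:
    command.extend(['-a', ':'.join(submitted_args)])
print(command)   # the single meta-job submission covering all recorded jobs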
2061 2062 2063 @multiple_try(nb_try=10, sleep=5)
2064 - def control_one_job(self, id):
2065 """ control the status of a single job with it's cluster id """ 2066 #logger.debug("CONTROL ONE JOB MODE") 2067 if self.submitted == self.submitted_ids[-1] : 2068 id = self.metasubmit(self) 2069 tempid = self.submitted_ids[-1] 2070 self.submitted_ids.remove(self.submitted_ids[-1]) 2071 self.submitted_ids.append(id) 2072 logger.debug(str(id)+" // "+str(self.submitted_ids[-1])) 2073 2074 if id == 0 : 2075 status_out ='C' 2076 else: 2077 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status " 2078 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE, 2079 stderr=subprocess.PIPE) 2080 error = status.stderr.read().decode() 2081 if status.returncode or error: 2082 raise ClusterManagmentError('htcaas-job-status returns error: %s' % error) 2083 status_out= status.stdout.read().decode().strip() 2084 status_out= status_out.split(":",1)[1] 2085 logger.debug("[["+str(id)+"]]"+status_out) 2086 if status_out == 'waiting': 2087 status_out='I' 2088 elif status_out == 'preparing' or status_out == 'running': 2089 status_out = 'R' 2090 elif status_out != 'done': 2091 status_out = 'F' 2092 elif status_out == 'done': 2093 status_out = 'C' 2094 self.submitted -= 1 2095 2096 return status_out
2097 2098 @multiple_try()
2099 - def control(self, me_dir):
2100 """ control the status of a single job with it's cluster id """ 2101 if not self.submitted_ids: 2102 logger.debug("self.submitted_ids not exists") 2103 return 0, 0, 0, 0 2104 2105 if "//" in me_dir : 2106 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) : 2107 start = me_dir.split("//")[0] 2108 end = me_dir.split("//")[1] 2109 else : 2110 start = me_dir.split("//")[1] 2111 end = me_dir.split("//")[0] 2112 elif "/" in me_dir : # update 2113 start = 0 2114 end = 0 2115 elif me_dir.isdigit(): 2116 start = me_dir 2117 end = me_dir 2118 elif not me_dir.isdigit(): 2119 me_dir = self.submitted_ids[0] 2120 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) ) 2121 2122 ongoing = [] 2123 idle, run, fail, done = 0, 0, 0, 0 2124 2125 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac" 2126 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 2127 2128 for line in status.stdout: 2129 line = line.decode() 2130 status2 = line.split()[-1] 2131 if status2 != 'null' or line.split()[0].strip() != '0': 2132 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip())) 2133 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2) 2134 2135 if status2 == 'null' or line.split()[0].strip() == '0': 2136 idle += 1 2137 elif status2 in self.idle_tag: 2138 idle += 1 2139 elif status2 in self.running_tag: 2140 run += 1 2141 elif status2 in self.complete_tag: 2142 done += 1 2143 self.submitted -= 1 2144 if not self.check_termination(line.split()[1]): 2145 idle +=1 2146 else: 2147 fail += 1 2148 2149 return idle, run, self.submitted - (idle+run+fail), fail
2150 2151 @multiple_try()
2152 - def remove(self, *args, **opts):
2153 """Clean the jobson the cluster""" 2154 2155 if not self.submitted_ids: 2156 return 2157 id = self.submitted_ids[0] 2158 if id: 2159 cmd = "htcaas-job-cancel -m %s" % str(id) 2160 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2161  
2162  from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster, 
2163              'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster, 
2164              'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster} 
2165  
2166  onecore = MultiCore(1)  # create a thread to run simple bash jobs without having to 
2167                          # fork the main process 
2168  
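A minimal sketch of how the from_name map is typically used to instantiate a handler from the configured cluster type; the opts dictionary below is a hypothetical example and covers only a subset of the real run options:

opts = {'cluster_type': 'slurm',
        'cluster_queue': 'madgraph',
        'cluster_nb_retry': 1,
        'cluster_retry_wait': 300}

cluster_cls = from_name[opts['cluster_type']]
cluster = cluster_cls(**opts)      # e.g. a SLURMCluster instance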