14 from __future__ import absolute_import
15 from __future__ import print_function
16 import subprocess
17 import logging
18 import os
19 import time
20 import re
21 import glob
22 import inspect
23 import sys
24 import six
25 from six.moves import range
26 from six.moves import input
27
28 logger = logging.getLogger('madgraph.cluster')
29
30 try:
31 from madgraph import MadGraph5Error
32 import madgraph.various.misc as misc
33 except Exception as error:
34 if __debug__:
35 print(str(error))
36 from internal import MadGraph5Error
37 import internal.misc as misc
38
39 pjoin = os.path.join
43
46
47
48 multiple_try = misc.multiple_try
49 pjoin = os.path.join
53
54 def deco_interupt(f):
55 def deco_f_interupt(self, *args, **opt):
56 try:
57 return f(self, *args, **opt)
58 except error:
59 try:
60 self.remove(*args, **opt)
61 except Exception:
62 pass
63 raise error
64 return deco_f_interupt
65 return deco_interupt
66
79 return deco_f_store
80 return deco_store
81
83 """ This function checks whether compression of input files are necessary
84 given the running options given. """
85
86 if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
87 return False
88 else:
89 return True
90
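# Illustrative sketch (not part of the original module): hypothetical option
# dictionaries fed to the helper above.  Cluster running (run_mode == 1) or an
# explicit scratch directory both force a file transfer:
#
#   {'run_mode': 1, 'cluster_temp_path': None}        -> True  (transfer needed)
#   {'run_mode': 2, 'cluster_temp_path': '/scratch'}  -> True  (transfer needed)
#   {'run_mode': 2, 'cluster_temp_path': None}        -> False (shared disk)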
92 """Basic Class for all cluster type submission"""
93 name = 'mother class'
94 identifier_length = 14
95
97 """Init the cluster"""
98
99 self.submitted = 0
100 self.submitted_ids = []
101 self.finish = 0
102 self.submitted_dirs = []
103 self.submitted_exes = []
104 self.submitted_args = []
105
106 if 'cluster_queue' in opts:
107 self.cluster_queue = opts['cluster_queue']
108 else:
109 self.cluster_queue = 'madgraph'
110 if 'cluster_temp_path' in opts:
111 self.temp_dir = opts['cluster_temp_path']
112 else:
113 self.temp_dir = None
114 self.options = {'cluster_status_update': (600, 30)}
115 for key,value in opts.items():
116 self.options[key] = value
117 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
118 self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300
119 self.options = dict(opts)
120 self.retry_args = {}
121
122 self.packet = {}
123 self.id_to_packet = {}
124
125 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
126 log=None, required_output=[], nb_submit=0):
127 """How to make one submission. Return status id on the cluster."""
128 raise NotImplementedError('No implementation of how to submit a job to cluster \'%s\'' % self.name)
129
130
131 @store_input()
132 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
133 log=None, input_files=[], output_files=[], required_output=[],
134 nb_submit=0):
135 """How to make one submission. Return status id on the cluster.
136 NO SHARED DISK"""
137
138 if cwd is None:
139 cwd = os.getcwd()
140 if not os.path.exists(prog):
141 prog = os.path.join(cwd, prog)
142
143 if not required_output and output_files:
144 required_output = output_files
145
146 if not hasattr(self, 'temp_dir') or not self.temp_dir or \
147 (input_files == [] == output_files):
148
149 return self.submit(prog, argument, cwd, stdout, stderr, log,
150 required_output=required_output, nb_submit=nb_submit)
151
152 if not input_files and not output_files:
153
154 return self.submit(prog, argument, cwd, stdout, stderr, log,
155 required_output=required_output, nb_submit=nb_submit)
156
157 if cwd is None:
158 cwd = os.getcwd()
159 if not os.path.exists(prog):
160 prog = os.path.join(cwd, prog)
161 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(str(a) for a in argument)
162
163 text = """#!/bin/bash
164 MYTMP=%(tmpdir)s/run$%(job_id)s
165 MYPWD=%(cwd)s
166 mkdir -p $MYTMP
167 cd $MYPWD
168 input_files=( %(input_files)s )
169 for i in ${input_files[@]}
170 do
171 cp -R -L $i $MYTMP
172 done
173 cd $MYTMP
174 echo '%(arguments)s' > arguments
175 chmod +x ./%(script)s
176 %(program)s ./%(script)s %(arguments)s
177 exit=$?
178 output_files=( %(output_files)s )
179 for i in ${output_files[@]}
180 do
181 cp -r $MYTMP/$i $MYPWD
182 done
183 # if [ "$exit" -eq "0" ]
184 # then
185 rm -rf $MYTMP
186 # fi
187 """
188
189 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog),
190 'cwd': cwd, 'job_id': self.job_id,
191 'input_files': ' '.join(input_files + [prog]),
192 'output_files': ' '.join(output_files),
193 'arguments': ' '.join([str(a) for a in argument]),
194 'program': ' ' if '.py' in prog else 'bash'}
195
196
197 new_prog = pjoin(cwd, temp_file_name)
198 open(new_prog, 'w').write(text % dico)
199 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
200
201 return self.submit(new_prog, argument, cwd, stdout, stderr, log,
202 required_output=required_output, nb_submit=nb_submit)
203
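# Condensed sketch of one instantiated wrapper (paths, job script and file
# names are hypothetical; the structure follows the template above):
#
#   #!/bin/bash
#   MYTMP=/scratch/tmp/run$PBS_JOBID
#   MYPWD=/home/user/PROC_x/SubProcesses/P0_gg_ttx
#   mkdir -p $MYTMP; cd $MYPWD
#   cp -R -L input_app.txt ajob1 $MYTMP     # stage the inputs in
#   cd $MYTMP; bash ./ajob1 1               # run the real job
#   cp -r $MYTMP/results.dat $MYPWD         # stage the outputs back
#   rm -rf $MYTMP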
204
205 - def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
206 log=None, input_files=[], output_files=[], required_output=[],
207 nb_submit=0, packet_member=None):
208 """This function wrap the cluster submition with cluster independant
209 method should not be overwritten (but for DAG type submission)"""
210
211 id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
212 output_files, required_output, nb_submit)
213
214
215 if not packet_member:
216 return id
217 else:
218 if isinstance(packet_member, Packet):
219 self.id_to_packet[id] = packet_member
220 packet_member.put(id)
221 if packet_member.tag not in self.packet:
222 self.packet[packet_member.tag] = packet_member
223 else:
224 if packet_member in self.packet:
225 packet = self.packet[packet_member]
226 packet.put(id)
227 self.id_to_packet[id] = packet
228 return id
229
231 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
232 if not self.submitted_ids:
233 raise NotImplementedError('No implementation of how to control the job status to cluster \'%s\'' % self.name)
234 idle, run, fail = 0, 0, 0
235 for pid in self.submitted_ids[:]:
236 status = self.control_one_job(pid)
237 if status == 'I':
238 idle += 1
239 elif status == 'R':
240 run += 1
241 elif status == 'F':
242 self.finish +=1
243 self.submitted_ids.remove(pid)
244 else:
245 fail += 1
246
247 return idle, run, self.finish, fail
248
250 """ control the status of a single job with it's cluster id """
251 raise NotImplemented('No implementation of how to control the job status to cluster \'%s\'' % self.name)
252
254 """get a unique run_name for all the jobs helps to identify the runs
255 in the controller for some cluster."""
256
257 if second_path:
258 path = os.path.realpath(pjoin(path, second_path))
259 elif not os.path.exists(path):
260 return path
261
262 if 'SubProcesses' in path:
263 target = path.rsplit('/SubProcesses',1)[0]
264 elif 'MCatNLO' in path:
265 target = path.rsplit('/MCatNLO',1)[0]
266 elif 'PY8_parallelization' in path:
267 target = path.rsplit('/PY8_parallelization',1)[0]
268 elif second_path:
269 target=path
270 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.")
271 else:
272 target = path
273
274 if target.endswith('/'):
275 target = target[:-1]
276
277 target = misc.digest(target.encode())[-self.identifier_length:]
278 if not target[0].isalpha():
279 target = 'a' + target[1:]
280
281 return target
282
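# Behaviour sketch (hypothetical path): for a job living under
#   /home/user/PROC_sm_0/SubProcesses/P0_gg_ttx
# the identifier is built from the digest of the process directory
# /home/user/PROC_sm_0, truncated to the last identifier_length characters
# and forced to start with a letter (e.g. something like 'a3f9c21d04b7e6').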
283
284 @check_interupt()
285 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
286 """Wait that all job are finish.
287 if minimal_job set, then return if idle + run is lower than that number"""
288
289
290 mode = 1
291 nb_iter = 0
292 nb_short = 0
293 change_at = 5
294
295 if update_first:
296 idle, run, finish, fail = self.control(me_dir)
297 update_first(idle, run, finish)
298
299
300 longtime, shorttime = self.options['cluster_status_update']
301
302 nb_job = 0
303
304 if self.options['cluster_type'] == 'htcaas2':
305 me_dir = self.metasubmit(self)
306
307 while 1:
308 old_mode = mode
309 nb_iter += 1
310 idle, run, finish, fail = self.control(me_dir)
311 if nb_job:
312 if idle + run + finish + fail != nb_job:
313 nb_job = idle + run + finish + fail
314 nb_iter = 1
315 else:
316 nb_job = idle + run + finish + fail
317 if fail:
318 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
319 if idle + run == 0:
320
321 logger.info('All jobs finished')
322 fct(idle, run, finish)
323 break
324 if idle + run < minimal_job:
325 return
326 fct(idle, run, finish)
327
328 if nb_iter < change_at:
329 mode = 1
330 elif idle < run:
331 if old_mode == 0:
332 if nb_short:
333 mode = 0
334
335 elif idle:
336 if nb_iter > change_at + int(longtime)//shorttime:
337 mode = 0
338 else:
339 mode = 1
340 nb_short =0
341 else:
342 mode = 1
343 nb_short = 0
344 elif old_mode == 1:
345 nb_short +=1
346 if nb_short > 3* max(change_at, int(longtime)//shorttime):
347 mode = 0
348 else:
349 mode = 0
350
351
352 if old_mode > mode:
353 logger.info('''Starting to wait %ss between status checks.
354 Note that you can change this time in the configuration file.
355 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])
356
357
358 if mode == 0:
359 try:
360 time.sleep(self.options['cluster_status_update'][0])
361 except KeyboardInterrupt:
362 logger.info('start to update the status')
363 nb_iter = min(0, change_at -2)
364 nb_short = 0
365 else:
366 time.sleep(self.options['cluster_status_update'][1])
367
368
369 self.submitted = 0
370 self.submitted_ids = []
371
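# Usage sketch (hypothetical callback): wait() drives a status callback until
# every submitted job has finished.
#
#   def update(idle, run, finish):
#       logger.info('idle: %s  running: %s  done: %s', idle, run, finish)
#
#   cluster.wait(me_dir, update)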
373 """Check the termination of the jobs with job_id and relaunch it if needed."""
374
375
376 if job_id not in self.retry_args:
377 if job_id in self.id_to_packet:
378 nb_in_packet = self.id_to_packet[job_id].remove_one()
379 if nb_in_packet == 0:
380
381 packet = self.id_to_packet[job_id]
382
383 packet.queue.join()
384
385 packet.fct(*packet.args)
386 del self.id_to_packet[job_id]
387 return 'resubmit'
388 else:
389 return True
390
391 args = self.retry_args[job_id]
392 if 'time_check' in args:
393 time_check = args['time_check']
394 else:
395 time_check = 0
396
397 for path in args['required_output']:
398 if args['cwd']:
399 path = pjoin(args['cwd'], path)
400
401 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
402 break
403 else:
404
405 if time_check > 0:
406 logger.info('Job %s: finally found the missing output.' % (job_id))
407 del self.retry_args[job_id]
408 self.submitted_ids.remove(job_id)
409
410 if job_id in self.id_to_packet:
411 nb_in_packet = self.id_to_packet[job_id].remove_one()
412 if nb_in_packet == 0:
413
414 packet = self.id_to_packet[job_id]
415
416 packet.queue.join()
417
418 packet.fct(*packet.args)
419 del self.id_to_packet[job_id]
420 return 'resubmit'
421
422 return 'done'
423
424 if time_check == 0:
425 logger.debug('''Job %s: missing output:%s''' % (job_id,path))
426 args['time_check'] = time.time()
427 return 'wait'
428 elif self.cluster_retry_wait > time.time() - time_check:
429 return 'wait'
430
431
432 if self.nb_retry < 0:
433 logger.critical('''Failed to run job %s correctly.
434 with option: %s
435 file missing: %s''' % (job_id, args, path))
436 input('press enter to continue.')
437 elif self.nb_retry == 0:
438 logger.critical('''Failed to run job %s correctly.
439 with option: %s
440 file missing: %s.
441 Stopping all runs.''' % (job_id, args, path))
442 self.remove()
443 elif args['nb_submit'] >= self.nb_retry:
444 logger.critical('''Failed to run job %s correctly.
445 with option: %s
446 file missing: %s
447 Failed %s times.
448 No resubmission. ''' % (job_id, args, path, args['nb_submit']))
449 self.remove()
450 else:
451 args['nb_submit'] += 1
452 logger.warning('resubmitting job (attempt %s)' % args['nb_submit'])
453 del self.retry_args[job_id]
454 self.submitted_ids.remove(job_id)
455 if 'time_check' in args:
456 del args['time_check']
457 if job_id in self.id_to_packet:
458 self.id_to_packet[job_id].remove_one()
459 args['packet_member'] = self.id_to_packet[job_id]
460 del self.id_to_packet[job_id]
461 self.cluster_submit(**args)
462 else:
463 self.submit2(**args)
464 return 'resubmit'
465 return 'done'
466
467 @check_interupt()
468 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
469 stderr=None, log=None, required_output=[], nb_submit=0,
470 input_files=[], output_files=[]):
471 """launch one job on the cluster and wait for it"""
472
473 special_output = False
474 if stderr == -2 and stdout:
475
476 special_output = True
477 stderr = stdout + '.err'
478
479 id = self.submit2(prog, argument, cwd, stdout, stderr, log,
480 required_output=required_output, input_files=input_files,
481 output_files=output_files)
482
483 if self.options['cluster_type']=='htcaas2':
484 if self.submitted == self.submitted_ids[-1]:
485 id = self.metasubmit(self)
486
487 frame = inspect.currentframe()
488 args, _, _, values = inspect.getargvalues(frame)
489 args = dict([(i, values[i]) for i in args if i != 'self'])
490 self.retry_args[id] = args
491
492 nb_wait=0
493 while 1:
494 nb_wait+=1
495 status = self.control_one_job(id)
496 if not status in ['R','I']:
497 status = self.check_termination(id)
498 if status in ['wait']:
499 time.sleep(30)
500 continue
501 elif status in ['resubmit']:
502 id = self.submitted_ids[0]
503 time.sleep(30)
504 continue
505
506 time.sleep(30)
507 break
508 time.sleep(self.options['cluster_status_update'][1])
509
510 if required_output:
511 status = self.check_termination(id)
512 if status == 'wait':
513 run += 1
514 elif status == 'resubmit':
515 idle += 1
516
517
518 if special_output:
519
520
521 for i in range(5):
522 if os.path.exists(stdout):
523 if not os.path.exists(stderr):
524 time.sleep(5)
525 if os.path.exists(stderr):
526 err_text = open(stderr).read()
527 if not err_text:
528 return
529 logger.warning(err_text)
530 text = open(stdout).read()
531 open(stdout,'w').write(text + err_text)
532 else:
533 return
534 time.sleep(10)
535
536 - def remove(self, *args, **opts):
537 """ """
538 logger.warning("""This cluster didn't support job removal,
539 the jobs are still running on the cluster.""")
540
541 @store_input()
545
547 """routine which allow to modify the run_card/mg5cmd object to change the
548 default behavior of the runs.
549 This is called at the time of the compilation of the run_card.
550 Note that this function can be called multiple times by run.
551 """
552
553 return
554
556 """ an object for handling packet of job, it is designed to be thread safe
557 """
558
559 - def __init__(self, name, fct, args, opts={}):
560 import six.moves.queue
561 import threading
562 self.queue = six.moves.queue.Queue()
563 self.tag = name
564 self.fct = fct
565 self.args = args
566 self.opts = opts
567 self.done = threading.Event()
568
569 - def put(self, *args, **opts):
571
572 append = put
573
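# Usage sketch (hypothetical callback and paths): a Packet groups jobs so that
# a finalisation function runs once every member has terminated.
#
#   def combine_results(channel_dir):
#       pass  # e.g. merge the partial results found in channel_dir
#
#   pack = Packet('G1', combine_results, ('/path/to/SubProcesses/G1',))
#   for job in range(4):
#       cluster.cluster_submit('ajob', argument=[job],
#                              cwd='/path/to/SubProcesses/G1',
#                              packet_member=pack)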
578
580 """class for dealing with the submission in multiple node"""
581
582 job_id = "$"
583
585 """Init the cluster """
586
587
588 super(MultiCore, self).__init__(*args, **opt)
589
590 import six.moves.queue
591 import threading
592 import six.moves._thread
593 self.queue = six.moves.queue.Queue()
594 self.done = six.moves.queue.Queue()
595 self.submitted = six.moves.queue.Queue()
596 self.stoprequest = threading.Event()
597 self.demons = []
598 self.nb_done =0
599 if 'nb_core' in opt:
600 self.nb_core = opt['nb_core']
601 elif isinstance(args[0],int):
602 self.nb_core = args[0]
603 else:
604 self.nb_core = 1
605 self.update_fct = None
606
607 self.lock = threading.Event()
608 self.pids = six.moves.queue.Queue()
609 self.done_pid = []
610 self.done_pid_queue = six.moves.queue.Queue()
611 self.fail_msg = None
612
613
614 for _ in range(self.nb_core):
615 self.start_demon()
616
617
619 import threading
620 t = threading.Thread(target=self.worker)
621 t.daemon = True
622 t.start()
623 self.demons.append(t)
624
625
627 import six.moves.queue
628 import six.moves._thread
629 while not self.stoprequest.isSet():
630 try:
631 args = self.queue.get()
632 tag, exe, arg, opt = args
633 try:
634
635 if isinstance(exe,str):
636 if os.path.exists(exe) and not exe.startswith('/'):
637 exe = './' + exe
638 if isinstance(opt['stdout'],str):
639 opt['stdout'] = open(opt['stdout'],'w')
640 if opt['stderr'] == None:
641 opt['stderr'] = subprocess.STDOUT
642 if arg:
643 proc = misc.Popen([exe] + arg, **opt)
644 else:
645 proc = misc.Popen(exe, **opt)
646 pid = proc.pid
647 self.pids.put(pid)
648 proc.wait()
649 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet():
650 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
651 (' '.join([exe]+arg), proc.returncode)
652 logger.warning(fail_msg)
653 self.stoprequest.set()
654 self.remove(fail_msg)
655
656
657
658
659 else:
660 pid = tag
661 self.pids.put(pid)
662
663
664 returncode = exe(*arg, **opt)
665 if returncode != 0:
666 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode)
667 self.stoprequest.set()
668 self.remove("fct %s does not return 0:\n %s" % (exe, returncode))
669 except Exception as error:
670 self.fail_msg = sys.exc_info()
671 logger.warning(str(error))
672 self.stoprequest.set()
673 self.remove(error)
674
675 if __debug__:
676 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2])
677
678 self.queue.task_done()
679 self.done.put(tag)
680 self.done_pid_queue.put(pid)
681
682 try:
683 self.lock.set()
684 except six.moves._thread.error:
685 continue
686 except six.moves.queue.Empty:
687 continue
688
689
690
691
692 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
693 log=None, required_output=[], nb_submit=0):
694 """submit a job on multicore machine"""
695
696 tag = (prog, tuple(argument), cwd, nb_submit)
697 if isinstance(prog, str):
698
699 opt = {'cwd': cwd,
700 'stdout':stdout,
701 'stderr': stderr}
702
703 self.queue.put((tag, prog, argument, opt))
704 self.submitted.put(1)
705 return tag
706 else:
707
708 self.queue.put((tag, prog, argument, {}))
709 self.submitted.put(1)
710 return tag
711
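# Usage sketch (hypothetical executable and callable): submit() accepts either
# an executable path or a Python function; wait() then drives the pool.
#
#   cores = MultiCore(nb_core=4)
#   cores.submit('./ajob1', argument=['1'], cwd='/path/SubProcesses/P0_gg_ttx')
#   cores.submit(some_python_function, argument=[1, 2])
#   cores.wait(me_dir, update_status)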
712 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
713 stderr=None, log=None, **opts):
714 """launch one job and wait for it"""
715 if isinstance(stdout, str):
716 stdout = open(stdout, 'w')
717 if isinstance(stderr, str):
718 stderr = open(stderr, 'w')
719 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
720
721 - def remove(self, error=None):
722 """Ensure that all thread are killed"""
723
724
725 self.stoprequest.set()
726 if error and not self.fail_msg:
727 self.fail_msg = error
728
729
730 while not self.done_pid_queue.empty():
731 pid = self.done_pid_queue.get()
732 self.done_pid.append(pid)
733
734
735 while not self.pids.empty():
736 pid = self.pids.get()
737 self.pids.task_done()
738 if isinstance(pid, tuple):
739 continue
740 if pid in self.done_pid:
741 continue
742 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
743 % {'pid':pid} )
744 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
745
746
747 - def wait(self, me_dir, update_status, update_first=None):
748 """Waiting that all the jobs are done. This function also control that
749 the submission by packet are handle correctly (i.e. submit the function)"""
750
751 import six.moves.queue
752 import threading
753
754 try:
755 last_status = (0, 0, 0)
756 sleep_time = 1
757 use_lock = True
758 first = True
759 while True:
760 force_one_more_loop = False
761
762
763
764 while self.done.qsize():
765 try:
766 tag = self.done.get(True, 1)
767 except six.moves.queue.Empty:
768 pass
769 else:
770 if self.id_to_packet and tuple(tag) in self.id_to_packet:
771 packet = self.id_to_packet[tuple(tag)]
772 remaining = packet.remove_one()
773 if remaining == 0:
774
775 packet.queue.join()
776 self.submit(packet.fct, packet.args)
777 force_one_more_loop = True
778 self.nb_done += 1
779 self.done.task_done()
780
781
782
783 Idle = self.queue.qsize()
784 Done = self.nb_done + self.done.qsize()
785 Running = max(0, self.submitted.qsize() - Idle - Done)
786
787 if Idle + Running <= 0 and not force_one_more_loop:
788 update_status(Idle, Running, Done)
789
790
791 self.queue.join()
792 break
793
794 if (Idle, Running, Done) != last_status:
795 if first and update_first:
796 update_first(Idle, Running, Done)
797 first = False
798 else:
799 update_status(Idle, Running, Done)
800 last_status = (Idle, Running, Done)
801
802
803 while not self.done_pid_queue.empty():
804 pid = self.done_pid_queue.get()
805 self.done_pid.append(pid)
806 self.done_pid_queue.task_done()
807
808
809
810 if use_lock:
811
812 use_lock = self.lock.wait(300)
813 self.lock.clear()
814 if not use_lock and Idle > 0:
815 use_lock = True
816 else:
817
818
819 time.sleep(sleep_time)
820 sleep_time = min(sleep_time + 2, 180)
821 if update_first:
822 update_first(Idle, Running, Done)
823
824 if self.stoprequest.isSet():
825 if isinstance(self.fail_msg, Exception):
826 raise self.fail_msg
827 elif isinstance(self.fail_msg, str):
828 raise Exception(self.fail_msg)
829 else:
830 misc.sprint(self.fail_msg)
831 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2])
832
833 try:
834 self.lock.clear()
835 except Exception:
836 pass
837 self.done = six.moves.queue.Queue()
838 self.done_pid = []
839 self.done_pid_queue = six.moves.queue.Queue()
840 self.nb_done = 0
841 self.submitted = six.moves.queue.Queue()
842 self.pids = six.moves.queue.Queue()
843 self.stoprequest.clear()
844
845 except KeyboardInterrupt:
846
847 if isinstance(self.fail_msg, Exception):
848 raise self.fail_msg
849 elif isinstance(self.fail_msg, str):
850 raise Exception(self.fail_msg)
851 elif self.fail_msg:
852 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2])
853
854 raise
855
857 """Basic class for dealing with cluster submission"""
858
859 name = 'condor'
860 job_id = 'CONDOR_ID'
861
862
863
864 @multiple_try()
865 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
866 required_output=[], nb_submit=0):
867 """Submit a job prog to a Condor cluster"""
868
869 text = """Executable = %(prog)s
870 output = %(stdout)s
871 error = %(stderr)s
872 log = %(log)s
873 %(argument)s
874 environment = CONDOR_ID=$(Cluster).$(Process)
875 Universe = vanilla
876 notification = Error
877 Initialdir = %(cwd)s
878 %(requirement)s
879 getenv=True
880 queue 1
881 """
882
883 if self.cluster_queue not in ['None', None]:
884 requirement = 'Requirements = %s=?=True' % self.cluster_queue
885 else:
886 requirement = ''
887
888 if cwd is None:
889 cwd = os.getcwd()
890 if stdout is None:
891 stdout = '/dev/null'
892 if stderr is None:
893 stderr = '/dev/null'
894 if log is None:
895 log = '/dev/null'
896 if not os.path.exists(prog):
897 prog = os.path.join(cwd, prog)
898 if argument:
899 argument = 'Arguments = %s' % ' '.join(argument)
900 else:
901 argument = ''
902
903
904 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
905 'stderr': stderr,'log': log,'argument': argument,
906 'requirement': requirement}
907
908
909 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE,
910 stdin=subprocess.PIPE)
911 output, _ = a.communicate((text % dico).encode())
912
913
914
915
916 pat = re.compile(r"submitted to cluster (\d*)", re.MULTILINE)
917 output = output.decode()
918 try:
919 id = pat.search(output).groups()[0]
920 except:
921 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
922 % output)
923 self.submitted += 1
924 self.submitted_ids.append(id)
925 return id
926
927 @store_input()
928 @multiple_try()
929 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
930 log=None, input_files=[], output_files=[], required_output=[],
931 nb_submit=0):
932 """Submit the job on the cluster NO SHARE DISK
933 input/output file should be give relative to cwd
934 """
935
936 if not required_output and output_files:
937 required_output = output_files
938
939 if (input_files == [] == output_files):
940 return self.submit(prog, argument, cwd, stdout, stderr, log,
941 required_output=required_output, nb_submit=nb_submit)
942
943 text = """Executable = %(prog)s
944 output = %(stdout)s
945 error = %(stderr)s
946 log = %(log)s
947 %(argument)s
948 should_transfer_files = YES
949 when_to_transfer_output = ON_EXIT
950 transfer_input_files = %(input_files)s
951 %(output_files)s
952 Universe = vanilla
953 notification = Error
954 Initialdir = %(cwd)s
955 %(requirement)s
956 getenv=True
957 queue 1
958 """
959
960 if self.cluster_queue not in ['None', None]:
961 requirement = 'Requirements = %s=?=True' % self.cluster_queue
962 else:
963 requirement = ''
964
965 if cwd is None:
966 cwd = os.getcwd()
967 if stdout is None:
968 stdout = '/dev/null'
969 if stderr is None:
970 stderr = '/dev/null'
971 if log is None:
972 log = '/dev/null'
973 if not os.path.exists(prog):
974 prog = os.path.join(cwd, prog)
975 if argument:
976 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
977 else:
978 argument = ''
979
980 if input_files:
981 input_files = ','.join(input_files)
982 else:
983 input_files = ''
984 if output_files:
985 output_files = 'transfer_output_files = %s' % ','.join(output_files)
986 else:
987 output_files = ''
988
989
990
991 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
992 'stderr': stderr,'log': log,'argument': argument,
993 'requirement': requirement, 'input_files':input_files,
994 'output_files':output_files}
995
996
997 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE,
998 stdin=subprocess.PIPE)
999 output, _ = a.communicate((text % dico).encode())
1000
1001
1002
1003
1004 output = output.decode()
1005 pat = re.compile(r"submitted to cluster (\d*)", re.MULTILINE)
1006 try:
1007 id = pat.search(output).groups()[0]
1008 except:
1009 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1010 % output)
1011 self.submitted += 1
1012 self.submitted_ids.append(id)
1013 return id
1014
1015
1016
1017
1018
1019 @multiple_try(nb_try=10, sleep=10)
1021 """ control the status of a single job with it's cluster id """
1022 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
1023 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1024 stderr=subprocess.PIPE)
1025
1026 error = status.stderr.read().decode()
1027 if status.returncode or error:
1028 raise ClusterManagmentError('condor_q returns error: %s' % error)
1029
1030 return status.stdout.readline().decode().strip()
1031
1032 jobstatus = {'0':'U', '1':'I','2':'R','3':'X','4':'C','5':'H','6':'E'}
1033 @check_interupt()
1034 @multiple_try(nb_try=10, sleep=10)
1036 """ control the status of a single job with it's cluster id """
1037
1038 if not self.submitted_ids:
1039 return 0, 0, 0, 0
1040
1041 packet = 15000
1042 idle, run, fail = 0, 0, 0
1043 ongoing = []
1044 for i in range(1+(len(self.submitted_ids)-1)//packet):
1045 start = i * packet
1046 stop = (i+1) * packet
1047 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
1048 " -format \"%d \" ClusterId " + \
1049 " -format \"%d\\n\" JobStatus "
1050
1051 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1052 stderr=subprocess.PIPE)
1053 error = status.stderr.read().decode()
1054 if status.returncode or error:
1055 raise ClusterManagmentError('condor_q returns error: %s' % error)
1056
1057 for line in status.stdout:
1058 id, status = line.decode().strip().split()
1059 status = self.jobstatus[status]
1060 ongoing.append(id)
1061 if status in ['I','U']:
1062 idle += 1
1063 elif status == 'R':
1064 run += 1
1065 elif status != 'C':
1066 fail += 1
1067
1068 for id in list(self.submitted_ids):
1069 if id not in ongoing:
1070 status = self.check_termination(id)
1071 if status == 'wait':
1072 run += 1
1073 elif status == 'resubmit':
1074 idle += 1
1075
1076 return idle, run, self.submitted - (idle+run+fail), fail
1077
1078 @multiple_try()
1079 - def remove(self, *args, **opts):
1080 """Clean the jobson the cluster"""
1081
1082 if not self.submitted_ids:
1083 return
1084 cmd = "condor_rm %s" % ' '.join(self.submitted_ids)
1085
1086 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1087 self.submitted_ids = []
1088
1090 """Basic class for dealing with cluster submission"""
1091
1092 name = 'pbs'
1093 job_id = 'PBS_JOBID'
1094 idle_tag = ['Q']
1095 running_tag = ['T','E','R']
1096 complete_tag = ['C']
1097
1098 maximum_submited_jobs = 2500
1099
1100 @multiple_try()
1101 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1102 required_output=[], nb_submit=0):
1103 """Submit a job prog to a PBS cluster"""
1104
1105 me_dir = self.get_jobs_identifier(cwd, prog)
1106
1107 if len(self.submitted_ids) > self.maximum_submited_jobs:
1108 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
1109 self.wait(me_dir, fct, self.maximum_submited_jobs)
1110
1111
1112 text = ""
1113 if cwd is None:
1114 cwd = os.getcwd()
1115 else:
1116 text = " cd %s;" % cwd
1117 if stdout is None:
1118 stdout = '/dev/null'
1119 if stderr is None:
1120 stderr = '/dev/null'
1121 elif stderr == -2:
1122 stderr = stdout
1123 if log is None:
1124 log = '/dev/null'
1125
1126 if not os.path.isabs(prog):
1127 text += "./%s" % prog
1128 else:
1129 text+= prog
1130
1131 if argument:
1132 text += ' ' + ' '.join(argument)
1133
1134 command = ['qsub','-o', stdout,
1135 '-N', me_dir,
1136 '-e', stderr,
1137 '-V']
1138
1139 if self.cluster_queue and self.cluster_queue != 'None':
1140 command.extend(['-q', self.cluster_queue])
1141
1142 a = misc.Popen(command, stdout=subprocess.PIPE,
1143 stderr=subprocess.STDOUT,
1144 stdin=subprocess.PIPE, cwd=cwd)
1145
1146 output = a.communicate(text.encode())[0].decode()
1147 id = output.split('.')[0]
1148 if not id.isdigit() or a.returncode !=0:
1149 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1150 % output)
1151
1152 self.submitted += 1
1153 self.submitted_ids.append(id)
1154 return id
1155
1156 @multiple_try()
1158 """ control the status of a single job with it's cluster id """
1159 cmd = 'qstat '+str(id)
1160 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1161 stderr=subprocess.STDOUT)
1162
1163 for line in status.stdout:
1164 line = line.decode().strip()
1165 if 'cannot connect to server' in line or 'cannot read reply' in line:
1166 raise ClusterManagmentError('server disconnected')
1167 if 'Unknown' in line:
1168 return 'F'
1169 elif line.startswith(str(id)):
1170 jobstatus = line.split()[4]
1171 else:
1172 jobstatus=""
1173
1174 if status.returncode != 0 and status.returncode is not None:
1175 raise ClusterManagmentError('server failed in some way (error code %s)' % status.returncode)
1176 if jobstatus in self.idle_tag:
1177 return 'I'
1178 elif jobstatus in self.running_tag:
1179 return 'R'
1180 return 'F'
1181
1182
1183 @multiple_try()
1185 """ control the status of a single job with it's cluster id """
1186 cmd = "qstat"
1187 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1188
1189 me_dir = self.get_jobs_identifier(me_dir)
1190
1191 ongoing = []
1192
1193 idle, run, fail = 0, 0, 0
1194 for line in status.stdout:
1195 line = line.decode()
1196 if 'cannot connect to server' in line or 'cannot read reply' in line:
1197 raise ClusterManagmentError('server disconnected')
1198 if me_dir in line:
1199 ongoing.append(line.split()[0].split('.')[0])
1200 status2 = line.split()[4]
1201 if status2 in self.idle_tag:
1202 idle += 1
1203 elif status2 in self.running_tag:
1204 run += 1
1205 elif status2 in self.complete_tag:
1206 if not self.check_termination(line.split()[0].split('.')[0]):
1207 idle += 1
1208 else:
1209 fail += 1
1210
1211 if status.returncode != 0 and status.returncode is not None:
1212 raise ClusterManagmentError('server failed in some way (error code %s)' % status.returncode)
1213
1214 for id in list(self.submitted_ids):
1215 if id not in ongoing:
1216 status2 = self.check_termination(id)
1217 if status2 == 'wait':
1218 run += 1
1219 elif status2 == 'resubmit':
1220 idle += 1
1221
1222 return idle, run, self.submitted - (idle+run+fail), fail
1223
1224 @multiple_try()
1225 - def remove(self, *args, **opts):
1226 """Clean the jobs on the cluster"""
1227
1228 if not self.submitted_ids:
1229 return
1230 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1231 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1232 self.submitted_ids = []
1233
1236 """Basic class for dealing with cluster submission"""
1237
1238
1239 name = 'sge'
1240 job_id = 'JOB_ID'
1241 idle_tag = ['qw', 'hqw','hRqw','w']
1242 running_tag = ['r','t','Rr','Rt']
1243 identifier_length = 10
1244
1246 """replace string for path issues"""
1247 location = os.path.realpath(location)
1248 homePath = os.getenv("HOME")
1249 if homePath:
1250 location = location.replace(homePath,'$HOME')
1251 return location
1252
1253 @multiple_try()
1254 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1255 required_output=[], nb_submit=0):
1256 """Submit a job prog to an SGE cluster"""
1257
1258 me_dir = self.get_jobs_identifier(cwd, prog)
1259
1260
1261 if cwd is None:
1262
1263 cwd = self.def_get_path(os.getcwd())
1264 cwd1 = self.def_get_path(cwd)
1265 text = " cd %s;" % cwd1
1266 if stdout is None:
1267 stdout = '/dev/null'
1268 else:
1269 stdout = self.def_get_path(stdout)
1270 if stderr is None:
1271 stderr = '/dev/null'
1272 elif stderr == -2:
1273 stderr = stdout
1274 else:
1275 stderr = self.def_get_path(stderr)
1276
1277 if log is None:
1278 log = '/dev/null'
1279 else:
1280 log = self.def_get_path(log)
1281
1282 text += prog
1283 if argument:
1284 text += ' ' + ' '.join(argument)
1285
1286
1287
1288
1289 homePath = os.getenv("HOME")
1290 if homePath:
1291 text = text.replace(homePath,'$HOME')
1292
1293 logger.debug("!=== input %s" % text)
1294 logger.debug("!=== output %s" % stdout)
1295 logger.debug("!=== error %s" % stderr)
1296 logger.debug("!=== logs %s" % log)
1297
1298 command = ['qsub','-o', stdout,
1299 '-N', me_dir,
1300 '-e', stderr,
1301 '-V']
1302
1303 if self.cluster_queue and self.cluster_queue != 'None':
1304 command.extend(['-q', self.cluster_queue])
1305
1306 a = misc.Popen(command, stdout=subprocess.PIPE,
1307 stderr=subprocess.STDOUT,
1308 stdin=subprocess.PIPE, cwd=cwd)
1309
1310 output = a.communicate(text.encode())[0].decode()
1311 id = output.split(' ')[2]
1312 if not id.isdigit():
1313 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1314 % output)
1315 self.submitted += 1
1316 self.submitted_ids.append(id)
1317 logger.debug(output)
1318
1319 return id
1320
1321 @multiple_try()
1323 """ control the status of a single job with it's cluster id """
1324
1325 cmd = 'qstat '
1326 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1327 for line in status.stdout:
1328 line = line.decode()
1329
1330
1331
1332
1333
1334
1335 if str(id) in line:
1336 status = line.split()[4]
1337
1338 if status in self.idle_tag:
1339 return 'I'
1340 elif status in self.running_tag:
1341 return 'R'
1342 return 'F'
1343
1344 @multiple_try()
1346 """ control the status of a single job with it's cluster id """
1347 cmd = "qstat "
1348 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1349
1350 me_dir = self.get_jobs_identifier(me_dir)
1351
1352 finished = list(self.submitted_ids)
1353
1354 idle, run, fail = 0, 0, 0
1355 for line in status.stdout:
1356 line = line.decode()
1357 if me_dir in line:
1358 id,_,_,_,status = line.split()[:5]
1359 if status in self.idle_tag:
1360 idle += 1
1361 finished.remove(id)
1362 elif status in self.running_tag:
1363 run += 1
1364 finished.remove(id)
1365 else:
1366 logger.debug(line)
1367 fail += 1
1368 finished.remove(id)
1369
1370 for id in finished:
1371 self.check_termination(id)
1372
1373 return idle, run, self.submitted - (idle+run+fail), fail
1374
1375
1376
1377 @multiple_try()
1378 - def remove(self, *args, **opts):
1379 """Clean the jobs on the cluster"""
1380
1381 if not self.submitted_ids:
1382 return
1383 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1384 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1385 self.submitted_ids = []
1386
1389 """Basic class for dealing with cluster submission"""
1390
1391 name = 'lsf'
1392 job_id = 'LSB_JOBID'
1393
1394 @multiple_try()
1395 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1396 required_output=[], nb_submit=0):
1397 """Submit the job prog to an LSF cluster"""
1398
1399
1400 me_dir = self.get_jobs_identifier(cwd, prog)
1401
1402 text = ""
1403 command = ['bsub', '-C0', '-J', me_dir]
1404 if cwd is None:
1405 cwd = os.getcwd()
1406 else:
1407 text = " cd %s;" % cwd
1408 if stdout and isinstance(stdout, str):
1409 command.extend(['-o', stdout])
1410 if stderr and isinstance(stderr, str):
1411 command.extend(['-e', stderr])
1412 elif stderr == -2:
1413 pass
1414 if log is None:
1415 log = '/dev/null'
1416
1417 text += prog
1418 if argument:
1419 text += ' ' + ' '.join(argument)
1420
1421 if self.cluster_queue and self.cluster_queue != 'None':
1422 command.extend(['-q', self.cluster_queue])
1423
1424 a = misc.Popen(command, stdout=subprocess.PIPE,
1425 stderr=subprocess.STDOUT,
1426 stdin=subprocess.PIPE, cwd=cwd)
1427
1428 output = a.communicate(text.encode())[0].decode()
1429
1430 try:
1431 id = output.split('>',1)[0].split('<')[1]
1432 except:
1433 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1434 % output)
1435 if not id.isdigit():
1436 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1437 % output)
1438 self.submitted += 1
1439 self.submitted_ids.append(id)
1440 return id
1441
1442
1443 @multiple_try()
1445 """ control the status of a single job with it's cluster id """
1446
1447 cmd = 'bjobs '+str(id)
1448 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1449
1450 for line in status.stdout:
1451 line = line.decode().strip().upper()
1452 if 'JOBID' in line:
1453 continue
1454 elif str(id) not in line:
1455 continue
1456 status = line.split()[2]
1457 if status == 'RUN':
1458 return 'R'
1459 elif status == 'PEND':
1460 return 'I'
1461 elif status == 'DONE':
1462 return 'F'
1463 else:
1464 return 'H'
1465 return 'F'
1466
1467 @multiple_try()
1469 """ control the status of a single job with it's cluster id """
1470
1471 if not self.submitted_ids:
1472 return 0, 0, 0, 0
1473
1474 cmd = "bjobs " + ' '.join(self.submitted_ids)
1475 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1476
1477 jobstatus = {}
1478 for line in status.stdout:
1479 line = line.decode().strip()
1480 if 'JOBID' in line:
1481 continue
1482 splitline = line.split()
1483 id = splitline[0]
1484 if id not in self.submitted_ids:
1485 continue
1486 jobstatus[id] = splitline[2]
1487
1488 idle, run, fail = 0, 0, 0
1489 for id in self.submitted_ids[:]:
1490 if id in jobstatus:
1491 status = jobstatus[id]
1492 else:
1493 status = 'MISSING'
1494 if status == 'RUN':
1495 run += 1
1496 elif status == 'PEND':
1497 idle += 1
1498 else:
1499 status = self.check_termination(id)
1500 if status == 'wait':
1501 run += 1
1502 elif status == 'resubmit':
1503 idle += 1
1504
1505 return idle, run, self.submitted - (idle+run+fail), fail
1506
1507 @multiple_try()
1508 - def remove(self, *args,**opts):
1509 """Clean the jobs on the cluster"""
1510
1511 if not self.submitted_ids:
1512 return
1513 cmd = "bkill %s" % ' '.join(self.submitted_ids)
1514 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1515 self.submitted_ids = []
1516
1518 """Class for dealing with cluster submission on a GE cluster"""
1519
1520 name = 'ge'
1521 job_id = 'JOB_ID'
1522 idle_tag = ['qw']
1523 running_tag = ['r']
1524
1525 @multiple_try()
1526 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1527 required_output=[], nb_submit=0):
1528 """Submit a job prog to a GE cluster"""
1529
1530 text = ""
1531 if cwd is None:
1532 cwd = os.getcwd()
1533 else:
1534 text = " cd %s; bash " % cwd
1535 if stdout is None:
1536 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
1537 if stderr is None:
1538 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
1539 elif stderr == -2:
1540 stderr = stdout
1541 if log is None:
1542 log = '/dev/null'
1543
1544 text += prog
1545 if argument:
1546 text += ' ' + ' '.join(argument)
1547 text += '\n'
1548 tmp_submit = os.path.join(cwd, 'tmp_submit')
1549 open(tmp_submit,'w').write(text)
1550
1551 a = misc.Popen(['qsub','-o', stdout,
1552 '-e', stderr,
1553 tmp_submit],
1554 stdout=subprocess.PIPE,
1555 stderr=subprocess.STDOUT,
1556 stdin=subprocess.PIPE, cwd=cwd)
1557
1558 output = a.communicate()[0].decode()
1559
1560 pat = re.compile(r"Your job (\d*) \(", re.MULTILINE)
1561 try:
1562 id = pat.search(output).groups()[0]
1563 except:
1564 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1565 % output)
1566 self.submitted += 1
1567 self.submitted_ids.append(id)
1568 return id
1569
1570 @multiple_try()
1572 """ control the status of a single job with it's cluster id """
1573 cmd = 'qstat | grep '+str(id)
1574 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1575 if not status:
1576 return 'F'
1577
1578 pat = re.compile(r"^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
1579 stat = ''
1580 for line in status.stdout.read().decode().split('\n'):
1581 if not line:
1582 continue
1583 line = line.strip()
1584 try:
1585 groups = pat.search(line).groups()
1586 except:
1587 raise ClusterManagmentError('bad syntax for stat: \n\"%s\"' % line)
1588 if groups[0] != id: continue
1589 stat = groups[1]
1590 if not stat:
1591 return 'F'
1592 if stat in self.idle_tag:
1593 return 'I'
1594 if stat in self.running_tag:
1595 return 'R'
1596
1597 @multiple_try()
1599 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
1600 if not self.submitted_ids:
1601 return 0, 0, 0, 0
1602 idle, run, fail = 0, 0, 0
1603 ongoing = []
1604 for statusflag in ['p', 'r', 'sh']:
1605 cmd = 'qstat -s %s' % statusflag
1606 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1607
1608 pat = re.compile(r"^(\d+)")
1609 for line in status.stdout.read().decode().split('\n'):
1610 line = line.strip()
1611 try:
1612 id = pat.search(line).groups()[0]
1613 except Exception:
1614 pass
1615 else:
1616 if id not in self.submitted_ids:
1617 continue
1618 ongoing.append(id)
1619 if statusflag == 'p':
1620 idle += 1
1621 if statusflag == 'r':
1622 run += 1
1623 if statusflag == 'sh':
1624 fail += 1
1625 for id in list(self.submitted_ids):
1626 if id not in ongoing:
1627 self.check_termination(id)
1628
1629
1630 return idle, run, self.submitted - idle - run - fail, fail
1631
1632 @multiple_try()
1633 - def remove(self, *args, **opts):
1634 """Clean the jobs on the cluster"""
1635
1636 if not self.submitted_ids:
1637 return
1638 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1639 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1640 self.submitted_ids = []
1641
1643 """start a computation and not wait for it to finish.
1644 this fonction returns a lock which is locked as long as the job is
1645 running."""
1646
1647 mc = MultiCore(1)
1648 mc.submit(exe, argument, cwd, stdout, **opt)
1649 mc.need_waiting = True
1650 return mc.lock
1651
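# Usage sketch (hypothetical executable; the def line of the helper above is
# elided here, so its name is assumed): the returned lock is a threading.Event
# that is set once the background job has finished.
#
#   lock = asyncrone_launch('./compile.sh', cwd='/path/Source')
#   ...                     # do other work
#   lock.wait()             # block until the background job is done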
1654 """Basic class for dealing with cluster submission"""
1655
1656 name = 'slurm'
1657 job_id = 'SLURM_JOBID'
1658 idle_tag = ['Q','PD','S','CF']
1659 running_tag = ['R', 'CG']
1660 complete_tag = ['C']
1661 identifier_length = 8
1662
1663 @multiple_try()
1664 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1665 required_output=[], nb_submit=0):
1666 """Submit a job prog to a SLURM cluster"""
1667
1668 me_dir = self.get_jobs_identifier(cwd, prog)
1669
1670
1671 if cwd is None:
1672 cwd = os.getcwd()
1673 if stdout is None:
1674 stdout = '/dev/null'
1675 if stderr is None:
1676 stderr = '/dev/null'
1677 elif stderr == -2:
1678 stderr = stdout
1679 if log is None:
1680 log = '/dev/null'
1681
1682 command = ['sbatch', '-o', stdout,
1683 '-J', me_dir,
1684 '-e', stderr, prog] + argument
1685
1686 if self.cluster_queue and self.cluster_queue != 'None':
1687 command.insert(1, '-p')
1688 command.insert(2, self.cluster_queue)
1689
1690 a = misc.Popen(command, stdout=subprocess.PIPE,
1691 stderr=subprocess.STDOUT,
1692 stdin=subprocess.PIPE, cwd=cwd)
1693
1694 output = a.communicate()
1695 output_arr = output[0].decode().split(' ')
1696 id = output_arr[3].rstrip()
1697
1698 if not id.isdigit():
1699 id = re.findall(r'Submitted batch job ([\d\.]+)', ' '.join(output_arr))
1700
1701 if not id or len(id)>1:
1702 raise ClusterManagmentError( 'fail to submit to the cluster: \n%s' \
1703 % ('stdout: %s\nstderr %s' %(output[0],output[1])))
1704 id = id[0]
1705
1706
1707 self.submitted += 1
1708 self.submitted_ids.append(id)
1709 return id
1710
1711 @multiple_try()
1713 """ control the status of a single job with it's cluster id """
1714 cmd = 'squeue -j '+str(id)
1715 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1716 stderr=open(os.devnull,'w'))
1717
1718 for line in status.stdout:
1719 line = line.decode().strip()
1720 if 'Invalid' in line:
1721 return 'F'
1722 elif line.startswith(str(id)):
1723 status = line.split()[4]
1724 if status in self.idle_tag:
1725 return 'I'
1726 elif status in self.running_tag:
1727 return 'R'
1728 return 'F'
1729
1730 @multiple_try()
1732 """ control the status of a single job with it's cluster id """
1733 cmd = "squeue"
1734 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE)
1735
1736 me_dir = self.get_jobs_identifier(me_dir)
1737
1738 idle, run, fail = 0, 0, 0
1739 ongoing=[]
1740 for line in pstatus.stdout:
1741 line = line.decode()
1742 if me_dir in line:
1743 id, _, _,_ , status,_ = line.split(None,5)
1744 ongoing.append(id)
1745 if status in self.idle_tag:
1746 idle += 1
1747 elif status in self.running_tag:
1748 run += 1
1749 elif status in self.complete_tag:
1750 status = self.check_termination(id)
1751 if status == 'wait':
1752 run += 1
1753 elif status == 'resubmit':
1754 idle += 1
1755 else:
1756 fail += 1
1757
1758
1759 for id in list(self.submitted_ids):
1760 if id not in ongoing:
1761 status = self.check_termination(id)
1762 if status == 'wait':
1763 run += 1
1764 elif status == 'resubmit':
1765 idle += 1
1766
1767
1768 return idle, run, self.submitted - (idle+run+fail), fail
1769
1770 @multiple_try()
1771 - def remove(self, *args, **opts):
1772 """Clean the jobs on the cluster"""
1773
1774 if not self.submitted_ids:
1775 return
1776 cmd = "scancel %s" % ' '.join(self.submitted_ids)
1777 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1778 self.submitted_ids = []
1779
1781 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """
1782
1783 name= 'htcaas'
1784 job_id = 'HTCAAS_JOBID'
1785 idle_tag = ['waiting']
1786 running_tag = ['preparing','running']
1787 complete_tag = ['done']
1788
1789 @store_input()
1790 @multiple_try()
1791 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1792 log=None, input_files=[], output_files=[], required_output=[],
1793 nb_submit=0):
1794 """Submit the HTCaaS job on the cluster with NO SHARE DISK
1795 input/output file should be given as relative to CWd
1796 """
1797
1798 cur_usr = os.getenv('USER')
1799
1800 if cwd is None:
1801 cwd = os.getcwd()
1802
1803 cwd_cp = cwd.rsplit("/",2)
1804
1805 if not stdout is None:
1806 print("stdout: %s" % stdout)
1807
1808 if not os.path.exists(prog):
1809 prog = os.path.join(cwd, prog)
1810
1811 if not required_output and output_files:
1812 required_output = output_files
1813
1814 logger.debug(prog)
1815 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1816 cwd_arg = cwd+"/arguments"
1817 temp = ' '.join([str(a) for a in argument])
1818 arg_cmd="echo '"+temp+"' > " + cwd_arg
1819 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)]
1820 if argument :
1821 command.extend(['-a ', '='.join([str(a) for a in argument])])
1822 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1823 id = a.stdout.read().decode().strip()
1824
1825 else:
1826 cwd_arg = cwd+"/arguments"
1827 temp = ' '.join([str(a) for a in argument])
1828 temp_file_name = "sub." + os.path.basename(prog)
1829 text = """#!/bin/bash
1830 MYPWD=%(cwd)s
1831 cd $MYPWD
1832 input_files=(%(input_files)s )
1833 for i in ${input_files[@]}
1834 do
1835 chmod -f +x $i
1836 done
1837 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1838 """
1839 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1840 'arguments': ' '.join([str(a) for a in argument]),
1841 'program': ' ' if '.py' in prog else 'bash'}
1842
1843
1844 new_prog = pjoin(cwd, temp_file_name)
1845 open(new_prog, 'w').write(text % dico)
1846 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1847 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name]
1848 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1849 id = a.stdout.read().decode().strip()
1850 logger.debug(id)
1851
1852 nb_try=0
1853 nb_limit=5
1854 if not id.isdigit() :
1855 print("[ID is not digit]:" + id)
1856
1857 while not id.isdigit() :
1858 nb_try+=1
1859 print("[fail_retry]:"+ nb_try)
1860 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1861 id = a.stdout.read().decode().strip()
1862 if nb_try > nb_limit :
1863 raise ClusterManagmentError('fail to submit to the HTCaaS cluster: \n %s' % id)
1864 break
1865
1866 self.submitted += 1
1867 self.submitted_ids.append(id)
1868
1869 return id
1870
1871 @multiple_try(nb_try=10, sleep=5)
1873 """ control the status of a single job with it's cluster id """
1874
1875 if id == 0 :
1876 status_out ='C'
1877 else :
1878 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
1879 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE,
1880 stderr=subprocess.PIPE)
1881 error = status.stderr.read().decode()
1882 if status.returncode or error:
1883 raise ClusterManagmentError('htcaas-job-submit returns error: %s' % error)
1884 status_out= status.stdout.read().decode().strip()
1885 status_out= status_out.split(":",1)[1]
1886 if status_out == 'waiting':
1887 status_out='I'
1888 elif status_out == 'preparing' or status_out == 'running':
1889 status_out = 'R'
1890 elif status_out != 'done':
1891 status_out = 'F'
1892 elif status_out == 'done':
1893 status_out = 'C'
1894
1895 return status_out
1896
1897 @multiple_try()
1899 """ control the status of a single job with it's cluster id """
1900 if not self.submitted_ids:
1901 logger.debug("self.submitted_ids not exists")
1902 return 0, 0, 0, 0
1903
1904 ongoing = []
1905 idle, run, fail = 0, 0, 0
1906
1907 start = self.submitted_ids[0]
1908 end = self.submitted_ids[-1]
1909
1910 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)
1911 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1912
1913 for line in status.stdout:
1914 line = line.decode()
1915 status2 = line.split()[-1]
1916 if status2 != 'null' or line.split()[0].strip() != '0':
1917 ongoing.append(line.split()[0].strip())
1918 logger.debug("["+line.split()[0].strip()+"]"+status2)
1919 if status2 == 'null' or line.split()[0].strip() == '0':
1920 idle += 1
1921 elif status2 in self.idle_tag:
1922 idle += 1
1923 elif status2 in self.running_tag:
1924 run += 1
1925 elif status2 in self.complete_tag:
1926 if not self.check_termination(line.split()[0]):
1927 idle +=1
1928 else:
1929 fail += 1
1930
1931 return idle, run, self.submitted - (idle+run+fail), fail
1932
1933 @multiple_try()
1934 - def remove(self, *args, **opts):
1935 """Clean the jobson the cluster"""
1936
1937 if not self.submitted_ids:
1938 return
1939 for i in range(len(self.submitted_ids)):
1940 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
1941 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1942
1944 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """
1945
1946 name= 'htcaas2'
1947 job_id = 'HTCAAS2_JOBID'
1948 idle_tag = ['waiting']
1949 running_tag = ['preparing','running']
1950 complete_tag = ['done']
1951
1952 @store_input()
1953 @multiple_try()
1954 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1955 log=None, input_files=[], output_files=[], required_output=[],
1956 nb_submit=0):
1957
1958 """Submit the HTCaaS job on the cluster with NO SHARE DISK
1959 input/output file should be given as relative to CWD
1960 """
1961 if cwd is None:
1962 cwd = os.getcwd()
1963
1964 if not os.path.exists(prog):
1965 prog = os.path.join(cwd, prog)
1966
1967 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1968 if cwd or prog :
1969 self.submitted_dirs.append(cwd)
1970 self.submitted_exes.append(prog)
1971 else:
1972 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog))
1973
1974 if argument :
1975 self.submitted_args.append('='.join([str(a) for a in argument]))
1976
1977 if cwd or prog :
1978 self.submitted += 1
1979 id = self.submitted
1980 self.submitted_ids.append(id)
1981 else:
1982 logger.debug("cwd and prog are not exist! ")
1983 id = 0
1984
1985 else:
1986 temp_file_name = "sub."+ os.path.basename(prog)
1987 text = """#!/bin/bash
1988 MYPWD=%(cwd)s
1989 cd $MYPWD
1990 input_files=(%(input_files)s )
1991 for i in ${input_files[@]}
1992 do
1993 chmod -f +x $i
1994 done
1995 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1996 """
1997 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1998 'arguments': ' '.join([str(a) for a in argument]),
1999 'program': ' ' if '.py' in prog else 'bash'}
2000
2001 new_prog = pjoin(cwd, temp_file_name)
2002 open(new_prog, 'w').write(text % dico)
2003 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
2004 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog]
2005 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
2006 id = a.stdout.read().decode().strip()
2007 logger.debug("[mode2]-["+str(id)+"]")
2008 if cwd and prog :
2009 self.submitted += 1
2010 self.submitted_ids.append(id)
2011 else:
2012 logger.debug("cwd and prog are not exist! ")
2013 id = 0
2014
2015 return id
2016
2017 @multiple_try()
2061
2062
2063 @multiple_try(nb_try=10, sleep=5)
2065 """ control the status of a single job with it's cluster id """
2066
2067 if self.submitted == self.submitted_ids[-1] :
2068 id = self.metasubmit(self)
2069 tempid = self.submitted_ids[-1]
2070 self.submitted_ids.remove(self.submitted_ids[-1])
2071 self.submitted_ids.append(id)
2072 logger.debug(str(id)+" // "+str(self.submitted_ids[-1]))
2073
2074 if id == 0 :
2075 status_out ='C'
2076 else:
2077 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status "
2078 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE,
2079 stderr=subprocess.PIPE)
2080 error = status.stderr.read().decode()
2081 if status.returncode or error:
2082 raise ClusterManagmentError('htcaas-job-status returns error: %s' % error)
2083 status_out= status.stdout.read().decode().strip()
2084 status_out= status_out.split(":",1)[1]
2085 logger.debug("[["+str(id)+"]]"+status_out)
2086 if status_out == 'waiting':
2087 status_out='I'
2088 elif status_out == 'preparing' or status_out == 'running':
2089 status_out = 'R'
2090 elif status_out != 'done':
2091 status_out = 'F'
2092 elif status_out == 'done':
2093 status_out = 'C'
2094 self.submitted -= 1
2095
2096 return status_out
2097
2098 @multiple_try()
2100 """ control the status of a single job with it's cluster id """
2101 if not self.submitted_ids:
2102 logger.debug("self.submitted_ids not exists")
2103 return 0, 0, 0, 0
2104
2105 if "//" in me_dir :
2106 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) :
2107 start = me_dir.split("//")[0]
2108 end = me_dir.split("//")[1]
2109 else :
2110 start = me_dir.split("//")[1]
2111 end = me_dir.split("//")[0]
2112 elif "/" in me_dir :
2113 start = 0
2114 end = 0
2115 elif me_dir.isdigit():
2116 start = me_dir
2117 end = me_dir
2118 elif not me_dir.isdigit():
2119 me_dir = self.submitted_ids[0]
2120 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) )
2121
2122 ongoing = []
2123 idle, run, fail, done = 0, 0, 0, 0
2124
2125 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac"
2126 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
2127
2128 for line in status.stdout:
2129 line = line.decode()
2130 status2 = line.split()[-1]
2131 if status2 != 'null' or line.split()[0].strip() != '0':
2132 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip()))
2133 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2)
2134
2135 if status2 == 'null' or line.split()[0].strip() == '0':
2136 idle += 1
2137 elif status2 in self.idle_tag:
2138 idle += 1
2139 elif status2 in self.running_tag:
2140 run += 1
2141 elif status2 in self.complete_tag:
2142 done += 1
2143 self.submitted -= 1
2144 if not self.check_termination(line.split()[1]):
2145 idle +=1
2146 else:
2147 fail += 1
2148
2149 return idle, run, self.submitted - (idle+run+fail), fail
2150
2151 @multiple_try()
2152 - def remove(self, *args, **opts):
2153 """Clean the jobson the cluster"""
2154
2155 if not self.submitted_ids:
2156 return
2157 id = self.submitted_ids[0]
2158 if id:
2159 cmd = "htcaas-job-cancel -m %s" % str(id)
2160 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2161
2162 from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2163 'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster,
2164 'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster}
2165
2166 onecore=MultiCore(1)
2167
2168
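# Usage sketch (hypothetical option values): a backend is usually picked by
# name through the from_name mapping defined above.
#
#   opts = {'cluster_type': 'condor', 'cluster_queue': None,
#           'cluster_status_update': (600, 30)}
#   mycluster = from_name[opts['cluster_type']](**opts)
#   job_id = mycluster.submit('./ajob1', cwd='/path/SubProcesses/P0_gg_ttx')
#   mycluster.wait('/path', lambda idle, run, done: None)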