
    ^i;                         S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
rS SKJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJr   " S	 S
5      r " S S\5      r  S"S\R.                  \R0                  \R2                        S\R4                  \R6                  \\4      S\S\R<                  \R0                  \R2                        4S jjrS r  " S S5      r!S\S\S\RD                  \#\RH                  \   4   4S jr%SS\%4S\R.                  \R0                  \R2                        S\S\&S\RN                  SS4
S jjr(\RR                  S#S\R.                  \#   S\R<                  S   4S jj5       r* " S  S!5      r+g)$    N)Path   )core)JobEnvironment)CommandFunctionDelayedSubmission)environment_variablesc                   j   ^  \ rS rSrSrU 4S jrS\R                  S\R                  S\4S jr	Sr
U =r$ )	Checkpointable   a  Derived callable classes are requeued after timeout with their current
state dumped at checkpoint.

__call__ method must be implemented to make your class a callable.

Note
----
The following implementation of the checkpoint method resubmits the full current
state of the callable (self) with the initial argument. You may want to replace the method to
curate the state (dump a neural network to a standard format and remove it from
the state so that not to pickle it) and change/remove the initial parameters.
c                 p   > [         TU ]  U 5      n[        U5      (       d   SU R                   S35       eU$ )NzClass z^ is marked as Checkpointable but doesn't have a __call__ method. Please add a __call__ method.)super__new__callable__name__)clsargskwargsinstance	__class__s       M/mnt/rpi/tmp/demucs-venv-sys/lib/python3.13/site-packages/submitit/helpers.pyr   Checkpointable.__new__,   sM    7?3'
 
 	ACLL>!  A	A 
     r   r   returnc                      [        U /UQ70 UD6$ )z3Resubmits the same callable with the same argumentsr   )selfr   r   s      r   
checkpointCheckpointable.checkpoint3   s     !7777r    )r   
__module____qualname____firstlineno____doc__r   tpAnyr	   r   __static_attributes____classcell__)r   s   @r   r   r      s5    8 8"&& 8=N 8 8r   r   c                      \ rS rSrSrSS\SS4S jjrS\R                  S\R                  4   S	\R                  S
\R                  SS4S jr
S\4S jrS\R                  \   4S jrS\R                   \R                     4S jrSrg)FunctionSequence:   a  This is for gathering several estimations into one function, which
will return the sequence of outputs.
Also this "function" is stateful, hence it can be stopped, and recovered,
which is useful when job can be preempted.

Usage
-----
func = FunctionSequence()
func.add(my_function1, arg1, kwarg1=value_kwarg1)
func.add(my_function2, arg1, arg2)
result1, result2 = func()

Note
----
This function is checkpointable because:
- it derives from Checkpointable
- it keeps DelayedSubmission objects as attribute, which in turn store the
  results of the computation in memory once they are computed. So at checkpoint
  time, those results will be saved, and only the non-computed results
  will be computed once the job restarts.
verboser   Nc                     Xl         / U l        g N)r,   delayed_functions)r   r,   s     r   __init__FunctionSequence.__init__Q   s    =?r   func.r   r   c                 T    U R                   R                  [        U/UQ70 UD65        g r.   )r/   appendr	   )r   r2   r   r   s       r   addFunctionSequence.addU   s&    %%&7&Nt&Nv&NOr   c                 ,    [        U R                  5      $ r.   )lenr/   r   s    r   __len__FunctionSequence.__len__X   s    4))**r   c                 ,    [        U R                  5      $ r.   )iterr/   r9   s    r   __iter__FunctionSequence.__iter__[   s    D**++r   c                     U R                   (       a5  [        S U  5       5      n[        SU S[        U R                  5       3SS9  U R                   Vs/ s H  o"R                  5       PM     sn$ s  snf )Nc              3   @   #    U  H  oR                  5       v   M     g 7fr.   )done).0fs     r   	<genexpr>,FunctionSequence.__call__.<locals>.<genexpr>`   s     .Avvxxs   zStarting from /Tflush)r,   sumprintr8   r/   result)r   rB   rD   s      r   __call__FunctionSequence.__call__^   sh    <<...DN4&#d.D.D*E)FGtT $ 6 6
 61HHJ 6
 	
 
s   A1)r/   r,   F)r   r!   r"   r#   r$   boolr0   r%   Callabler&   r5   intr:   Iteratorr	   r>   ListrM   r'   r    r   r   r*   r*   :   s    ,@ @ @PCK0 P P266 PVZ P+ +,"++&78 ,
"''"&&/ 
r   r*   jobstimeoutpoll_frequencyr   c              #     #    [         R                   " 5       n[        5       n Ub"  [         R                   " 5       U-
  U:  a  [        e[        U 5       H8  u  pVXT;   a  M  UR	                  5       (       d  M#  UR                  U5        Uv   M:     [        U5      [        U 5      :X  a  g[         R                  " U5        M  7f)u  
Yields jobs as they complete (finished, failed or were cancelled).
Raises a TimeoutError if the result isn’t available after timeout seconds.
timeout can be an int or float. If timeout is not specified or None, there is no
limit to the wait time.

Parameters
----------
jobs: list
    Jobs instances

timeout: int/float
    Maximum time (in sec) to wait for jobs completion

poll_frequency: float
    Frequency in second at which we check job status.

Yields
------
Job
    The next completed job
N)timesetTimeoutError	enumeraterB   r5   r8   sleep)rU   rV   rW   start	jobs_doneijobs          r   as_completedrb   g   s     6 IIKE UI
499;#6#@oFA~xxzza 	 & y>SY&

>" s   A0C 6A
C c                 j    [         R                  " U 40 UD6R                  S5      R                  5       $ )Nzutf-8)
subprocesscheck_outputdecodestrip)str_argsr   s     r   run_cmdri      s,    ""86v6==gFLLNNr   c                       \ rS rSrSr    SS\S\R                  \   S\S\R                  \
   S\R                  \
   4
S	 jjr\SS
\S\4S jj5       rSS jrS rSrg)RsyncSnapshot   a  Takes a snapshot of the git repository that the script lives in.

This ensures that remote jobs always use the code from when they are scheduled
and not the code from when they are launched / re-started.


Parameters
----------
snapshot_dir: Path
    A path to where the snapshot should be created
with_submodules: bool
    Whether or not submodules should be included in the snapshot
exclude: Sequence[str]
    An optional list of patterns to exclude from the snapshot
include: Sequence[str]
    A list of relative file names to include from the snapshot.
    Useful for .so or other build artifacts that are genarally not tracked by git.

Note
----
- Only files that are checked in to the repository are included in the snapshot.
    If you have experimental code that you would like to include in the snapshot,
    you'll need to `git add` the file first for it to be included, or use `include` arg.
Nsnapshot_dirroot_dirwith_submodulesexcludeincludec                     U R                  SS9  [        U5      U l        U=(       d    [        / SQ5      U l        [        R
                  " 5       U l        X0l        X@l        XPl	        g )NTthrow)gitz	rev-parsez--show-toplevel)
	availabler   rm   ri   rn   cwdoriginal_dirro   rp   rq   )r   rm   rn   ro   rp   rq   s         r   r0   RsyncSnapshot.__init__   sO     	T" . TG,S$T HHJ.r   rt   r   c                 `    [         R                  " S5      (       d  U (       a  [        S5      egg)Nrsyncz-RsyncSnapshot requires rsync to be installed.FT)shutilwhichRuntimeErrorrs   s    r   rv   RsyncSnapshot.available   s&    ||G$$"#RSSr   c                    [         R                  " 5       U l        [        U R                  5      nU R
                  (       a  SOSnU R                  R                  5       (       dU  U R                  R                  R                  SSS9  [        R                  " SSSSU 3[        U R                  5      /5        [        R                  " 5        n[        S	U S
UR                   3USS9  [!        ["        R$                  R'                  S U R(                   5       5      5      n[+        UR                  SSS9 nU R,                   H  n[/        XeS9  M     S S S 5        [        SSSUR                  U[        U R                  5      /U-   5        S S S 5        [0        R2                  " U R                  5        g ! , (       d  f       Nh= f! , (       d  f       N@= f)Nz--recurse-submodulesz-sT)parentsexist_okru   clonez	--depth=2zfile://zgit ls-files z | grep -v ^16 | cut -f2- > )rw   shellc              3   *   #    U  H	  nS U4v   M     g7f)z	--excludeNr    )rC   pats     r   rE   *RsyncSnapshot.__enter__.<locals>.<genexpr>   s     8dWcPS+s9KWcs   autf8)encoding)filer{   z-az--files-from)r   rw   rx   strrn   ro   rm   existsparentmkdirrd   
check_calltempfileNamedTemporaryFileri   namelist	itertoolschainfrom_iterablerp   openrq   rK   oschdir)r   rn   subtfilerp   oincs          r   	__enter__RsyncSnapshot.__enter__   si    HHJt}}%(,(<(<$$  ''))$$**4$*G!!5';'(@TVYZ^ZkZkVl"mn ((*emC5(DUZZLQW_gkl9??888dW[WcWc8ddeGejj#71<<C#& ( 8 WdNEJJ#dN_N_J`adkkl + 	""#	 87	 +*s%   A*G8F79G7
G	G
Gc                 D    [         R                  " U R                  5        g r.   )r   r   rx   )r   r   s     r   __exit__RsyncSnapshot.__exit__   s    
""#r   )rp   rq   rx   rn   rm   ro   )NFr    r    rO   r   N)r   r!   r"   r#   r$   r   r%   OptionalrP   Sequencer   r0   staticmethodrv   r   r   r'   r    r   r   rk   rk      s    8 '+ %$&$& ++d# 	
 S! S!   $  $,$r   rk   monitoring_start_timen_jobs
state_jobsc           	         [         R                   " 5       U -
  n[        R                  R                  5       R                  S5      n[	        US   5      n[        [        U5      5      n[        SU S[        US-  5       S3[        US   5      U  SU S	3[        U5      U  SU S
3[        US   5      [        U5      -
  U  SU S3SS9  [        U5      S:  a  [        SU SU 3SS9  g g )Nz%Y-%m-%d %H:%M:%SFAILED[z] Launched <   z minutes ago,RUNNINGrG   z jobs running,z jobs failed,DONEz
 jobs doneTrH   r   z] Failed jobs, indices )	rY   datetimenowstrftimesortedr8   r   rK   rR   )r   r   r   run_time	date_timefailed_job_indicesn_charss          r   _default_custom_loggingr      s   yy{22H!!%%'001DEI
8 45#f+G	
I;k#hm"4!5]Cz)$%wi
0&H!"G9
-QvhmDz&!"S);%<<gY
GqPZ[ ")34F3GHPTU #r      F	test_modecustom_loggingc                 X   U(       d  US:  d   S5       e[        U 5      nUS:X  a  [        S5        gSR                  [        [	        S U  5       5      5      5      n[        SU S	U S
35        [
        R
                  " 5       n U(       d  U S   R                  SS9  [        R                  " [        5      n[        U 5       H[  u  pXyR                  R                  5          R                  U5        U	R                  5       (       d  MG  US   R                  U5        M]     [        US   5      n
[        US   5      [        U 5      :X  a  [        SU
 S3SS9  O!U" XdU5        [
        R                  " U5        M  [        S[        [
        R
                  " 5       U-
  S-  5       S35        g)aQ  Continuously monitors given jobs until they are all done or failed.

Parameters
----------
jobs: List[Jobs]
    A list of jobs to monitor
poll_frequency: int
    The time (in seconds) between two refreshes of the monitoring.
    Can't be inferior to 30s.
test_mode: bool
    If in test mode, we do not check the length of poll_frequency
r   z@You can't refresh too often (>= 30s) to avoid overloading squeuer   zThere are no jobs to monitorNz, c              3   r   #    U  H-  n[        UR                  5      R                  S S5      S   v   M/     g7f)_r   r   N)r   job_idsplit)rC   ra   s     r   rE   monitor_jobs.<locals>.<genexpr>  s,     %WRV3c#**o&;&;C&CA&FRVs   57zMonitoring z jobs from job arrays z 
Tforce)moder   r   z%All jobs finished, jobs with indices z failedrH   z Whole process is finished, took r   z minutes)r8   rK   joinr   rZ   rY   get_infocollectionsdefaultdictr\   stateupperr5   rB   r]   rR   )rU   rW   r   r   r   
job_arraysr   r   r`   ra   r   s              r   monitor_jobsr      sz   & #g%gg#YF{,-6#%WRV%W"WXYJ	Kx5j\
EF IIK
G'* ,,S1
oFAyy()--a0xxzz6"&&q) &
 $Jx$89z&!"c$i/9:L9MWU]ab,jA

>" " 
,S$))+@U2UY[1[-\,]]e
fgr   extra_namesc              #   j  #    Sn[         R                   Vs0 s HE  nUR                  S5      (       d  X!;   d  X ;   d  M%  U[         R                  R                  U5      _MG     nn Sv   [         R                  R	                  U5        gs  snf ! [         R                  R	                  U5        f = f7f)a`  Removes slurm and submitit related environment variables so as to avoid interferences
when submiting a new job from a job.

Parameters
----------
extra_names: Sequence[str]
    Additional environment variables to hide inside the context,
    e.g. TRITON_CACHE_DIR and TORCHINDUCTOR_CACHE_DIR when using torch.compile.

Note
----
A slurm job submitted from within a slurm job inherits some of its attributes, which may
be confusing a cause weird gres errors (or pytorch distributed).
Submitting within this context should prevent this.

Usage
-----
with submitit.helpers.clean_env():
    executor.submit(...)
MASTER_ADDRMASTER_PORTRANK
WORLD_SIZE
LOCAL_RANKLOCAL_WORLD_SIZE)SLURM_SLURMD_SRUN_SBATCH_	SUBMITIT_N)r   environ
startswithpopupdate)r   distrib_namesxcluster_envs       r   	clean_envr   '  s     , kM ALLOPP! 	2::>>!  '


+& 	

+&s-   B3$B
$B
#B3&B *%B3!B00B3c                   J    \ rS rSrS
S jrS\4S jr  SS\S\SS 4S jjrS	r	g)TorchDistributedEnvironmentiM  r   Nc                    [        5       U l        U R                  R                  S   U l        U R	                  5       U l        U R                  R                  U l        U R                  R                  U l	        U R                  R                  U l
        U R                  R                  U R                  R                  -  U l        g)aO  Construct a class holding the parameters required to properly setup
PyTorch distributed (with the default env:// initialization method).

Examples
--------
>>> dist_env = TorchDistributedEnvironment().export()
>>> torch.distributed.init_process_group(backend="nccl")
>>> print(f"master: {dist_env.master_addr}:{dist_env.master_port}")
r   N)r   _job_env	hostnamesmaster_addr_get_master_portmaster_portglobal_rankrank	num_tasks
world_size
local_rank	num_nodeslocal_world_sizer9   s    r   r0   $TorchDistributedEnvironment.__init__N  s     '(==2215002MM--	--11--22 $ 7 74==;R;R Rr   c                     Su  p[         R                  R                  S5      nUc;  [        R                  " U R
                  R                  5      nUR                  X5      $ [        U5      nU$ )N)i N  i`  r   )	r   r   getrandomRandomr   r   randintrR   )r   MIN_MASTER_PORTMAX_MASTER_PORTmaster_port_strrngr   s         r   r   ,TorchDistributedEnvironment._get_master_port`  sY    +9(**..7"-- 4 45C;;@@/*r   set_cuda_visible_devices	overwritec                    U R                   [        U R                  5      [        U R                  5      [        U R                  5      [        U R
                  5      [        U R                  5      S.nU(       d,  U H&  nU[        R                  ;   d  M  [        SU S35      e   U(       a  [        U R
                  5      US'   [        R                  R                  U5        U $ )aR  Export all the environment variables required to properly setup
PyTorch distributed (with the default env:// initialization method) i.e.
MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE (to which LOCAL_RANK and
LOCAL_WORLD_SIZE are added).

Parameter
----------
set_cuda_visible_device: bool
    if True, updates CUDA_VISIBLE_DEVICES to use only the device
    matching the local rank.
overwrite: bool
    if True, overwrites the environment variables if they exist;
    this can be useful when launching a job from another job.

Returns
--------
TorchDistributedEnvironment
    the current instance
r   z'Cannot export environment variables as z is already setCUDA_VISIBLE_DEVICES)r   r   r   r   r   r   r   r   r   r~   r   )r   r   r   env_varskeys        r   export"TorchDistributedEnvironment.exportm  s    8  ++t//0		Ndoo.doo. #D$9$9 :
 "**$&)PQTPUUd'eff   $/24??/CH+,


(#r   )r   r   r   r   r   r   r   r   )TF)
r   r!   r"   r#   r0   rR   r   rP   r  r'   r    r   r   r   r   M  sB    S$#  *.+"&+ + 
'	+ +r   r   )N
   )r    ),r   
contextlibr   r   r   r   r|   rd   r   rY   typingr%   pathlibr   r   core.job_environmentr   
core.utilsr   r	   r
   r   r*   r   JobRr   UnionrR   floatrS   rb   ri   rk   Dictr   Setr   rP   rQ   r   contextmanagerr   r   r    r   r   <module>r     s       	         0 : > F8 8:*
~ *
^ 26(#
++dhhtvv&
'(#[[#u*-.(# (# [[$&&!"	(#VOI$ I$XV5 V# VSUSZSZ[^`b`f`fgj`k[kSl V( "9	0h
++dhhtvv&
'0h0h 0h KK	0h
 
0hf "'2;;s+ "'R[[5F "' "'JK Kr   