o
    H&i>T                     @   s  U d dl mZmZ d dlmZmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlmZ d dlZd dlmZ d dlm   m!Z" d d	lm#Z# d d
l$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 G dd deZ8G dd de1Z9dej#j:dej#j:fddZ;e, a<dej=dee/ fddZ>edej?j@deeAef d eeAejBf fd!d"ZCejDjEZEed#d$ ZFdDd&d'ZGd(d) ZHd*d+ ZIdDd,d-ZJdDd.d/ZKd%d%d%d%d%d0d0ddd1	d2d3ZLeEjMjeGeEjMjNe
eJeEjOjNeEjPjNe
eKeEjQjNeEjRjNe
eKeEjSjNeEjTje
eIeEjUjeEjVjNe
eJeEjWjNeEjTjNe
eJeEjUjNeEjXjYe
eHeEjZjYeEj[jYe
eHeEj\jYeEj]jYe
eHeEj^jYeEj_jNe
eJeEj`jNeEjajYeLeEj%jYe%iZbejDjcjdjYejDjcjejYhZfeejgjh eid4< de#j:de#j:fd5d6ZjeG d7d8 d8Zkd9ed:eee8  d;e-d<ed=edekfd>d?Zld@Zm			dEd:eee8  dAeee#j:ge#j:f  d;ee- fdBdCZndS )F    )ABCabstractmethod)contextmanagernullcontext)copy)	dataclass)partialwraps)	AnyCallablecastDictListOptionalSetTupleUnion)make_fxN)fx)native_layer_norm_backward)FakeTensorMode)gradients_tagging)DataParallelDTensorExpandModeParallelMode)	Placement)_PyTreeCodeGen_PyTreeInfoCodeGen)	stateless)NamedMemberAccessorc                   @   sV   e Zd ZdZededejjdejjfddZ	ede
jdeej de
jfd	d
ZdS )Overridea  Override the tracing and transformation behavior of :meth:`~torch.distributed._spmd.compile`.

    This is useful when any part of the model is not traceable or if you prefer
    to not trace it due to any reason. More specifically, users can implement
    :meth:`torch.distributed._spmd.Override.replacement` to replace an original
    submodule with the return new submodule. The new submodule contains
    operations that users preferred to be traced, which simply be a dummy
    placeholder operator. After tracing, users can implement
    :meth:`torch.distributed._spmd.Override.transform` to transform the traced
    graph, where the dummy placeholder operator serves as an anchor to insert
    new sub-graphs.
    fqnorig_submodulereturnc                 C      dS )a  Implement this method to return a new :class:`nn.Module` instance to replace the ``orig_submodule``
        argument in the model.

        This helps if ``orig_submodule`` is not traceable or should not be traced.

        Args:
            fqn (str): fully quantified name of the submodule.
            orig_submodule (class:`nn.Module`): original submodule instance to replace.

        Returns:
            A new :class:`nn.Module` instance to replace the original one.

        N )selfr"   r#   r&   r&   FC:\wamp64\www\opt\env\Lib\site-packages\torch/distributed/_spmd/api.pyreplacement0   s   zOverride.replacementgm
flat_statec                 C   r%   )a/  
        Given a DTensor-expanded graph and sharding schema for every node,
        conduct additional transformation for the sub-graph from the :class:`nn.Module`
        returned by :meth:`torch.distributed._spmd.Override.replacement` if
        necessary.

        Args:
            gm (:class:`fx.Graph`): a DTensor-expanded graph.
            flat_state (List[str, :class:`Tensor`]): a reference to the list of
                flattened state. The elements in ``flat_state`` map to the first
                ``len(flat_state)`` placeholders in the graph. The transformation
                can add state to or remove state from ``flat_state`` as long as
                it keeps ``flat_state`` and the placeholders consistent.

        Returns:
            The :class:`fx.Graph` after transformation.

        Nr&   )r'   r*   r+   r&   r&   r(   	transformA   s   zOverride.transformN)__name__
__module____qualname____doc__r   strtorchnnModuler)   r   GraphModuler   Tensorr,   r&   r&   r&   r(   r!   "   s     r!   c                   @   s&   e Zd ZdedefddZdd ZdS )_PyTreeCodeGenOutputsOnlyargsr$   c                 G   s   |S Nr&   )r'   r8   r&   r&   r(   process_inputs^      z(_PyTreeCodeGenOutputsOnly.process_inputsc                 C   s   t | ||S r9   )r   
gen_fn_def)r'   Z	free_varsZmaybe_return_annotationr&   r&   r(   r<   b   s   z$_PyTreeCodeGenOutputsOnly.gen_fn_defN)r-   r.   r/   r
   r:   r<   r&   r&   r&   r(   r7   \   s    r7   r*   r$   c                 C   s,   t tdd| jjjjdd| j_|   | S )zMove the responsibility of flattening the input arguments from the graph module to the caller.

    Example:

        output = gm(my_struct)

        gm = gm(to_caller_flattened_graph_module)

        output = gm(*pytree.flatten(my_struct)[0])

    N)	orig_argsZin_specout_spec)pytree_info)r7   r   Z_graphZ_codegenr?   r>   	recompile)r*   r&   r&   r(   !_to_caller_flattened_graph_modulef   s   


rA   t
placementsc                 C   s   |t jt| < d S r9   )dtensor_expand_modeZ_placements_overrideid)rB   rC   r&   r&   r(   _override_placements   s   rF   optnamed_statesparamsc              	   c   s~    | d usJ t | j}|D ]}|| | j|| < q| jd }|d }| |d< zd V  W ||d< || _d S ||d< || _w )Nr   rI   )r   stateZparam_groupsvalues)rG   rH   rI   Zorig_statesnZparam_groupZorig_paramsr&   r&   r(   _rematerialize_optimizer   s   


rM   c                  c   sD    dd } t jjj}| jt jj_zd V  W |t jj_d S |t jj_w )Nc                   S   r%   )NTr&   r&   r&   r&   r(   f_true   r;   z_enable_compile.<locals>.f_true)r2   _utilsZis_compiling__code__)rN   Zorig_is_compiling_coder&   r&   r(   _enable_compile   s   
rQ      c                 C   s4   t jj| ||d}t| |D ]	\}}|| qd S )N)alpha)aten_foreach_addr   zipcopy_)r'   otherrS   self_updatedss_ur&   r&   r(   _foreach_add_decomp   s   r\   c                 C   s*   | |}t ||D ]	\}}|| q	d S r9   rV   rW   )opr'   rY   rZ   r[   r&   r&   r(   _foreach_unaop_decomp   s   r_   c                 C   ,   | ||}t ||D ]	\}}|| q
d S r9   r]   )r^   r'   rX   rY   rZ   r[   r&   r&   r(   _foreach_binop_list_decomp      
ra   c                 C   r`   r9   r]   )r^   r'   scalarrY   rZ   r[   r&   r&   r(   _foreach_binop_scalar_decomp   rb   rd   c                 C   s0   | ||||}t ||D ]	\}}|| qd S r9   r]   )r^   r'   Ztensor1Ztensor2rc   rY   rZ   r[   r&   r&   r(   _foreach_addcop_scalar_decomp   s   re   T	lrbeta1beta2weight_decayepsamsgradmaximize
grad_scale	found_infc       	         C   s   | ||||f}t jj| |||||||||	|
||||d}tt||D ]\}\}}|dkr.q#t||D ]	\}}|| q3q#d S )Nrf   rR   )rT   Z_fused_adamdefault	enumeraterV   rW   )r'   ZgradsZexp_avgsZexp_avg_sqsZmax_exp_avg_sqsZstate_stepsrg   rh   ri   rj   rk   rl   rm   rn   ro   Z
orig_tupleZupdated_tupleidxorigupdatedour&   r&   r(   _fused_adam_decomp   s2   rw   DEDUP_TARGETSc                 C   sv   i }| j jD ].}tj|j }|jtv r4|jg|R }||d }|d u r)|||< q|| | j 	| q| 
  | S r9   )graphnodespytreearg_tree_leavesr8   targetrx   getZreplace_all_uses_withZ
erase_noder@   )r*   Zargs_to_nodenoder8   Zargs_keyZunique_noder&   r&   r(   _dedup_collectives.  s   


r   c                   @   s@   e Zd ZU ejed< ejed< ee	j
j ed< ee	j ed< dS )_CompiledResultr*   modrG   r+   N)r-   r.   r/   r   r5   __annotations__r3   r4   r   r2   optim	Optimizerr   r6   r&   r&   r&   r(   r   E  s
   
 

r   funcmodule_overrideparallel_moder8   kwargsc              	      sV  d\t j|i |D ]#}t|tjrd u sJ d|t|tjjr/d u s-J d|qd us8J drTt dt	dtjjdd f fdd	d
 t
jdd}t
jdd}i }d ur| D ]\}	}
|
jv r~j|
 ||	< qnt|tfdd}rdnd}rt tt|dtjdtjffdd}t tj||}t tj||}t 0 tjjdd tt|| |tdd|||||}W d    n1 sw   Y  W d    n1 sw   Y  i ||}||||||}t ||g}t|}t|}r$D ]	}|||}qt||S )N)NNz%Only support single nn.Module for nowz%Only support single Optimizer for nowz5Couldn't find nn.Module instances from the arguments.
fqn_prefixmoduler$   c                    sz   D ]8}|  D ]1\}}t|dkrq| dkr| d | n|}|||}t|t|kr3|| q || qqd S )Nr    .)Znamed_childrenlenr)   rE   Zswap_submodule)r   r   overridenamechildr"   	new_child)accessorr   swapr&   r(   r   g  s   z_compile.<locals>.swapr   F)Zremove_duplicatec              
      s   t i ||X rt||nt <  rt|nt  | |i |}W d    n1 s2w   Y  |t t| fW  d    W  d    S 1 sUw   Y  W d    d S 1 sew   Y  d S r9   )r   Z_reparametrize_modulerM   r   r   list
parametersrK   )r   rI   buffersrH   r8   r   ret)is_data_parallel_moder   rG   r&   r(   stateless_func  s   Rz _compile.<locals>.stateless_funcZfakeZsymbolicargc                    s6    | }dg| j }| j  t 9  < ||S )NrR   )Zfrom_tensorndimZinput_batch_dimdistZget_world_sizerepeat)r   Zfake_argZarg_dims)data_parallel_mode	fake_moder&   r(   _get_full_batch_arg  s   

z%_compile.<locals>._get_full_batch_arg)Z	check_nan)tracing_modeZdecomposition_tableZ_allow_non_fake_inputs) r{   r|   
isinstancer3   r4   r2   r   r   r    r1   dictZnamed_parametersZnamed_buffersitemsrJ   r   r   r   r6   Ztree_map_onlyrQ   ZautogradZdetect_anomalyr   r   SPMD_DECOMP_TABLE	partitionZtree_leavesrA   r   r,   r   )r   r   r   r8   r   r   rI   r   rH   rL   pr   r   r   r*   Zparams_and_buffersr+   r   r&   )r   r   r   r   r   r   rG   r   r(   _compileM  s   "




 r   Z_compiled_objgm_transformationc                    s   dt f fdd}|S )a  Compile and optimize a callable, which can be a train step within a training loop.

    This method will extract :class:`nn.Module` and :class:`torch.optim.Optimizer`
    instances from the input arguments and trace operations applied to their
    parameters and states.

    Args:
        module_override (Optional[List[Override]]): a list of Override instances
            that will be applied to the module in order. The :class:`Override`
            objects provide :class:`nn.Module` replacements during tracing and a
            graph transformation function after tracing. (Default: ``None``)
        gm_transformation (Optional[Callable[fx.GraphModule, fx.GraphModule]]):
            a callback that will be called after the original callable is
            compiled and distributed (usually after the first iteration) to
            transform the compiled GraphModule into a new optimized one.
        parallel_mode (Optional[ParallelMode]): a :class:`ParallelMode` object
            that specifies how to parallelize the callable. Each ParallelMode
            would have its own strategy to partition the model and the captured
            graph (Default: ``None``)

    r   c                    s    t   fddS )Nc            	         s0  |r| ddnd}d}jtd }|d u r3d}d u rtn}t |g| R i |}|jt< |jtj| i | }t	
 L |rMrM|j|_|sW|j| d }n.z|j|d|id }W n! ty } zdt|vrs||j| d }W Y d }~nd }~ww |W  d    S 1 sw   Y  d S )Nlast_train_stepFTr   Z	last_iter)pop__dict__r~   COMPILED_OBJECT_KEYrD   r   r+   r{   r|   r2   Zno_gradr*   	TypeErrorr1   )	r8   r   r   Z
first_iterZcompiled_objmodeZ	flat_inpsoutpute)r   r   r   r   wrapperr&   r(   r     s>   

$z'compile.<locals>.inner.<locals>.wrapper)r	   )r   r   r   r   )r   r   r(   inner  s   ,zcompile.<locals>.inner)r   )r   r   r   r   r&   r   r(   compile  s   0r   )rR   )NNN)oabcr   r   
contextlibr   r   r   dataclassesr   	functoolsr   r	   typingr
   r   r   r   r   r   r   r   r   Z	functorchr   r2   Ztorch.distributeddistributedr   Z)torch.distributed._functional_collectivesZtorch.nnr3   Ztorch.utils._pytreeutilsZ_pytreer{   r   Ztorch._decomp.decompositionsr   Ztorch._subclasses.fake_tensorr   Z%torch.distributed._spmd.data_parallelr   Z%torch.distributed._spmd.parallel_moder   r   r   Ztorch.distributed._tensorr   Ztorch.fx.graphr   r   r   Ztorch.nn.utilsr   Z%torch.nn.utils._named_member_accessorr    r!   r7   r5   rA   rD   r6   rF   r   r   r1   	ParameterrM   opsrT   rQ   r\   r_   ra   rd   re   rw   Z_foreach_add_ZScalarrU   Z_foreach_addcdiv_Z_foreach_addcdivZ_foreach_addcmul_Z_foreach_addcmulZ_foreach_div_Z_foreach_divZ_foreach_mul_Z_foreach_mulZ_foreach_neg_rp   Z_foreach_negZ_foreach_reciprocal_Z_foreach_reciprocalZ_foreach_sqrt_Z_foreach_sqrtZ_foreach_sub_Z_foreach_subZ_fused_adam_r   Zc10d_functionalZ
all_reduceZwait_tensorrx   Z_opsZ
OpOverloadr   r   r   r   r   r   r&   r&   r&   r(   <module>   s    ,:





.
&


 %
