o
    H&i                     @   s  U d dl Z d dlZd dlZd dlmZmZmZ d dlmZ d dl	m
Z
mZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z#m$Z$m%Z&m'Z'm(Z(m)Z)m*Z*m+Z+ d d	l,m-Z-m.Z. d d
l/m0Z0 d dl1m2Z3 dZ4dZ5e5 dZ6dZ7e7 dZ8dZ9ee: Z;e< Z=ee e>d< eeeej?e@eAe:f ZBeeBeeB eeB ee:df f ZCee:eCf ZDeeD ZEee:eeDeEf f ZFe jGdd ZHeG dd dZIeG dd deIZJdSdejKde:deLde;fddZMddd dejKd!eejNjOd"f d#eLd$eeejK  d%eeI deJfd&d'ZPd(ee:eCf d)eFd*eJddfd+d,ZQd-eejKejNjOf d.e:defd/d0ZRdejKd*eJdee:eCf fd1d2ZSdejKd3ee:eCf d*eJde0fd4d5ZTd6ejNjOddfd7d8ZUdejKd9eejNjOd"f d*eJdeFfd:d;ZVdejKd6ejNjOd)eFd*eJdeFf
d<d=ZWdejKd9eejNjOd"f d3eFd*eJddf
d>d?ZXddd dejKd$eeejK  d%eeI dee:eCf fd@dAZYddd dejKd9eejNjOeejNjO f d$eeejK  d%eeI deFf
dBdCZZddd dejKd9eejNjOeejNjO f d$eeejK  d%eeI deee:eCf eFf f
dDdEZ[dejKd3eeejKee:eCf f ee:eCf f dee:eCf fdFdGZ\ddHdejKd(eeejKee:eCf f ee:eCf f d%eeI de0fdIdJZ]ddHdejKd9eejNjOeejNjO f d)eFd%eeI ddf
dKdLZ^ddHdejKd9eejNjOeejNjO f d(eeejKee:eCf f ee:eCf f d)eFd%eeI de0fdMdNZ_eddHdejKd%eeI ddfdOdPZ`eddHdejKd9eejNjOd"f d%eeI ddfdQdRZadS )T    N)asdict	dataclassfield)chain)AnyCallablecastDictIterableListno_type_checkOptionalSetTupleUnion)ShardedTensor)DTensor)_gather_state_dict_offload_state_dict_to_cpu)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)_IncompatibleKeys)DistributedDataParallelZ_flat_paramparam_groups.stateparams_patched_state_dict	ValueTypec                  c   sP    t  } t   zd V  W t   | rt   d S d S t   | r't   w w N)gc	isenableddisableZcollectenable)
is_enabled r-   RC:\wamp64\www\opt\env\Lib\site-packages\torch/distributed/checkpoint/state_dict.py
gc_contextC   s   
r/   c                   @   sN   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< d	S )
StateDictOptionsa  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes``: when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().
      The default value is False.
    Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictN)__name__
__module____qualname____doc__r1   bool__annotations__r2   r3   r4   r5   r-   r-   r-   r.   r0   P   s   
 r0   c                   @   s   e Zd ZU eedZeeee	j
f eee	j
f f ed< eedZee ed< eedZee ed< dZeed< dZeed< ejZeed< eedZeej ed	< d
S )_StateDictInfo)default_factoryfqn_param_mappingall_fqnssubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesN)r6   r7   r8   r   dictr>   r	   r   strtorchTensorFQNS_Tr;   setr?   r   r@   rA   r:   rB   
contextlibnullcontextrC   r   listrD   r   nnModuler-   r-   r-   r.   r<   w   s   
 r<   Tmodelnameskip_ddp_prefixreturnc           	         s   d|vr|hS | d}g }| }t|D ]b\}}t|tr.|dks#J |j}|s-|| qt|trl||d  tkrXd| t	|t} rL  d  fdd|j
D   S t	|t}|tkrk|| t	||}q|| t	||}qd|hS )a  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `Set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
    r"   module   c                    s   h | ]}  | qS r-   r-   ).0fqnprefixr-   r.   	<setcomp>   s    z_get_fqns.<locals>.<setcomp>)split	enumerate
isinstanceDDPrT   appendFSDP
FLAT_PARAMjoingetattrZ_fqnsr   )	rP   rQ   rR   Z	obj_namesZfqn_obj_namesZcurr_objiZcurr_obj_nameZ
flat_paramr-   rX   r.   	_get_fqns   s6   










re   )
submodulesoptionsoptims.
optim_onlyrf   rg   c                C   sx  |r|st d|pt }i }t }|  D ]\}}t| |}	|	||< |	D ]}
|||
< ||
 q%qt }|ret|}|  D ]%\}}||vrHq?t| |}	t|	dksWJ d|	D ]
}
||
 d qYq?t	| }|r|j
rt|j|jd}t|j|jd}tj}nt }t|jd}tj}tjtj| |||d}ntj}tdi t|||||tttj || t|dkd	S )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    z;Optimizers are not passed in but optim_only is set to True.rU   z)Submodule FQN should only have 1 instancer"   )offload_to_cpuZ
rank0_only)rj   )rT   state_dict_typestate_dict_configoptim_state_dict_configr   )r>   r?   r@   rC   rD   rA   rB   Nr-   )RuntimeErrorr0   rJ   named_parametersre   addnamed_moduleslenr`   rD   r1   r   r2   r   r   ZFULL_STATE_DICTr   r   ZSHARDED_STATE_DICT	functoolspartialrk   rK   rL   r<   r   r   r   rN   rO   )rP   rh   ri   rf   rg   r>   r?   rQ   paramfqnsrW   r@   rT   rD   rl   rm   rk   rC   r-   r-   r.   _verify_options   sx   





rw   model_state_dictoptim_state_dictinfoc                 C   s   d}|j D ]}t|}|d usJ d|jrd} nq|j r$|s$td|jrB| sB|jsB|jsB|jr5|jsB|j	rBtdt
 d|jrX|rK|t sX|jrQ|jsXtd| |  D ]}t|v rlt| dt d	q\d S )
NFz)Expected a fsdp_state with a fsdp module.Tz:The model has FSDP modules but no FSDP root module exists.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=r"   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)rD   r   Z_is_rootrn   rA   r@   r3   r2   r1   r5   distZget_rankrB   STATEkeysra   )rx   ry   rz   Zhas_fsdp_rootrT   Z
fsdp_statekeyr-   r-   r.   _verify_state_dict  s^   

r   objapic                 C   s,   t | |}|tv rtjt | j|| d}|S )N)self)rc   r%   rs   rt   	__class__)r   r   callr-   r-   r.   _state_dict_fn5  s   
r   c                 C   s  |j si S |  t| d }W d    n1 sw   Y  t| D ]6}t| |}t|dks4J tt|}||kr[dt	fdd}|||sTt
d| d| ||||< q%|jri }| D ]&}|jD ] }||srqj|jr||| ||< qj|t|d  }	|| ||	< qjqe|}|jr|  D ]\}}
|
jrqt| |}|D ]}|| qqt| D ]\}}|jr|| q|jr|jst nd}t||j|d	S |jrt|S |S )
N
state_dictrU   rS   c                 S   s   t |t | kr
dS |d}| d}d}t|D ]&\}}||| kr9|d7 }|t |kr8|t |d k  S q|dkr>q dS dS )NFr"   r   rU   rT   T)rr   r[   r\   )r~   rW   Z	fqn_splitZ	key_splitZfqn_idxZkey_idxZkey_namer-   r-   r.   verifyL  s   

z%_get_model_state_dict.<locals>.verifyzAn unexpected key, z, exists. FQN is r   r2   
ranks_only)rA   rC   r   rM   r}   re   rr   nextiterr:   rn   popr@   
startswithr4   r3   ro   requires_graditemsis_metar1   r2   tupler   r   )rP   rz   r   r~   rv   rW   r   new_state_dictrY   Znew_fqnru   pr   r-   r-   r.   _get_model_state_dict<  s^   






r   r   c           	      C   s   |j r|s
ti i S |  D ]%\}}t| |}t| |dd}t||D ]\}}||kr2||||< q#q|  ttt| d||j	dW  d    S 1 sPw   Y  d S )NF)rR   load_state_dict)r   r5   )
rA   r   ro   re   zipr   rC   r   r   r5   )	rP   r   rz   r~   _rv   Zfqns_with_ddp_prefixrW   Zfqn_with_ddp_prefixr-   r-   r.   _load_model_state_dict  s$   



$r   optimc                 C   sd   | j rdS | jD ]}|t D ]}|jdurtd|jr"t||_qq| jdd | j	dd dS )zH
    Initialize optim states by calling the step() with zero grads.
    Na  state_dict can only be used if the optimizer states are initialized (usually after one step() with gradients) or gradients are None. For the later case, state_dict will fake the gradients as zero to initialize the optimizer states. However, the gradients are not None.)closureT)Zset_to_none)
r#   r!   PARAMSZgradrn   r   rG   Z
zeros_likestepZ	zero_grad)r   param_groupru   r-   r-   r.   _init_optim_state  s   

r   
optimizersc              	      s  |j si S ti tg i}|D ]}t| t|d }|jr8|  t| ||}W d    n1 s2w   Y  nut	t
dd |jD }tt|tt|}i  |  D ](\}}	t| |}
t|
dksgJ tt|
}|	|vrrqV||	 }| |< | |< qVt	|t  D ]} | }|t ||t |< q|t D ]} fdd|t D |t< q|sqtt|t |t  tt|t |t  q|jr|jst nd}t ||j|dS |jrt!|S |S )	Nr   c                 s   s    | ]}|t  V  qd S r'   )r   )rV   gr-   r-   r.   	<genexpr>  s    z(_get_optim_state_dict.<locals>.<genexpr>rU   c                    s   g | ]} | qS r-   r-   )rV   pidZfqn_pid_mappingr-   r.   
<listcomp>  s    z)_get_optim_state_dict.<locals>.<listcomp>r   r   )"rB   r|   PGr   r   rD   rC   r`   ry   rM   r   from_iterabler!   rE   r   rangerr   ro   re   r   r   r}   r   r   r   DictValueTypeupdateListDictValueTypeextendr1   r2   r   r   r   )rP   r   rz   ry   r   Zosdr$   Zparam_pid_mappingr~   ru   rv   rW   r   groupr   r-   r   r.   _get_optim_state_dict  sP   


r   c                 C   s4  i }g }t |t|i}i }|jD ]]}|tg i |t D ]O}	|j|	 D ]G}
|d t }t|ts2J ||
 |	jrEt	t
|t  |
 ||
< t	t|t D ]}|t }t|tsYJ |
|v rit|t d |t|< qLq#qqt	t|t D ]#}|t|d}|dkrqt| D ]\}}|tkrq||| |< qqt|S )a  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    rU   )r|   r   r!   r_   r   r>   r]   rM   r   r   r   r   rr   idgetr   )rP   r   ry   rz   r#   Zpg_stateZ
return_osdZ
pg_mappingr   ru   rW   r$   Zloaded_param_groupidxr~   valuer-   r-   r.   _split_optim_state_dict  s@   

r   c              	   C   sz   |j sd S |D ]3}t| |||}|jr.|  t| ||}W d    n1 s)w   Y  t| t|d|d qd S )Nr   r   )rB   r   rD   rC   r`   Zoptim_state_dict_to_loadr   r   )rP   r   r   rz   r   ry   r-   r-   r.   _load_optim_state_dict"  s   
r   c                C   sX   t   t| t d||d}t| |}t|i | |W  d   S 1 s%w   Y  dS )a  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules: Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.
    Fri   rf   rg   N)r/   rw   r   r   r   )rP   rf   rg   rz   rx   r-   r-   r.   get_model_state_dict:  s   
$r   c                C   st   t  - t|tjjr|fnt|}t| |d||d}t| ||}ti || |W  d   S 1 s3w   Y  dS )a  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules: Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.
    Tr   N)	r/   r]   rG   r   	Optimizerr   rw   r   r   )rP   r   rf   rg   rz   ry   r-   r-   r.   get_optimizer_state_dict]  s    $r   c                C   s   t  4 t|tjjr|fnt|}t| |d||d}t| |}t| ||}t	||| ||fW  d   S 1 s:w   Y  dS )a)  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:

        import torch
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.nn.parallel import DistributedDataParallel as DDP
        from torch.distributed.checkpoint.state_dict import get_state_dict

        fsdp_model = FSDP(copy.deepcopy(model))
        fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        ddp_model = DDP(copy.deepcopy(model))
        ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(fsdp_model, fsdp_optim)

        # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        # the asserts will fail.
        assert ddp_state_dict == fsdp_state_dict
        assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules: Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.
    Fr   N)
r/   r]   rG   r   r   r   rw   r   r   r   )rP   r   rf   rg   rz   rx   ry   r-   r-   r.   get_state_dict  s"   D
$r   c           	         s   |si S t tt| tjr_tttjttt	f f |}i }|
 D ]8\}}|  D ]/\}}||kr5q,t| |}t|dksDJ dtt| d | fdd|
 D  q,q$|S tttt	f |S )NrU   z/FQNs for a submodule should only have 1 elementr"   c                    s   i | ]	\}} | |qS r-   r-   )rV   Zsubfqnr   rX   r-   r.   
<dictcomp>  s    z/_unflatten_model_state_dict.<locals>.<dictcomp>)r]   r   r   r}   rN   rO   r   r	   rF   r&   r   rq   re   rr   r   )	rP   r   Zcast_state_dictr   	submoduleZsub_state_dictrQ   mrv   r-   rX   r.   _unflatten_model_state_dict  s$   

r   )rg   c                C   s^   t | |}t  t| t d|d}t|i | t| ||W  d   S 1 s(w   Y  dS )a2  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys
    Fri   rg   N)r   r/   rw   r   r   r   )rP   rx   rg   rz   r-   r-   r.   set_model_state_dict  s   
$r   c                C   sr   t  , t|tjjr|fnt|}t| |d|d}ti || t| ||| W d   dS 1 s2w   Y  dS )a  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None
    Tr   N)	r/   r]   rG   r   r   r   rw   r   r   )rP   r   ry   rg   rz   r-   r-   r.   set_optimizer_state_dict   s   "r   c                C   s   t | |}t 2 t|tjjr|fnt|}t| || |d}t||| t	| ||| t
| ||W  d   S 1 s=w   Y  dS )a  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.
    r   N)r   r/   r]   rG   r   r   r   rw   r   r   r   )rP   r   rx   ry   rg   rz   r-   r-   r.   set_state_dictE  s   )

$r   c                   sj   t jt| |dfdd}|| _t jt| |d dtttf f fdd}|| _t	
| t	
| dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rP   rg   c                           S r'   r-   r-   _state_dict_callr-   r.   state_dict_call     z0_patch_model_state_dict.<locals>.state_dict_callr   c                        | d d S )N)rx   r-   r   _load_state_dict_callr-   r.   load_state_dict_call     z5_patch_model_state_dict.<locals>.load_state_dict_callN)rs   rt   r   r   r   r	   rF   r   r   r%   rp   )rP   rg   r   r   r-   r   r   r.   _patch_model_state_dict  s    
r   c                   s   t jt| ||dfdd}t jt| ||d dtttf f fdd}t| t| t	|t
jjr9|fnt|}|D ]}||_||_q?dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rP   r   rg   c                      r   r'   r-   r-   r   r-   r.   r     r   z4_patch_optimizer_state_dict.<locals>.state_dict_callr   c                    r   )N)ry   r-   r   r   r-   r.   r     r   z9_patch_optimizer_state_dict.<locals>.load_state_dict_callN)rs   rt   r   r   r	   rF   r   r%   rp   r]   rG   r   r   r   r   r   )rP   r   rg   r   r   r   r-   r   r.   _patch_optimizer_state_dict  s0   

r   )T)brK   rs   r(   dataclassesr   r   r   	itertoolsr   typingr   r   r   r	   r
   r   r   r   r   r   r   rG   Ztorch.distributeddistributedr{   Ztorch.nnrN   Z'torch.distributed._shard.sharded_tensorr   Ztorch.distributed._tensorr   Z.torch.distributed.checkpoint._state_dict_utilsr   r   Ztorch.distributed.fsdpr   r   r   r`   r   r   r   r   r   Z$torch.distributed.fsdp._common_utilsr   r   Ztorch.nn.modules.moduler   Ztorch.nn.parallelr    r^   ra   r   Z	PG_PREFIXr|   ZSTATE_PREFIXr   rF   rI   rJ   r%   r;   rH   intfloatZPrimitiveTyper&   r   r   ZOptimizerStateTypecontextmanagerr/   r0   r<   rO   r:   re   r   r   rw   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r-   r-   r-   r.   <module>   s  
 4(



&2
S

$1

J


5
6


'
/
W$

  
,
- 	
=6