import torch
from torch import Tensor

from .optimizer import (
    Optimizer,
    _use_grad_for_differentiable,
    _get_value,
    _stack_if_compiling,
    _default_to_fused_or_foreach,
    _differentiable_doc,
    _maximize_doc,
    _foreach_doc,
    _view_as_real,
)
from typing import List, Optional

__all__ = ["Adamax", "adamax"]


class Adamax(Optimizer):
    def __init__(
        self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
        foreach: Optional[bool] = None, *, maximize: bool = False,
        differentiable: bool = False,
    ):
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

        defaults = dict(
            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
            foreach=foreach, maximize=maximize, differentiable=differentiable,
        )
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault("foreach", None)
            group.setdefault("maximize", False)
            group.setdefault("differentiable", False)
        state_values = list(self.state.values())
        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]["step"])
        if not step_is_tensor:
            for s in state_values:
                s["step"] = torch.tensor(float(s["step"]), dtype=torch.float32)

    def _init_group(self, group, params_with_grad, grads, exp_avgs, exp_infs, state_steps):
        has_complex = False
        for p in group["params"]:
            if p.grad is None:
                continue
            has_complex |= torch.is_complex(p)
            params_with_grad.append(p)
            if p.grad.is_sparse:
                raise RuntimeError("Adamax does not support sparse gradients")
            grads.append(p.grad)

            state = self.state[p]
            # Lazy state initialization
            if len(state) == 0:
                state["step"] = torch.tensor(0.0, dtype=torch.float32)
                state["exp_avg"] = torch.zeros_like(p, memory_format=torch.preserve_format)
                state["exp_inf"] = torch.zeros_like(p, memory_format=torch.preserve_format)

            exp_avgs.append(state["exp_avg"])
            exp_infs.append(state["exp_inf"])
            state_steps.append(state["step"])
        return has_complex

    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            exp_avgs = []
            exp_infs = []
            state_steps = []

            beta1, beta2 = group["betas"]
            eps = group["eps"]
            lr = group["lr"]
            weight_decay = group["weight_decay"]
            foreach = group["foreach"]
            maximize = group["maximize"]
            differentiable = group["differentiable"]

            has_complex = self._init_group(
                group, params_with_grad, grads, exp_avgs, exp_infs, state_steps
            )

            adamax(
                params_with_grad, grads, exp_avgs, exp_infs, state_steps,
                eps=eps, beta1=beta1, beta2=beta2, lr=lr, weight_decay=weight_decay,
                foreach=foreach, maximize=maximize, differentiable=differentiable,
                has_complex=has_complex,
            )

        return loss


Adamax.__doc__ = r"""Implements Adamax algorithm (a variant of Adam based on infinity norm).

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \beta_1, \beta_2
                \text{ (betas)},\theta_0 \text{ (params)},f(\theta) \text{ (objective)},
                \: \lambda \text{ (weight decay)},                                                \\
            &\hspace{13mm}    \epsilon \text{ (epsilon)}                                          \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                u_0 \leftarrow 0 \text{ ( infinity norm)}                                 \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm}if \: \lambda \neq 0                                                    \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm}m_t      \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t               \\
            &\hspace{5mm}u_t      \leftarrow   \mathrm{max}(\beta_2 u_{t-1}, |g_{t}|+\epsilon)   \\
            &\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \frac{\gamma m_t}{(1-\beta^t_1) u_t} \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.
    """ + rf"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            the running average of the gradient and the exponentially weighted
            infinity norm (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        {_foreach_doc}
        {_maximize_doc}
        {_differentiable_doc}

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
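
    Example (illustrative sketch; ``model``, ``loss_fn``, ``input`` and ``target``
    are assumed to be defined elsewhere)::

        >>> optimizer = torch.optim.Adamax(model.parameters(), lr=2e-3)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()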

    """


def adamax(
    params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor],
    exp_infs: List[Tensor], state_steps: List[Tensor],
    # Defaults are kept on positional arguments here because TorchScript does not
    # support keyword-only arguments with defaults.
    foreach: Optional[bool] = None, maximize: bool = False,
    differentiable: bool = False, has_complex: bool = False, *,
    eps: float, beta1: float, beta2: float, lr: float, weight_decay: float,
):
    r"""Functional API that performs adamax algorithm computation.

    See :class:`~torch.optim.Adamax` for details.
    """

    if not all(isinstance(t, torch.Tensor) for t in state_steps):
        raise RuntimeError(
            "API has changed, `state_steps` argument must contain a list of singleton tensors"
        )

    if foreach is None:
        _, foreach = _default_to_fused_or_foreach(params, differentiable, use_fused=False)

    if foreach and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with foreach optimizers")

    if foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adamax
    else:
        func = _single_tensor_adamax

    func(
        params, grads, exp_avgs, exp_infs, state_steps,
        eps=eps, beta1=beta1, beta2=beta2, lr=lr, weight_decay=weight_decay,
        maximize=maximize, differentiable=differentiable, has_complex=has_complex,
    )
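
# A minimal sketch (not from the original source) of driving the functional entry
# point directly with externally managed state; the tensors and hyperparameters
# below are arbitrary illustrative values:
#
#   p = torch.zeros(3)
#   grads = [torch.full_like(p, 0.5)]
#   exp_avgs, exp_infs = [torch.zeros_like(p)], [torch.zeros_like(p)]
#   state_steps = [torch.tensor(0.0)]
#   adamax([p], grads, exp_avgs, exp_infs, state_steps,
#          eps=1e-8, beta1=0.9, beta2=0.999, lr=2e-3, weight_decay=0.0)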
s|n| }|| }|| }|| }|d7 }|	dkr.|j||	d}t|rGt|}t|}t|}t|}||d|  t||d|	 
|dgd}|sqtj|dd|d n|tj|ddd d|t|  }|| }|j||| d qd S )Nr   r   alphaFkeepdimout)rZ   )value)	enumerateaddr1   r:   Zview_as_realZlerp_catZmul_	unsqueezeabsadd_
unsqueeze_ZamaxZcopy_r   Zaddcdiv_)r"   r>   r?   r@   rA   r   rE   rF   r   r   r   r   rB   iparamr9   r7   r8   Zstep_tnorm_bufbias_correctionclrr&   r&   r'   rT      s2   




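
# Worked first step of the update above with illustrative numbers (beta1=0.9,
# beta2=0.999, eps=1e-8, lr=2e-3, gradient g=0.5, exp_avg=exp_inf=0 initially):
#
#   exp_avg <- 0.9*0 + 0.1*0.5             = 0.05
#   exp_inf <- max(0.999*0, |0.5| + 1e-8)  = 0.50000001
#   bias_correction = 1 - 0.9**1           = 0.1
#   clr = 2e-3 / 0.1                       = 0.02
#   param <- param - 0.02 * 0.05 / 0.50000001   # a step of ~2e-3: on the first
#                                               # step the effective step size is
#                                               # roughly lr regardless of |g|.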


def _multi_tensor_adamax(
    params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor],
    exp_infs: List[Tensor], state_steps: List[Tensor], *,
    eps: float, beta1: float, beta2: float, lr: float, weight_decay: float,
    maximize: bool, differentiable: bool, has_complex: bool,
):
    assert not differentiable, "_foreach ops don't support autograd"

    if len(params) == 0:
        return

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, exp_avgs, exp_infs, state_steps]
    )
    for (
        grouped_params, grouped_grads, grouped_exp_avgs,
        grouped_exp_infs, grouped_state_steps,
    ), _ in grouped_tensors.values():
        if maximize:
            grouped_grads = torch._foreach_neg(grouped_grads)

        if has_complex:
            _view_as_real(grouped_params, grouped_grads, grouped_exp_avgs, grouped_exp_infs)

        # Update steps. Wrapping the scalar increment in a CPU tensor once avoids
        # re-boxing 1 into a Tensor on every iteration of the foreach CPU fallback.
        if grouped_state_steps[0].is_cpu:
            torch._foreach_add_(
                grouped_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
            )
        else:
            torch._foreach_add_(grouped_state_steps, 1)

        if weight_decay != 0:
            if maximize:
                # Re-use the intermediate already allocated for maximize.
                torch._foreach_add_(grouped_grads, grouped_params, alpha=weight_decay)
            else:
                grouped_grads = torch._foreach_add(
                    grouped_grads, grouped_params, alpha=weight_decay
                )

        # Update biased first moment estimate.
        torch._foreach_lerp_(grouped_exp_avgs, grouped_grads, 1 - beta1)

        # Update the exponentially weighted infinity norm.
        torch._foreach_mul_(grouped_exp_infs, beta2)
        for exp_inf, grad in zip(grouped_exp_infs, grouped_grads):
            norm_buf = torch.cat(
                [exp_inf.unsqueeze(0), grad.abs().add_(eps).unsqueeze_(0)], 0
            )
            torch.max(norm_buf, 0, keepdim=False, out=(exp_inf, exp_inf.new().long()))

        bias_corrections = [1 - beta1 ** _get_value(step) for step in grouped_state_steps]
        clr = _stack_if_compiling([-1 * (lr / bc) for bc in bias_corrections])

        torch._foreach_addcdiv_(grouped_params, grouped_exp_avgs, grouped_exp_infs, clr)