   @   s0  d dl Z d dlZd dlmZmZ d dlZd dlZd dlm  m	Z
 d dlmZmZmZmZmZmZmZ ddlmZ e  e¡Z				d>dejd	ejd
ejdeej fdd„Zd?dedefdd„Zd?dedefdd„Z	d?dejdedefdd„Zd?defdd„Zd?dedefdd„Z d?dedefdd„Z!d?dedefdd„Z"d?dedefd d!„Z#d?dedefd"d#„Z$d$d%„ Z%d&ejdeeje&e&f fd'd(„Z'd)ejfd*d+„Z(d)ejd,e&d-e&d.e&dejf
d/d0„Z)d1d2„ Z*d)ejd3e&d4edejfd5d6„Z+d7d8„ Z,d9ejfd:d;„Z-				d>dejd	ejd
ejdeej fd<d=„Z.dS )@é    N)ÚOptionalÚTuple)Úcan_use_efficient_attentionÚcan_use_flash_attentionÚflash_sdp_enabledÚmath_sdp_enabledÚmem_efficient_sdp_enabledÚ
SDPAParamsÚ
SDPBackendé   )ÚNestedTensorç        FÚqueryÚkeyÚvalueÚ	attn_maskc                 C   sN  t | tƒrt |tƒrt |tƒs td| j› d|j› d|j› dƒ‚| j|jks,| j|jkr=td| j› d|j› d|j› dƒ‚| j|jksI| j|jkrZtd| j› d	|j› d
|j› dƒ‚|  ¡ dk sl| ¡ dk sl| ¡ dk r€td|  ¡ › d| ¡ › d| ¡ › dƒ‚| j|jksŒ| j|jkrtd| j› d|j› d|j› dƒ‚|d ur¥tdƒ‚d S )NzNExpected query, key, and value to be nested tensors, but got query.is_nested: z, key.is_nested: z, and value.is_nested: z	 instead.zLExpected query, key, and value to have the same dtype, but got query.dtype: z, key.dtype: z, and value.dtype: zSExpected query, key, and value to have the same device type, but got query.device: z, key.device: z, and value.device: é   zUExpected query, key, and value to all be  at least 2 dimensional, but got query.dim: z, key.dim: z and value.dim: z[Expected query, key, and value to all be ragged on the same dimension, but got ragged dims z, z, and z, respectively.zMasks are not yet supported!)
Ú
isinstancer   Ú
ValueErrorÚ	is_nestedÚdtypeÚdeviceÚdimÚ_ragged_idxÚtorchÚbool)r   r   r   r   Ú	dropout_pÚ	is_causalÚscale© r   úFC:\wamp64\www\opt\env\Lib\site-packages\torch/nested/_internal/sdpa.pyÚ_validate_sdpa_input   st   
ÿþýÿÿþÿÿÿþÿÿÿþÿ$ÿÿÿÿÿÿÿÿþr!   ÚparamsÚreturnc                 C   s4   | j  d¡}| j d¡}| j d¡}||ko||kS )Nr   )r   Úsizer   r   )r"   ÚdebugÚq_batch_sizeÚk_batch_sizeÚv_batch_sizer   r   r    Ú_check_batch_size_nestedJ   s   r)   c                 C   sl   d}| j  d¡}| j d¡}| j d¡}||ko||k}|r(|d dkr(||ks4|r2t d|||¡ dS dS )Né   éÿÿÿÿé   r   zÖFor NestedTensor inputs, Flash attention requires q,k,v to have the same last dimension and to be a multiple of 8 and less than or equal to 256. Got Query.size(-1): %d, Key.size(-1): %d, Value.size(-1): %d instead.FT)r   r$   r   r   ÚlogÚwarning)r"   r%   Úmax_sizeZquery_size_lastZkey_size_lastZvalue_size_lastZsame_head_dim_sizer   r   r    Ú!_check_head_dim_size_flash_nestedW   s&   ÿÿúr0   ÚparamÚ
param_namec                 C   sR   t | tƒs	J dƒ‚| jdkr|rt d|¡ dS | jdkr'|r%t d|¡ dS dS )Nzparam should be a jagged NTr   zMFused kernels do not support ragged num_head_dims, %s has a ragged num_heads.Fr   zAFused kernels do not support seq_len == 0, %s has a seq len of 0.T)r   r   r   r-   r.   Ú_min_seqlen)r1   r2   r%   r   r   r    Ú:_check_for_seq_len_0_and_consistent_head_dim_nested_helperq   s    
þ
þr4   c              
   C   s`   t | ||ƒ}| |kr| dks||kr|dks||kr.|dkr.|r,t d||| ||||¡ dS dS )Nr   zzBoth fused kernels require query, key and value to have broadcastable %s, got Query %s %d, Key %s %d, Value %s %d instead.FT)Úmaxr-   r.   )Zq_sizeZk_sizeZv_sizer2   r%   r/   r   r   r    Ú_try_broadcast_param_size‹   s"   ÷r6   c           	      C   sÜ   | j jrt| j d|ƒnd}|sdS | jjrt| jd|ƒnd}|s"dS | jjr-t| jd|ƒnd}|s3dS | j  d¡}| j d¡}| j d¡}||koL||k}|sl| j js[| jjs[| jjrd|rbt d¡ dS t	|||d|ƒS dS )	Nr   TFr   r   r   zFBoth fused kernels do not support training with broadcasted NT inputs.z	num heads)
r   r   r4   r   r   r$   Úrequires_gradr-   r.   r6   )	r"   r%   Z	q_is_safeZ	k_is_safeZ	v_is_safeÚq_num_headsÚk_num_headsÚv_num_headsZsame_num_headsr   r   r    Ú_check_for_seq_len_0_nested¢   sX   ýÿûýÿûýÿûÿþýÿ
ÿr;   c                 C   s.   | j js| jjs| jjr|rt d¡ dS dS )NzMMemory efficient attention currently doesn't support training with NT inputs.FT)r   r7   r   r   r-   r.   ©r"   r%   r   r   r    Ú_check_requires_grad_nestedÝ   s   ÿþýÿr=   c                 C   ó(   t ttf}|D ]
}|| |ƒs dS qdS ©NFT)r)   r0   r;   ©r"   r%   ÚconstraintsÚ
constraintr   r   r    Ú_can_use_flash_sdpa_jaggedì   ó   ý
ÿrC   c                 C   r>   r?   )r=   r)   r;   r@   r   r   r    Ú_can_use_efficient_sdpa_jaggedø   rD   rE   c                 C   sd   | j  dd¡ ¡ r| j dd¡ ¡ r| j dd¡ ¡ s$|r"t d¡ dS | jr0|r.t d¡ dS dS )Nr   r   zGIf inputs are nested tensors they must be contiguous after transposing.FzENested tensors for query / key are not supported when is_causal=True.T)r   Ú	transposeÚis_contiguousr   r   r-   r.   r   r<   r   r   r    Ú_can_use_math_sdpa_jagged  s$   ÿþýÿÿrH   c           	      C   s  t ƒ stƒ stƒ stjS tjtjtjf}t| |||||ƒ}|D ]7}|tjkr3t	|ƒr3t
|ƒr3tj  S |tjkrEt|ƒrEt|ƒrEtj  S |tjkrVtƒ rVt|ƒrVtj  S qt d¡ t|dd t|dd t d¡ t	|dd t
|dd t d¡ t|dd tjS )Nz)Memory efficient kernel not used because:T)r%   z(Flash attention kernel not used because:z'Math attention kernel not used because:)r   r   r   r
   ÚERRORÚFLASH_ATTENTIONÚEFFICIENT_ATTENTIONÚMATHr	   r   rC   r   rE   rH   r-   r.   )	r   r   r   r   Zdropoutr   Zorderingr"   Úbackendr   r   r    Ú_select_sdp_backend  sD   ÿþýý



ÿ


€


rN   Úqkvc                 C   s   t | tƒs	tdƒ‚|  ¡ d u r%|  ¡ jtj| jd}| j	}|  
¡ jd }n|  ¡  d¡jtj| jd}|  d¡}| j	}t|d  ¡ ƒ}|||fS )Nz<QKV must be nested for flash cumulative_seq_len calculation.)r   r   r   r+   )r   r   r   ÚlengthsÚoffsetsÚtor   Zint32r   Ú_max_seqlenÚvaluesÚshapeZcumsumr$   ÚintÚitem)rO   Zcumulative_seqlenZ
max_seqlenZn_elemZ
batch_sizer   r   r    Ú_cumulative_and_max_seq_len_nnz@  s   
ÿ

rX   Útensorc                 C   sf   t | tƒsJ ‚|  ¡ }| j}| d¡d }|dkrdS |d }|dd … D ]}||kr. dS |}q%dS )Nr   r   Tr   F)r   r   rQ   Z_strider$   )rY   rQ   ÚstridesZ	n_tensorsZprev_strideZstrider   r   r    Ú!_is_safe_to_get_storage_as_tensor\  s   
r[   ÚNnzÚ	num_headsÚhead_dimc                 C   s   | j r|  ¡ S |  |||¡S )N)r   rT   Úview)rY   r\   r]   r^   r   r   r    Ú_view_as_dense{  s   r`   c                 C   s\  |   d¡}|  d¡}|  d¡}|   d¡}|  d¡}|  d¡}||kr.||kr.||kr.||ks2tdƒ‚|   d¡}	|   d¡}
|  d¡}|  dd¡}| dd¡}| dd¡}t|ƒ\}}}t|ƒ\}}}| ¡ smt|ƒsm| ¡ }| ¡ syt|ƒsy| ¡ }| ¡ s…t|ƒs…| ¡ }t|||	|
ƒ}t|||	|
ƒ}t|||	|ƒ}| ¡ |j	|j
dœ}||||||||fS )Nr   r   z<This path is currently not implemented for jagged layout NT.é   r   )rQ   rS   r3   )r$   ÚRuntimeErrorrF   rX   rG   r[   Ú
contiguousr`   rQ   rS   r3   )r   r   r   r&   r'   r(   r8   r9   r:   r]   Zhead_dim_qkZ
head_dim_vZq_tZk_tZv_tÚcumulative_sequence_length_qÚmax_seqlen_batch_qZNnz_qÚcumulative_sequence_length_kvÚmax_seqlen_batch_kvZNnz_kvÚquery_buffer_reshapedÚkey_buffer_reshapedÚvalue_buffer_reshapedÚoutput_nt_infor   r   r    Ú_sdpa_nested_preprocessing  sb   





ÿ


üü	ýørl   Úalignment_sizeÚslicec                 C   sR   |   d¡}|| dkr| S |||  }tjj | d|g¡} |r'| dd|…f S | S )Nr+   r   .)r$   r   ÚnnÚ
functionalÚpad)rY   rm   rn   Zlast_dim_sizeZ	pad_countr   r   r    Ú_pad_last_dimZ  s   
rr   c                 C   s(   |d ur|}|S t  d|  d¡ ¡}|S )Ng      ð?r+   )ÚmathÚsqrtr$   )r   r   Zsoftmax_scaler   r   r    Ú_calculate_scalem  s   ÿru   Úoutc                 C   s(   | j s|  d¡|kr| dd|…f } | S )Nr+   .r   )r   r$   )rv   Úog_sizer   r   r    Ú_post_process_flash_outputr  s   rx   c           %      C   s@  t | ||||||ƒ t| tƒrt|tƒrt|tƒsJ ‚|  ¡ dkrY| ¡ dkrY| ¡ dkrY| jdkrYddlm} tj| j	|j	|j	t|tƒrH|j	n||||d}t|fi || ƒ¤ŽS | j
pa|j
pa|j
}	t| |||||ƒ}
|
tjkrÂ|  d¡}t| ddƒ}t|ddƒ}t|ddƒ}t| |ƒ}t|||ƒ\}}}}}}}}tjjj|||||||||d|d	\}}}}}t|fi |¤Ž dd
¡}t||ƒS |
tjkrt| ||ƒ\}}}}}}} }tjjj| d¡| d¡| d¡d ||||t|ƒ|	|d	\}}!}"}#}$}t| d¡fi |¤Ž dd
¡S |
tjkrtj| ||||||d	d S tdƒ‚)Nra   r   r   )Úextract_kwargs)r   r   r   r   r+   r,   F)r   r   z=No viable backend for scaled_dot_product_attention was found.) r!   r   r   r   r   Ztorch.nested._internal.opsry   ÚFZscaled_dot_product_attentionZ_valuesr7   rN   r
   rJ   r$   rr   ru   rl   r   ÚopsZatenZ_flash_attention_forwardrF   rx   rK   Z_efficient_attention_forwardZ	unsqueezerV   ZsqueezerL   Z"_scaled_dot_product_attention_mathrb   )%r   r   r   r   r   r   r   ry   ÚoutputZcompute_logsumexpZbackend_choicerw   Zquery_paddedZ
key_paddedZvalue_paddedZog_scalerh   ri   rj   rd   rf   re   rg   rk   Z	attentionZ	logsumexpZphilox_seedZphilox_offsetZdebug_attn_maskZquery_reshapedZkey_reshapedZvalue_reshapedÚ_Z
log_sumexpÚseedÚoffsetZmax_seqlen_qr   r   r    Ú#jagged_scaled_dot_product_attentionx  sÊ   	ÿþý.	÷ÿ




÷	õú


÷	õùÿþÿr€   )Nr   FN)F)/Úloggingrs   Útypingr   r   r   Ztorch.nnZtorch.nn.functionalro   rp   rz   Ztorch.backends.cudar   r   r   r   r   r	   r
   Znested_tensorr   Ú	getLoggerÚ__name__r-   ZTensorr!   r   r)   r0   Ústrr4   r6   r;   r=   rC   rE   rH   rN   rV   rX   r[   r`   rl   rr   ru   rx   r€   r   r   r   r    Ú<module>   sœ    $

ùÿþý
ü3ÿÿÿ
þ; (ÿÿÿÿ
þ Iÿÿÿ
þ
ùÿþýü