o
    I&iF                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ d dlm%Z% d dl&Z&d dl'm(Z(m)Z) d dl*Z*d dl+m,Z, d dl-m.Z. d d	l/m0Z0 d d
l1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7 e	8e9Z:e"dZ;ee&j<e&j<f Z=ddddZ>dd Z?e@ddddZAdd ZBdd d!ZCd"d# ZDd$d% ZEdd)d*ZFdd.d/ZGdd2d3ZHdd7d8ZId d;d<ZJdd?d@ZKdAdB ZLdCdD ZMdddGdHZN	EdddLdMZO	EdddQdRZPddVdWZQddZd[ZRd	d\d]ZSd^d_ ZTd`da ZUe)dbZVe"dcdddeZWG dfdg dgeeVeWf e ZXd
djdkZYdldm ZZdndo Z[dpdq Z\	dddudvZ]dwdx Z^dd{d|Z_dddZ`dddZadddZbdddZcdd Zdejedg def ef ef ef gdZge@ddddZhejidddZjdddZke@ddd ZlG dd deZmG dd dZnG dd dZoe@ddd ZpdddZqdddZrdddZsddddZtdd Zudd ZvG dd dZwdd Zxdd Zyejidd ZzdddZ{dd Z|d ddddńZ}dddȄZ~ddʄ Zdd̄ Zdd΄ ZddЄ ZdddԄZejiddք Zdd؄ ZddZzd dlZW n eyr   dZY nw ddڄ Zdd܄ Zddބ Zdd Zdd Ze@ddd Ze@ddd Zdd Zdd ZdddZdddZdd ZG dd dejZdddZdS (      )annotationsN)StringIO)AnyCallableDictGenericIterableList
NamedTupleOptionalProtocolSetTypeVarUnion
ValuesView)mock)Concatenate	ParamSpec)get_interface_for_device)
DeviceType)	EventList)CeilDivCleanDivFloorDivModularIndexing   )config_T   d   fnCallable[[], Any]returnfloatc                   s  |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]	}|  |   q)|  t j  |	|d }t
dt|| }t
dt|| }	t|D ]}|   qYt jjt jjjgd}
t|	D ]	}|  |   qot j  W d	   n1 sw   Y  td
 t|
 jddd tdd |
 D }t||	 dkrtdt||	t||	  t fddt|D }|  | }td t|jdd tdd |D d |	 }td| |S )aR  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
    g    Acuda)dtypedeviceT)Zenable_timing   r   )Z
activitiesNz
raw eventsZself_cuda_time_total)Zsort_by	row_limitc                 S  s&   g | ]}|j tjkr|jd kr|qS )zContext Sync)Zdevice_typer   CUDAname.0event r/   @C:\wamp64\www\opt\env\Lib\site-packages\torch/_inductor/utils.py
<listcomp>k   s
    z,do_bench_using_profiling.<locals>.<listcomp>r   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %sc                   s    g | ]\}}|  d kr|qS r   r/   )r-   ir.   Znum_event_per_groupr/   r0   r1   z   s
    zprofiling time breakdown)r)   c                 s  s    | ]}|j V  qd S N)Zcuda_time_totalr,   r/   r/   r0   	<genexpr>   s    z+do_bench_using_profiling.<locals>.<genexpr>g     @@zprofiling results: %s ms)torchr$   synchronizeemptyintEventrecordrangeZzero_Zelapsed_timemaxprofilerprofileZProfilerActivityr*   logdebugZkey_averagestabler   eventslenRuntimeError	enumerateZ_build_treesum)r    ZwarmuprepcacheZstart_eventZ	end_event_Zestimate_msZn_warmupZn_repeatpr3   Zfiltered_eventsZactual_eventsresr/   r4   r0   do_bench_using_profiling9   sh   	




rN   c                  O  s>   t d dd }| \}}||vrd||< || i |d S )Nc               
   S  sX   zddl m}  W n ty } ztd|d }~ww | t| jdd ur)dfS dfS )Nr   )do_benchzrequires TritonZ	quantilesZpercentiles)triton.testingrO   ImportErrorNotImplementedErrorinspect	signature
parametersget)triton_do_benchexcr/   r/   r0   load_triton   s   

zdo_bench.<locals>.load_triton)g      ?g?g?r   )	functools	lru_cache)argskwargsrY   rW   Zquantile_field_namer/   r/   r0   rO      s   

rO   boolc                  C  s@   zddl m}  | d uotttjdd dW S  ty   Y dS w )Nr   	roi_alignZtorchvisionr`   F)Ztorchvision.opsr`   hasattrgetattrr7   opsrQ   r_   r/   r/   r0   has_torchvision_roi_align   s   
rd   c                  G  s   t tjdd | D S )Nc                 S  s   g | ]}|r|qS r/   r/   r-   xr/   r/   r0   r1      s    z'conditional_product.<locals>.<listcomp>)rZ   reduceoperatormul)r\   r/   r/   r0   conditional_product      rj   r&   "Union[Optional[torch.device], str]torch.devicec                 C  s`   | d u r
t djS t| trt | } | jdkr.| jd u r.t| j}t j| j|j	 dS | S )Ng        cpu)index)
r7   tensorr&   
isinstancestrtypero   r   ZWorkerZcurrent_devicer&   Zdevice_interfacer/   r/   r0   decode_device   s   


ru   c                 C  s   t tj| tdS )Nr   )rZ   rg   rh   ri   sympyIntegeritr/   r/   r0   sympy_product   s   rz   c                 C  s2   t | t |ks
J ttdd t| |D S )Nc                 s  s    | ]	\}}|| V  qd S r5   r/   )r-   abr/   r/   r0   r6          zsympy_dot.<locals>.<genexpr>)rE   rv   expandrH   zip)Zseq1Zseq2r/   r/   r0   	sympy_dot   s   r   ry   Iterable[_T]ValuesView[_T]c                 C  s   dd | D   S )Nc                 S  s   i | ]}t ||qS r/   )idre   r/   r/   r0   
<dictcomp>   s    zunique.<locals>.<dictcomp>)valuesrx   r/   r/   r0   unique      r   numerUnion[int, sympy.Expr]denomc              	   C  sh   t | tjst |tjrt| |S t | trt |ts.J |  dt|  d| dt| | |   S )Nz: , )rq   rv   Exprr   r:   rs   )r   r   r/   r/   r0   ceildiv   s   
 r   nr:   c                 C  s`   | dksJ d| d8 } | | d? O } | | d? O } | | d? O } | | d? O } | | d? O } | d7 } | S )z9Return the smallest power of 2 greater than or equal to nl        z32-bit onlyr               r/   r   r/   r/   r0   next_power_of_2   s   r   lst"Iterable[Union[int, torch.SymInt]]List[sympy.Expr]c                 C  s   dd | D S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                 S  s*   g | ]}t |tjr|jjnt|qS r/   )rq   r7   ZSymIntnodeexprrv   rw   r-   r3   r/   r/   r0   r1      s    z-convert_shape_to_inductor.<locals>.<listcomp>r/   r   r/   r/   r0   convert_shape_to_inductor   s   r    Iterable[Union[int, sympy.Expr]]List[Union[int, torch.SymInt]]c                   s   ddl m   fdd| D S )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    r   Vc                   sB   g | ]}t |tr|nt |tjrt|n	 jjjj|d dqS )N)hint)rq   r:   rv   rw   graphZsizevarsZ	shape_envZcreate_symintnoder   r   r/   r0   r1     s    

z+convert_shape_to_symint.<locals>.<listcomp>)Zvirtualizedr   r   r/   r   r0   convert_shape_to_symint   s   
r   optorch._ops.OpOverloadc                 C  s(   t | tjjs	J tdd | jjD S )z-
    Does this op overload have aliasing
    c                 s  s    | ]}|j d uV  qd S r5   )Z
alias_info)r-   r{   r/   r/   r0   r6         zis_view.<locals>.<genexpr>)rq   r7   _ops
OpOverloadany_schema	argumentsr   r/   r/   r0   is_view  s   r   c                 C  sh   | j dksdS t| jtjjs| jtju sdS | jtju s"t| jr,t	dd | j
D S tjj| jjv S )Ncall_functionFc                 s  s    | ]}t |V  qd S r5   )is_pointwise_use)r-   ur/   r/   r0   r6         z#is_pointwise_use.<locals>.<genexpr>)r   rq   targetr7   r   r   rh   getitemr   allusersTagZ	pointwisetags)Zuser/   r/   r0   r     s   
r   c           
      C  s   t j }g }g }t|D ] \}}t|t jr(||d|  || q|| qtdd |	 D s;J |
| t||}t| jjdkrZt| jjd jdkrZ|f}|| t ji |}	|	|fS )Nargc                 s  s    | ]
}t |tj V  qd S r5   )rq   r7   Tensorre   r/   r/   r0   r6   .  s    z$gen_gm_and_inputs.<locals>.<genexpr>r   r   r   )r7   ZfxZGraphrG   rq   r   appendplaceholderr   r   r   tuplerE   r   returnsrr   rs   outputZGraphModule)
r   r\   r]   gZg_argsZa_argsr   r   r   gmr/   r/   r0   gen_gm_and_inputs$  s    

r   r$   rr   c                 C  s,   | dkrd S t | }| r|  d S d S )Nrn   )r   Zis_availabler8   rt   r/   r/   r0   r8   ;  s   r8   modelCallable[..., Any]timesc                 C  sT   t | td t }t|D ]
}| | }t | qt }|d us&J || S )Ni9  )r8   r7   Zmanual_seedtimeperf_counterr=   )r   Zexample_inputsr   r&   t0rK   resultt1r/   r/   r0   timedC  s   

r   r/   
         ?c                   sD   t  fddt|D }t | }t|| d |S )Nc                   s   g | ]	}t  qS r/   )r   )r-   rK   r\   r&   r    r   r/   r0   r1   U      z%print_performance.<locals>.<listcomp>z.6f)r7   rp   r=   Zmedianprint)r    r\   r   repeatZbaseliner&   ZtimingsZtookr/   r   r0   print_performanceR  s   "r   objr   methodc                   s$   t | |  t| | fdd dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                     s    S r5   r/   r/   r   r/   r0   <lambda>^  s    z#precompute_method.<locals>.<lambda>N)rb   setattr)r   r   r/   r   r0   precompute_method[  s   r   methods	List[str]c                 C  s   |D ]}t | | qdS )zFReplace methods with new methods that returns a precomputed constants.N)r   )r   r   r   r/   r/   r0   precompute_methodsa  s   r   c                 C  s   t | |kt | |k  S r5   )r:   )r{   r|   r/   r/   r0   cmpg  rk   r   c                 C  s&   t | dkrt| | d g| S | S )Nr   r   )rE   rs   )rf   sizer/   r/   r0   pad_listlikek  s   r   c                 C  s$   t | dkrg S dd }t| |dS )Nr   c                 S  s   t | tr| S |  S r5   )rq   rr   get_name)elemr/   r/   r0   	sort_funcw  s   
ztuple_sorted.<locals>.sort_funckey)rE   sorted)rf   r   r/   r/   r0   tuple_sorteds  s   r   PRVT)	covariantc                   @  s$   e Zd ZedddZdd
dZdS )CachedMethodr"   Nonec                 C     d S r5   r/   selfr/   r/   r0   clear_cache  s   zCachedMethod.clear_cacher\   P.argsr]   P.kwargsr   c                 O  r   r5   r/   )r   r\   r]   r/   r/   r0   __call__  s   zCachedMethod.__call__N)r"   r   )r\   r   r]   r   r"   r   )__name__
__module____qualname__staticmethodr   r   r/   r/   r/   r0   r     s    r   !Callable[Concatenate[Any, P], RV]CachedMethod[P, RV]c                   s<   d j  dt  fdd}fdd}||_|S )N___cachec                   s$   t | st|  |  t| S r5   )ra   r   rb   r   r    r   r/   r0   wrapper  s   

zcache_on_self.<locals>.wrapperc                   s   t |  rt|   d S d S r5   )ra   delattrr   r   r/   r0   r     s   
z"cache_on_self.<locals>.clear_cache)r   rZ   wrapsr   )r    r   r   r/   r   r0   cache_on_self  s   r   c                 C  sJ   ddl m} t| trttjdd | D t S t| |j	r"| j
S t S )Nr   irc                 S  s$   g | ]}t |d r|jr|jjqS )r   )ra   r   origins)r-   r   r/   r/   r0   r1     s    z%aggregate_origins.<locals>.<listcomp>) r   rq   listrZ   rg   rh   or_setZExternKernelr   )node_scheduler   r/   r/   r0   aggregate_origins  s   
	r  c                 C  s   t | }|dkrdd |D }tt|}nH|dkrPg }|D ]*}|jdkrHd|jv rH|jd d }t|d tr@||d  q||d j qtt|}n|d	kr\d
d |D }nt	|}d
dg| S )Noriginal_atenc                 S  s.   g | ]}|j d krd|jv r|jd jjqS )r   r  )r   meta_overloadpacketr   r-   originr/   r/   r0   r1     s
    z)get_fused_kernel_name.<locals>.<listcomp>r7   r   Zsource_fn_stackr(   r   Zinductor_nodec                 S  s   g | ]
}|j d kr|jqS r   )r   r+   r
  r/   r/   r0   r1     s    rK   Zfused)r  r   r  r   r  rq   rr   r   r   rR   join)r  Zdescriptive_namesall_originssourcesr  Z	source_fnr/   r/   r0   get_fused_kernel_name  s.   r  c                 C  s  t | }dd |D }tt}tt}|D ]-}d|jv r.t|jd j}|| |j d|jv rD|jd d d }|| |j q|j	 dd
t|  dd
t|  d	}g }	t| D ]\}
}|	|j	 d
|
 dd
t|  qg|d
|	fS )Nc                 S  s   g | ]	}|j d kr|qS r  r   r
  r/   r/   r0   r1     r   z'get_kernel_metadata.<locals>.<listcomp>r  Z	from_noder   z Source Nodes: [r   z], Original ATen: [] z => 
)r  collectionsdefaultdictr  r  rr   r	  r   r+   commentr  r   keysitems)r  r   r  Zinductor_nodesZfrom_node_dictZoriginal_aten_dictr   r   metadataZdetailed_metadataZoriginal_nodenodesr/   r/   r0   get_kernel_metadata  s,   



r  initial_queueIterable[torch.fx.Node]Set[torch.fx.Node]c                 C  sZ   t | } t| }| r+|  }|jD ]}|r||rq||vr(|| | | q| s
|S )zJReturns the set of nodes whose values depend on those within initial_queue)r  r  popr   addr   )r  Zskip_filterZdominated_setr   userr/   r/   r0   dominated_nodes  s   


	r"  c                   sb   dd l }ddlm   fddfdd| D }fdd| D }t|jg ||R  S )	Nr   r   r   c                   sD   t |  jr| jS t |  jr| jS t |  jo!t |  jS r5   )rq   	TensorBoxdata
StorageBoxIRNodeZ	Pointwiser   r   is_unrealized_noder/   r0   r(    s
   

z*gather_origins.<locals>.is_unrealized_nodec                      g | ]	} |r|j qS r/   r   )r-   valr(  r/   r0   r1     r   z"gather_origins.<locals>.<listcomp>c                   r)  r/   r*  )r-   r   r,  r/   r0   r1     r   )	itertoolsr  r   r   r  chain)r\   r]   r-  Zkwarg_originsZarg_originsr/   r'  r0   gather_origins   s   r/  r   
sympy.Exprc                 C  s   t | tjr	| jS t | tjrdtt| jS t | tj	r'dtt| jS t | t
ttfr@| jj ddtt| j dS t| S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    z + z * (r   ))rq   rv   Symbolr+   Addr  map	sympy_strr\   ZMulr   r   r   funcr   rr   )r   r/   r/   r0   r6    s   "r6  r+   sympy.Symbolc                 C  s    | d dksJ t j| dddS )Nr   sT)integerZnonnegative)rv   r3  r+   r/   r/   r0   sympy_symbol#  s   r<  replacementsDict[Any, Any]c                   s*   dd  t |  fdd| D S )z=
    xreplace is faster than subs, but is way more picky
    c                 S  s   t | tr	t| S | S r5   )rq   rr   r<  r   r/   r/   r0   promote_strings1  s   
z#sympy_subs.<locals>.promote_stringsc                   s   i | ]\}} | |qS r/   r/   )r-   kvr?  r/   r0   r   7  s    zsympy_subs.<locals>.<dictcomp>)rv   ZsympifyZxreplacer  )r   r=  r/   rB  r0   
sympy_subs,  s   
rC  ro   prefixc                      t  fdd| jD S )Nc                 3  s    | ]	}|j  V  qd S r5   )r+   
startswithr-   rA  rD  r/   r0   r6   <  r}   z)free_symbol_startswith.<locals>.<genexpr>r   Zfree_symbols)ro   rD  r/   rH  r0   free_symbol_startswith;  rk   rJ  patternc                   rE  )Nc                 3  s    | ]} |j v V  qd S r5   r;  rG  rK  r/   r0   r6   @  r   z"free_symbol_has.<locals>.<genexpr>rI  )ro   rK  r/   rL  r0   free_symbol_has?  rk   rM  c                 C  sD   h d}t  r|h d | jjD ]}t|j|v r dS qdS )N>   Zrun_with_rng_statez7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultZrun_and_save_rng_statez%fbgemm.jagged_to_padded_dense.defaultzaten.multinomial.defaultzaten._local_scalar_densez,aten._fused_moving_avg_obs_fq_helper.default>   zaten.index_put_.defaultzaten.scatter.reducezaten._unsafe_index_put.defaultzaten.scatter_add.defaultzaten.scatter_reduce.twozaten.index_put.defaultzaten.scatter_add_zaten.scatter_reduce.two_outzaten.scatter_reduce_.twozaten.scatter.value_reducezaten.scatter.srcTF)r7   Z$are_deterministic_algorithms_enabledupdater   r  rr   r   )r   Zforbidden_setr   r/   r/   r0   has_incompatible_cudagraph_opsC  s   
rO  instance_descriptor)Zdivisible_by_16Z
equal_to_1Zids_of_folded_argsZdivisible_by_8)defaultsc                  C  sN   t jd} | d u rtddt }t jt	
 d| } t j| dd | S )NTORCHINDUCTOR_CACHE_DIRz[\\/:*?"<>|]rK   Ztorchinductor_T)exist_ok)osenvironrV   resubgetpassgetuserpathr  tempfile
gettempdirmakedirs)	cache_dirZsanitized_usernamer/   r/   r0   r^  k  s   r^  c              
   #  s   t  u}tjtjd|iR tj|d tjtjd i1 dV  t	| trLt
| dks5J dtj rLt }|  fdd|D  W d   n1 sVw   Y  W d   n1 sew   Y  W d   dS W d   dS 1 s}w   Y  dS )	z
    Contextmanager that provides a clean tmp cachedir for inductor.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    rR  ZtritonZTRITON_CACHE_DIRNr   z!expected empty cache_entries dictc              	     s,   i | ]}d |vr|t jt j |qS )z.lock)rT  rZ  getsizer  )r-   fZtriton_cache_dirr/   r0   r     s
    z(fresh_inductor_cache.<locals>.<dictcomp>)r[  TemporaryDirectoryr   patchdictrT  rU  rZ  r  rq   rE   existslistdirrN  )Zcache_entriesZinductor_cache_dirfilesr/   ra  r0   fresh_inductor_cachex  s0   




"rh  	List[int]c                 C  s(   | j }tt| }ttt||ddS )NT)r   reverse)__getitem__r=   rE   r  reversedr   )seqgetterZa_rr/   r/   r0   argsort  s   ro  r   c                 C  s   t jd| d S )Nr/   r%   )r7   r9   element_sizerp  r/   r/   r0   get_dtype_size  s   rr  c                   @  s   e Zd ZU ded< dS )LineContextr   contextN)r   r   r   __annotations__r/   r/   r/   r0   rs    s   
 rs  c                   @  s|   e Zd ZdZd!ddZd"ddZd#d
dZd#ddZdd Zdd Z	dd Z
dd Zdd Zdd Zd$ddZd%ddZd S )&IndentedBufferr   r   c                 C  s   g | _ || _d S r5   )_lines_indent)r   initial_indentr/   r/   r0   __init__  s   
zIndentedBuffer.__init__r"   )tuple[str, list[tuple[int, LineContext]]]c                 C  s   t  }d}g }| jD ]8}t|tr| }|d u rq
nt|tr(|||jf q
t|ts/J || |d |d|	d 7 }q
|
 |fS )Nr   r  )r   rw  rq   DeferredLineBasers  r   rt  rr   writecountgetvalue)r   bufrL   Zlinemapliner/   r/   r0   getvaluewithlinemap  s"   




z"IndentedBuffer.getvaluewithlinemaprr   c                 C  s   |   \}}|S r5   )r  )r   rA  rK   r/   r/   r0   r    s   zIndentedBuffer.getvaluec                 C  s   t  }| jD ]6}t|tr| }|d u rqnt|trqt|ts#J |dr2||d d  q|| |d q| S )N\r(   r  )	r   rw  rq   r|  rs  rr   endswithr}  r  )r   r  r  r/   r/   r0   getrawvalue  s   




zIndentedBuffer.getrawvaluec                 C  s   | j   d S r5   )rw  clearr   r/   r/   r0   r       zIndentedBuffer.clearc                 C  
   t | jS r5   )r^   rw  r   r/   r/   r0   __bool__     
zIndentedBuffer.__bool__c                 C  s   d| j | j  S )Nr  )rx  tabwidthr   r/   r/   r0   rD       zIndentedBuffer.prefixc                 C  s   |  d d S )Nr  	writeliner   r/   r/   r0   newline  r  zIndentedBuffer.newlinec                 C  sr   t |tr| j| d S t |tr| j||   d S | r1| j|   |  d S | jd d S Nr  )rq   rs  rw  r   r|  with_prefixrD  stripr   r  r/   r/   r0   r    s   

zIndentedBuffer.writelinec                 C  s   |D ]}|  | qd S r5   r  )r   linesr  r/   r/   r0   
writelines  s   zIndentedBuffer.writelinesr   c                   s   t j fdd}| S )Nc                	   3  s<     j  7  _ zd V  W  j  8  _ d S  j  8  _ w r5   )rx  r/   offsetr   r/   r0   ctx  s
   "z"IndentedBuffer.indent.<locals>.ctx)
contextlibcontextmanager)r   r  r  r/   r  r0   indent  s   zIndentedBuffer.indentFc                 C  s   t |trJtd}|jD ]}t |ts"|r"t|t|t|  }qt	|r*d}|jD ]}t |tr;| j
| q-t| |t|d   q-d S t|}|rU| }|sYd S | }|dD ]}| | qbd S )Ninfr   r  )rq   rv  r#   rw  rs  minrE   lstripmathisinfr   r  r:   textwrapdedentrstripsplit)r   Z
other_coder  r  r  r/   r/   r0   splice  s,   





zIndentedBuffer.spliceNr2   )r"   r{  r"   rr   )r   )F)r   r   r   r  rz  r  r  r  r  r  rD  r  r  r  r  r  r/   r/   r/   r0   rv    s    





rv  c                   @  sT   e Zd ZdZdd ZdddZdd
dZdd Zdd Zdd Z	dd Z
dd ZdS )r|  z.A line that can be 'unwritten' at a later timec                 C  s   |  sd}|| _d S r  )r  r  r  r/   r/   r0   rz    s   
zDeferredLineBase.__init__r"   Optional[str]c                 C     t  )zJReturns either self.line or None to indicate the line has been 'unwritten'rR   r   r/   r/   r0   r        zDeferredLineBase.__call__r  rr   c                 C  r  )z3Returns a new deferred line with the same conditionr  r  r/   r/   r0   	_new_line  r  zDeferredLineBase._new_linec                 C  s   |  | | j S r5   r  r  )r   rD  r/   r/   r0   r     s   zDeferredLineBase.with_prefixc                 C  s   |  | j S r5   )r  r  r  r   r/   r/   r0   r  #  r  zDeferredLineBase.lstripc                 C  s   |  | j| S r5   r  )r   ro   r/   r/   r0   rk  &  r  zDeferredLineBase.__getitem__c                 C  r  r5   )r^   r  r   r/   r/   r0   r  )  r  zDeferredLineBase.__bool__c                 C  r  r5   )rE   r  r   r/   r/   r0   __len__,  r  zDeferredLineBase.__len__N)r"   r  )r  rr   r"   r|  )r   r   r   __doc__rz  r   r  r  r  rk  r  r  r/   r/   r/   r0   r|    s    

r|  c                 C  s(   t j| j}|dk rtd dS dS )NP   z,not enough SMs to use max_autotune_gemm modeFT)r7   r$   Zget_device_propertiesZmulti_processor_countrA   warning)ro   Zsmsr/   r/   r0   
is_big_gpu0  s
   
r  c                   C  s   t jpt jpt jS r5   )r   Zmax_autotuneZmax_autotune_gemmZsearch_autotune_cacher/   r/   r/   r0   use_max_autotune9  s   r  allowed_layout_dtypesList[torch.dtype]c                 C  s,   t  o| jjdko| j|v ot| jjpdS )Nr$   r   )r  r&   rs   r%   r  ro   )layoutr  r/   r/   r0   _use_template_for_cuda?  s   
r  backendc                 C  s"   |   dd tj  dD v S )Nc                 S  s   g | ]}|  qS r/   )r  re   r/   r/   r0   r1   I  s    z)_use_autotune_backend.<locals>.<listcomp>,)upperr   Zmax_autotune_gemm_backendsr  )r  r/   r/   r0   _use_autotune_backendH  s   r  F)enable_int32c                C  s:   t jt jt jg}|rt jt jt jt jg}t| |otdS )NZTRITON)r7   float16bfloat16float32Zint32r  r  )r  r  layout_dtypesr/   r/   r0   use_triton_templateN  s   r  c                 C  sV   ddl m} tjjrdS tjtjtjg}t| |ot	d}|r)| s)t
d dS |S )Nr   )try_import_cutlassFZCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)Zcodegen.cuda.cutlass_utilsr  r7   versionhipr  r  r  r  r  rA   r  )r  r  r  rM   r/   r/   r0   use_cutlass_templateW  s   r  c                   C  s   t   ptdS )NZATEN)r  r  r/   r/   r/   r0   use_aten_gemm_kernelsn  r  r  c                   @  s8   e Zd ZU edZded< dd Zdd Zdd	 Z	d
S )DebugDirManagerr   rr   prev_debug_namec                 C  s   t tj| _d S r5   )nextr  counterr   r   r/   r/   r0   rz  v  r  zDebugDirManager.__init__c                 C  s0   t jjj| _| j d| j | _| jt jj_d S )NZ_tmp_)r7   _dynamor   debug_dir_rootr  r   new_namer   r/   r/   r0   	__enter__y  s   zDebugDirManager.__enter__c                 G  s   t | j | jtjj_d S r5   )shutilrmtreer  r  r7   r  r   r  )r   r\   r/   r/   r0   __exit__~  s   zDebugDirManager.__exit__N)
r   r   r   r-  r~  r  ru  rz  r  r  r/   r/   r/   r0   r  r  s   
 
r  c                   sz   ddl m} |j g  fdd}tj|d| tj  | |i |}W d    |fS 1 s4w   Y  |fS )Nr   )GraphLoweringc                   sF    | }t |j}|  W d    |S 1 sw   Y  |S r5   )open__file__r   read)r   modr`  compile_to_modulesource_codesr/   r0   patched_compile_to_module  s   
z3run_and_get_code.<locals>.patched_compile_to_moduler  )	r   r  r  r   rc  objectr7   r  reset)r    r\   r]   r  r  r   r/   r  r0   run_and_get_code  s   

r  c                 O  sN   t | g|R i |\}}dt|  krdks#n J dt| |d S )Nr   r   z%expected one or two code outputs got r   )r  rE   )r    r\   r]   rK   r  r/   r/   r0   run_and_get_triton_code  s
   r  c              	   c  sN    ddl m} |j|  }zt|||j| < dV  W ||j| < dS ||j| < w )z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)Ztorch._inductorr  Z	loweringsrZ   partial)Zaten_opZoverride_fnr  orig_fnr/   r/   r0   override_lowering  s   
r  c                   s4   ddl m} |j  fdd}tjj|d|S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                   s&   | |  | |}r| | |S r5   r/   )Z	schedulerr  outr  post_fnpre_fnr/   r0   r     s
   


z(add_scheduler_init_hook.<locals>.wrapperrz  )Ztorch._inductor.schedulerr  rz  unittestr   rc  r  )r  r  r  r   r/   r  r0   add_scheduler_init_hook  s   r  c                 C  s"   t jr
t|  dS t|  dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)r   Zdeveloper_warningsrA   r  infomsgr/   r/   r0   developer_warning  s   r  num_in_out_argsr\   torch.Tensorr  c                   s   t  fddt|D S )z
    Return the total number of bytes the arguments of tensor type takes.

    For in/out args, tensor sizes are counted twice: once for reading and
    once for writing.

    The first num_in_out_args arguments are in out tensors.
    c                 3  s@    | ]\}}t |tjr| |  d t| k   V  qdS r   N)rq   r7   r   Znumelrq  r:   )r-   r3   r   r  r/   r0   r6     s    

z get_num_bytes.<locals>.<genexpr>)rH   rG   )r  r\   r/   r  r0   get_num_bytes  s   	r  r  c                 C  s   | | dd|dd|dd| }zdd l }| dkr,|dk r/|jj| |jj }W |S W |S W |S  ty@   td	 Y |S w )
Nz.3fzms    	z GB 	 z7.2fzGB/sr   g~jt?i  z@Colorama is not installed. Install it if you want colored output)coloramaForeZREDRESETrQ   rA   r  )msZnum_gbZgb_per_srD  suffixZinfo_strr  r/   r/   r0   create_bandwidth_info_str  s   $r  c                  C  s   z/t jd} | d tt jk r.tt j| d  dkr.t j| d  d dkr.t j| d  W S W n	 ty8   Y nw t jD ]}|drM|tdd   S q<dS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr   r   -z--only=N)sysargvro   rE   
ValueErrorrF  )idxr   r/   r/   r0   get_benchmark_name  s   

r  c                 C     t dd | D S )Nc                 s      | ]}|d kV  qdS r  r/   re   r/   r/   r0   r6     r   zis_ones.<locals>.<genexpr>r   r  r/   r/   r0   is_ones  r   r  c                 C  r  )Nc                 s  r  )r   Nr/   re   r/   r/   r0   r6     r   zis_zeros.<locals>.<genexpr>r   r  r/   r/   r0   is_zeros  r   r  c                 C  r  )Nc                 s  s,    | ]}t |tjr|jtd kV  qdS )rn   N)rq   r7   r   r&   )r-   itemr/   r/   r0   r6     s    

z is_cpu_device.<locals>.<genexpr>r   )Zinputsr/   r/   r0   is_cpu_device  s   r  r+  torch.dtypec                 C  s&   t | tjs
J d| jrtjS tjS )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rq   rv   r   
is_integerr7   Zint64Zfloat64)r+  r/   r/   r0   get_sympy_Expr_dtype  s   r  c                 o  sN    | r"t jj|i |}|V  W d    d S 1 sw   Y  d S d V  d S r5   )r7   r?   r@   )Zshould_profiler\   r]   rL   r/   r/   r0   maybe_profile%  s   "
r	  c                 C  s6   t | j }|d| jf |d| jf t|S )z~
    Convert triton config to a tuple that can uniquely identify it. We can use
    the return value as a dictionary key.
    	num_warps
num_stages)r   r]   r  r   r
  r  r   )cfgr  r/   r/   r0   triton_config_to_hashable.  s   r  c                 C  s$   t s| S ttj| |  tjj S r5   )HAS_COLORAMArb   r  r  r  r  )r  colorr/   r/   r0   _color_text@  s   r  c                 C  
   t | dS )Ngreenr  r  r/   r/   r0   
green_textG  r  r  c                 C  r  )Nyellowr  r  r/   r/   r0   yellow_textK  r  r  c                 C  r  )Nredr  r  r/   r/   r0   red_textO  r  r  c                 C  r  )Nbluer  r  r/   r/   r0   	blue_textS  r  r  c                 C  s   ddl m}m} | tjtjtjfv sJ tjjr4| tjtjfv r$|| S tj	j
jjr/|tjS |tjS ddl m} |dgd }| tjtjfv rN|| |S tj	j
jjrZ|tj|S |tj|S )Nr   )get_max_simd_tflopsget_max_tensorcore_tflops)nvsmizclocks.current.sm)rP   r  r  r7   r  r  r  r  r  backendsr$   matmulZ
allow_tf32r  )r%   r  r  r  Zcur_sm_clockr/   r/   r0   get_device_tflopsW  s   


r   c                  C  s   ddl m}  |  S )Nr   get_dram_gbps)rP   r"  r!  r/   r/   r0   get_gpu_dram_gbpsq  s   r#  c                 C  s
   |  dS )NZwelford)rF  Zreduction_typer/   r/   r0   is_welford_reductionx  r  r%  c                 C  s   t | rdS dS )N   r   )r%  r$  r/   r/   r0   reduction_num_outputs|  r  r'  c                   C  s   t  dkS )NLinux)platformsystemr/   r/   r/   r0   is_linux  s   r+  itrIterable[Any]c                 C  r  )Nc                 s  s$    | ]}t |tjo|j V  qd S r5   )rq   rv   r   Z	is_numberre   r/   r/   r0   r6     s   " z#has_free_symbols.<locals>.<genexpr>)r   )r,  r/   r/   r0   has_free_symbols  r   r.  c                  G  s   ddl m} | D ]V}t||jr(t|j s$t|jdr't|j r' dS qt||j	|j
|jfrOt|dr=t|ds?J t| sKt| rN dS qt||jsVqtdt| dS )Nr   r   
get_strideTget_sizezunexpected type for is_dynamic F)r  r   rq   r#  r.  r$  r0  ra   r/  r%  ZBaseViewZComputedBufferr&  	TypeErrorrs   )r\   r   tr/   r/   r0   
is_dynamic  s&   
r3  c                   @  s   e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r5  r6  r/   r/   r/   r0   r4    s    r4  so_pathc                 C  s6   |dkr
d|  dS |dkrd|  dS t d| )Nr$   z
            #include <torch/csrc/inductor/aoti_model_container_runner_cuda.h>

            torch::inductor::AOTIModelContainerRunnerCuda runner("a  ");

            std::vector<at::Tensor> run(std::vector<at::Tensor>& input_tensors) {
                return runner.run(input_tensors);
            }

            std::vector<const char*> get_call_spec() {
                return runner.get_call_spec();
            }
        rn   z
            #include <torch/csrc/inductor/aoti_model_container_runner.h>

            torch::inductor::AOTIModelContainerRunnerCpu runner("zUnsupported device: )rF   )r7  r&   r/   r/   r0   aot_inductor_launcher  s   r8  )r   r   )r    r!   r"   r#   )r"   r^   )r&   rl   r"   rm   )ry   r   r"   r   )r   r   r   r   r"   r   )r   r:   r"   r:   )r   r   r"   r   )r   r   r"   r   )r   r   )r$   )r&   rr   )r   r$   )r   r   r   r:   r&   rr   r"   r#   )r/   r   r   r   r$   )r   r   r   rr   )r   r   r   r   )r"   r:   )r    r   r"   r   r5   )r  r  r"   r  )r   r0  r"   rr   )r+   rr   r"   r8  )r   r0  r=  r>  r"   r0  )ro   r0  rD  rr   )ro   r0  rK  rr   r  )r"   ri  )r  r  r"   r^   )r  rr   r"   r^   )r\   r  r  r:   r"   r:   )r  r  )r+  r0  r"   r  )r,  r-  )r7  rr   r&   rr   )
__future__r   r  r  enumrZ   rX  rS   r-  loggingr  rh   rT  r)  rV  r  r  r[  r  r   r  ior   typingr   r   r   r   r   r	   r
   r   r   r   r   r   r   r   rv   Ztyping_extensionsr   r   r7   Ztorch._dynamo.device_interfacer   Ztorch.autogradr   Ztorch.autograd.profiler_utilr   Ztorch.utils._sympy.functionsr   r   r   r   r  r   	getLoggerr   rA   r   r   Z	VarRangesrN   rO   r[   rd   rj   ru   rz   r   r   r   r   r   r   r   r   r   r8   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r"  r/  r6  r<  rC  rJ  rM  rO  
namedtupler   rP  r^  r  rh  ro  rr  rs  rv  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  rQ   r  r  r  r  r  r   r#  r%  r'  r+  r.  r3  Enumr4  r8  r/   r/   r/   r0   <module>   s   <
R




		
	!
l 

			





