o
    I&if                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZm Z m!Z! d dl"m"Z" d dl#m$Z$m%Z%m&Z& d dlm'Z' d dlm(Z( d d	lm)Z) d d
lm*Z* d dlm+Z+ d dl,m-Z-m,Z, d dl.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 d dl:Z:d dl;m<Z<m=Z= d dl>m?Z? d dl@mAZAmBZB d dlCmDZD d dlEmFZFmGZGmHZH d dlImJZJ d dlKmLZLmMZMmNZN e8rd dlOmPZP d dlQmRZR d dlSmTZTmUZU ejVWeXZYejVZejVZeYZ[eA\ rPd dl]m^Z^ d dl_m`Z` d dlambZbmcZcmdZdmeZe ndd Zbdd Zcd d! Zddd$d%Zed&Zfd'agdahdd)d*Zidd+d,Zje
kelZmdd/d0Znd1d2 ZoG d3d4 d4ZpG d5d6 d6epZqG d7d8 d8epZrdd9d:Zsdd=d>ZtdddCdDZu	?dddIdJZvdddMdNZw	?	@	?dddPdQZxddSdTZyejzG dUdV dVZ{ejzG dWdX dXZ|dd[d\Z}dd_d`Z~dadb Zdcdd Zdedf ZG dgdh dhejZedddidjZejzG dkdl dlZG dmdn dnZddudvZG dwdx dxZejzG dydz dzZdd~dZdddZeddddZdddZdddZdddZeddddZG dd dZejzG dd deZejzG dd deZejzG dd deZG dd deZe Ze e gZedd ddZd!ddZd"d#ddZd"d$ddZd"d%ddZdddZdddZdddZdddZdddZdddZdddZeddddZedd&ddZdeddfd'ddZdddeddddfd(ddǄZd)ddʄZedd*dd̄ZG dd΄ d΃ZG ddЄ dЃZejddd҄ZdddԄZd+ddلZdaded< G dd݄ d݃ZG dd߄ d߃ZG dd dZG dd dZd,ddZd-ddZd-ddZd-ddZd-ddZd.ddZG dd dZG dd dZdd Zd/d dZd0ddZG dd dZdddZdaÐd	ed
< G dd dZeĠš  dS (1      )annotationsN)bisect_right)FutureProcessPoolExecutorThreadPoolExecutor)copy)c_void_pcdllCDLL)field)partial)abc)Path)Thread)sleeptime)
ModuleType)	AnyCallableDictListOptionalSetTupleTYPE_CHECKINGUnion)get_interface_for_device get_registered_device_interfaces)counters)configexc)cuda_env)	cache_dirdeveloper_warningis_linux)suggest_memory_format)has_hinthint_intShapeEnv)GraphLowering)ChoiceCaller)	_Faketqdmtqdm)build_paths)_run_build_command)log_global_cache_errorslog_global_cache_statslog_global_cache_valsuse_global_cachec                  O     d S N argskwargsr5   r5   DC:\wamp64\www\opt\env\Lib\site-packages\torch/_inductor/codecache.pyr/   L      r/   c                  O  r3   r4   r5   r6   r5   r5   r9   r0   O   r:   r0   c                  O  r3   r4   r5   r6   r5   r5   r9   r1   R   r:   r1   returnboolc                   C     dS NFr5   r5   r5   r5   r9   r2   U   r:   r2   iX  g        Nonec                   C  s   t d u r	t a d S d S r4   )_t0r   r5   r5   r5   r9   _compile_start`   s   
rA   c                  C  s&   t d urt } t| t  7 ad a d S d S r4   )r@   r   _cumulative_compile_time)t1r5   r5   r9   _compile_endf   s
   rD   namestrc                 C  sz   t jjd u rdn
dt jjdd }dtjj tjj }| d| }tj	
t |}tj	
|| }tj|dd |S )	Ncpucu. py_Texist_ok)torchversioncudareplacesysversion_infomajorminorospathjoinr"   makedirs)rE   Zcu_strpython_versionZbuild_foldercpp_wrapper_dirZcpp_wrapper_build_directoryr5   r5   r9   cpp_wrapper_cache_dirr   s   r]   c                   C  s   t jjd u rdS dS )NZ
cubin_pathZ
hsaco_path)rO   rP   hipr5   r5   r5   r9   get_cpp_wrapper_cubin_path_name      r_   c                   @  sr   e Zd ZeeddddZeeddddZeeddd
dZdddZ	dddZ
dddZdS )	CacheBaseNr;   Dict[str, Any]c               	   C  s   z	dd l } | j}W n ty   d }Y nw zdtjtj jitjj|ddtj	jj
jid}W n ttfy?   i }Y nw ttj|ddd |d	< |S )
Nr   rE   )rQ   triton
allow_tf32)devicerP   otherT)	sort_keysutf-8hash)rc   __version__ModuleNotFoundErrorrO   rQ   get_device_propertiescurrent_devicerE   rP   backendsmatmulrd   AssertionErrorRuntimeErrorhashlibsha256jsondumpsencode	hexdigest)rc   Ztriton_versionsystemr5   r5   r9   
get_system   s6   

zCacheBase.get_systemr   c                   C  s   t tjt dt d S )Ncacheri   )r   rW   rX   rY   r"   ra   ry   r5   r5   r5   r9   get_local_cache_path   s   zCacheBase.get_local_cache_pathOptional[Path]c                   C  s*   t jd urttjt jt d S d S )Nri   )r   Zglobal_cache_dirr   rW   rX   rY   ra   ry   r5   r5   r5   r9   get_global_cache_path   s
   
zCacheBase.get_global_cache_pathr?   c                 C  s0   t j sd S t | _t | _t | _	d S r4   )
rO   rQ   is_availablera   ry   rx   r{   local_cache_pathr}   global_cache_pathselfr5   r5   r9   __init__   s
   


zCacheBase.__init__c                 C  sP   | j  si S t| j }t|}W d    |d S 1 sw   Y  |d S Nrz   )r   is_fileopenrt   load)r   Zlocal_cache_fplocal_cacher5   r5   r9   get_local_cache   s   

zCacheBase.get_local_cacher   c                 C  sH   t j| jjst j| jjdd tt| jtj	| j
|ddd d S )NTrM   )rx   rz      )indent)rW   rX   existsr   parentrZ   write_atomicrF   rt   ru   rx   )r   r   r5   r5   r9   update_local_cache   s   zCacheBase.update_local_cache)r;   rb   )r;   r   )r;   r|   r;   r?   )r   rb   r;   r?   )__name__
__module____qualname__staticmethod	functools	lru_cachery   r{   r}   r   r   r   r5   r5   r5   r9   ra      s    !

	ra   c                   @  s    e Zd ZdddZdd
dZdS )
LocalCachekeysrF   r;   Optional[Dict[str, Any]]c                 G  s0   |   }|}|D ]}||v r|| }q d S |S r4   )r   )r   r   rz   	sub_cachekeyr5   r5   r9   lookup   s   
zLocalCache.lookupvaluer   r?   c                G  sL   |   }|}|dd D ]}||i  || }q|||d < | | d S )Nr   )r   
setdefaultr   )r   r   r   rz   r   r   r5   r5   r9   	set_value   s   
zLocalCache.set_valueN)r   rF   r;   r   )r   rF   r   r   r;   r?   )r   r   r   r   r   r5   r5   r5   r9   r      s    
r   c                   @  s(   e Zd Zeddd ZdddZdS )PersistentCacheNc                 C  sZ   | j d u s
| j  si S t| j }t|}W d    |d S 1 s$w   Y  |d S r   )r   r   r   rt   r   )r   Zglobal_cache_fpZglobal_cacher5   r5   r9   get_global_cache   s   
z PersistentCache.get_global_cachechoicesList[ChoiceCaller]rE   rF   inputs	benchmark*Callable[[Any], Dict[ChoiceCaller, float]]r;   Dict[ChoiceCaller, float]c              
     sL  t t| j}t t| j}t t| j}i dd fdd}tjs+tjr|  }	||	st	 r>|| 
 |dsz4| tfdd D sPJ |	i  |	 i   D ]\}
}||	  |
 < qbW n ty } z|| |d}~ww | |	 fd	d
 D }|| S t	 r|| 
 |d S )a  
        Check to see if we have benchmarked the given choice callers. For each
        choice caller:

            1. Check global_cache[name][inputs][choice], return benchmark if cached.
            2. Check local_cache[name][inputs][choice], return benchmark if cached.
            3.
                a. `max_autotune_gemm=True`: benchmark the choice, update
                    local_cache[name][inputs][choice], and return the benchmark.
                b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing.
        Nr;   r<   c                   sZ   d} D ]}|  }|| i i v r!|   | |< qd} |r+||d |S )z2Check if `cache` contains data for all the choicesTF)cached)hash_keyget)rz   callbackhitchoiceZchoice_hashr   r   rE   timingsr5   r9   check_cache
  s   
z+PersistentCache.lookup.<locals>.check_cache)r   c                 3  s    | ]}| v V  qd S r4   r5   .0r   r   r5   r9   	<genexpr>$      z)PersistentCache.lookup.<locals>.<genexpr>c                   s   i | ]	}|   | qS r5   )r   r   r   r5   r9   
<dictcomp>1  s    z*PersistentCache.lookup.<locals>.<dictcomp>r4   r;   r<   )r   r0   rx   r1   r/   r   Zmax_autotuneZmax_autotune_gemmr   r2   r   allr   itemsr   rq   r   )r   r   rE   r   r   Z	log_statsZlog_valsZ
log_errorsr   r   r   ZtimingeZtimings_to_logr5   r   r9   r      sD   

zPersistentCache.lookup)
r   r   rE   rF   r   rF   r   r   r;   r   )r   r   r   r   r   r   r   r5   r5   r5   r9   r      s    
r   c                  C  s.   t jt d} t j| st j| dd | S )NlocksTrM   )rW   rX   rY   r"   r   rZ   )lock_dirr5   r5   r9   get_lock_dir=  s   r   databytesc                 C  s&   t t|  d d d S )N3   rh   )base64	b32encoderr   rs   digestdecodelower)r   r5   r5   r9   sha256_hashD  s   &r   rJ   codeUnion[str, bytes]extrac                 C  s>   t | tr| n| d}|dkr|d |d }dt| S )Nrh   rJ   s   ||c)
isinstancer   rv   r   )r   r   Zhashing_strr5   r5   r9   	code_hashI  s   r   basename	extensionspecified_dirTuple[str, str, str]c                 C  sb   |rt j|r|}nt jt |}nt jt | dd }t j||  d| }| ||fS )N      rI   )rW   rX   isabsrY   r"   )r   r   r   subdirrX   r5   r5   r9   get_pathP  s   
r   content	hash_typec                 C  s4   |dkr	t | |S |dv rt t| S td| )Nr   )cubinhsacozUnknown hash type )r   reprrp   )r   r   r   r5   r5   r9   get_hash^  s
   
r   Tuple[str, str]c           	      C  sZ   t |  ||}t|||\}}}tj|stj|dd tj|s)t||  ||fS )NTrM   )r   stripr   rW   rX   r   rZ   r   )	r   r   r   r   r   r   r   r   rX   r5   r5   r9   writef  s   

r   rX   c                 C  s   t |ttfsJ dt| } | jdt  dt	  d }t |tr'dnd}|
|}|| W d    n1 s>w   Y  ||  d S )Nz6Only strings and byte arrays can be saved in the cacherI   z.tmpwwb)r   rF   r   pathlibr   r   rW   getpid	threading	get_identr   r   rename)rX   r   Ztmp_pathZ
write_modefr5   r5   r9   r   y  s   
 r   c                   @  s   e Zd ZU dZded< ded< ded< ded	< d
ed< ded< ded< ded< ded< ded< ded< ded< ded< ded< dS )TensorMetadatazG
    The Tensor metadata relevant when hashing FxGraph cache keys.
    torch.dtypedtypez
torch.SizeshapezTuple[Any, ...]stridetorch.devicere   ztorch.layoutlayoutzOptional[torch.memory_format]memory_formatintstorage_offsetr<   requires_gradis_quantizedis_conjis_negis_coalesced	dense_dim
sparse_dimNr   r   r   __doc____annotations__r5   r5   r5   r9   r     s    
 r   c                   @  s"   e Zd ZU dZded< ded< dS )TensorMetadataAndValueszk
    TensorMetadata plus the elements as a list of raw values.
    Used for hashing inlined constants.
    r   Ztensor_metadata	List[Any]valuesNr   r5   r5   r5   r9   r     s   
 r   ttorch.Tensorc                 C  s   t | }| j|dsd}t| j| j| jtjkr|  nd| j	| j|| 
 | j| j|  |  | jr5|  nd| jr=|  nd| jrG|  dS ddS )z1
    Extract the TensorMetadata of a tensor.
    )r   Nr5   F)r   r   r   re   r   r   r   r   r   r   r   r   r   r   )r%   Zis_contiguousr   r   r   r   rO   Zstridedr   re   r   r   r   r   r   Z	is_sparser   r   r   )r   r   r5   r5   r9   extract_tensor_metadata  s*   r  xr   c                 C     | S r4   r5   )r  r5   r5   r9   _ident  r:   r  c                 C  s   t | }t|ffS )zH
    See FxGraphCachePickler. Custom reducer to pickle FakeTensors.
    )r  r  r   metadatar5   r5   r9   _reduce_fake_tensor  s   
r  c                 C  sD   t | }t| jdkstjjj| rtt	|| 
 ffS t|ffS )zD
    See FxGraphCachePickler. Custom reducer to pickle Tensors.
    r   )r  lenr   rO   Z	_inductorgraphr)   Zcan_inline_constantr  r   tolistr  r5   r5   r9   _reduce_tensor  s   

r  c                 C  s   t t| ffS )zD
    See FxGraphCachePickler. Custom reducer to pickle SymInts.
    )r  rF   )sr5   r5   r9   _reduce_symint  s   r  c                   @  sX   e Zd ZdZej Zeeej	j
j< eeej< eeej< edddZedd	d
ZdS )FxGraphCachePicklera:  
    Custom pickler to customize the pickling of some objects (Tensors), only for the
    purpose of computing a hash for keying into the FxGraphCache. Tensors contain
    objects that don't pickle and/or vary between runs, and we want to capture the
    data that allow us to compute a stable, but safe hash.
    r;   r   c                 C  sF   t  }t|}||  | W  d   S 1 sw   Y  dS )zA
        Pickle an object using the FxGraphCachePickler.
        N)ioBytesIOr  dumpgetvalue)objstreamZpicklerr5   r5   r9   ru     s
   

$zFxGraphCachePickler.dumpsr  r   rF   c                 C  s   t | }t|S )zt
        Serialize an object using the FxGraphCachePickler and return a hash
        of the pickled object.
        )r  ru   r   )r  Zserialized_datar5   r5   r9   r      s   
zFxGraphCachePickler.get_hashNr;   r   )r  r   r;   rF   )r   r   r   r   copyregdispatch_tabler   r  rO   Z_subclassesZfake_tensorZ
FakeTensorr  Tensorr  SymIntr   ru   r   r5   r5   r5   r9   r    s    


	r  c               	   C  s   t jt} i }t| gD ]4}|j|jd}|dusJ |j	}|dus'J t
|d}| ||< W d   n1 s=w   Y  qtt| S )z
    Compute a hash of all inductor code modules. Used by the FxGraph cache
    so any inductor code changes would result in new cache keys.
    Nrb)rW   rX   dirname__file__pkgutiliter_modulesmodule_finder	find_specrE   originr   readrr   rs   pickleru   r   )Zinductor_rootcontentslibspecmoduler   r5   r5   r9   get_inductor_code_hash
  s   r(  c                   @  s   e Zd ZU dZded< dS )OrderedSetHolderzb
    See FxGraphHashDetails. Holds a sorted list to support stable hashing
    of set kwargs.
    r   r   Nr   r5   r5   r5   r9   r)    s   
 r)  c                   @  s*   e Zd ZdZdgZdd	d
ZdddZdS )FxGraphHashDetailszz
    Object to capture all the details for a compiled FX graph relevant to computing
    a safe and stable cache key.
    Zgraph_idgmtorch.fx.GraphModuleexample_inputsList[torch.Tensor]	fx_kwargsrb   c                 C  s   || _ || _i | _t|D ]"}|| jvr/t|| tu r(tt|| | j|< q|| | j|< qtj	| _
t | _t | _t | _d S r4   )r+  r-  r/  sortedEXCLUDED_KWARGStypesetr)  rO   rj   Ztorch_versionra   ry   Zsystem_infor   Zsave_configZinductor_configr(  Zinductor_code_hash)r   r+  r-  r/  kr5   r5   r9   r   1  s   


zFxGraphHashDetails.__init__r;   rF   c           	      C  s   ddd}g }t |  D ]k\}}t|tr<tt|D ]}t|| }|d| d| d| d|||   qqt|t	rc| D ]\}}t|}|d| d| d| d||  qEqt|}|d| d| d||  qd	
|S )z
        Get a printable string describing in more detail all the attributes
        comprising this object. Useful for debugging when one graph hashes
        to a different value than another.
        r;   rF   c                 S  s.   t | tjrtt| S t | trdS t| S )Nz<bytes>)r   rO   r  rF   r  r   )r  r5   r5   r9   get_strT  s
   
z-FxGraphHashDetails.debug_str.<locals>.get_str[z] z]: : 
Nr;   rF   )varsr   r   listranger  r  r   appenddictrY   )	r   r5  linesattrr  iihr4  vr5   r5   r9   	debug_strM  s    

,

(
"
zFxGraphHashDetails.debug_strN)r+  r,  r-  r.  r/  rb   r9  )r   r   r   r   r1  r   rD  r5   r5   r5   r9   r*  (  s
    
r*  r+  r,  r-  r.  r/  rb   c                 C  s0   t | ||}dt| }td||  |S )z=
    Generate a unique hash of the FX graph for caching.
    r   z*FX graph cache hash details for key %s:
%s)r*  r  r   logdebugrD  )r+  r-  r/  detailsr   r5   r5   r9   compiled_fx_graph_hashl  s   rH  c                   @  s~   e Zd ZdZed%ddZed&ddZed'ddZed(ddZed)ddZ	ed*ddZ
ed+d d!Zed"d# Zd$S ),FxGraphCachea7  
    Supports caching and reusing compiled Fx graphs.

    The overall strategy is as follows:
    - This cache stores entries on disk. When saving an entry, we can't
      serialize callables (that could be C++, Triton, etc.), so we serialize
      their own disk cache location. We then recreate the compiled artifact
      after fetching from disk.
    - For indexing the cache, we gather the fields relevant to identifying an
      FxGraph (the graph module, graph inputs, system settings etc.) into an
      FxGraphCacheDetails object, pickle it, and compute a hash for the key.
      See FxGraphCachePickler.
    - Among the metadata we store, we also include a guards expression that's
      appropriate for validating any symbols for Tensor arguments that have
      symbolic bounds. On cache lookup then, we evaluate those guards in the
      current context to validate that a cached entry can be served.
    - A given graph could have multiple compiled versions, corresponding to
      different sets of guards. Therefore, we store cache entries in the form:
          <temp dir>/<fx graph hash>/<serialized metatdata>
    - On lookup, we compute the key from the graph details, iterate over all
      leaf files in the corresponding subdirectory, deserialize the entry, and
      evaluate its guards expression. If the evaluation succeeds, we have a
      cache hit. If it fails, we compile the graph and store a new entry.
    - Finally, on a cache hit, we need to make sure any guards that would
      have been created during compilation are added to the current context.
    r;   rF   c                   C  s   t jt dS )zS
        Get the toplevel temporary directory for storing compiled graphs.
        Zfxgraph)rW   rX   rY   r"   r5   r5   r5   r9   _get_tmp_dir     zFxGraphCache._get_tmp_dirr   c                 C  s   t jt | dd | S )zA
        Return the disk location for a given cache key.
        r   r   )rW   rX   rY   rI  rJ  )r   r5   r5   r9   _get_tmp_dir_for_key  s   z!FxGraphCache._get_tmp_dir_for_keyr   r   List[torch.SymInt]c                 C  s   dd | D S )z=
        Get the SymInt objects from the input list.
        c                 S  s   g | ]
}t |tjr|qS r5   )r   rO   r  r   r  r5   r5   r9   
<listcomp>      z0FxGraphCache._filter_symints.<locals>.<listcomp>r5   )r   r5   r5   r9   _filter_symints  s   zFxGraphCache._filter_symintsr(   c                   C  s   t jj jjS )zG
        Helper to get the shape env from the tracing context.
        )rO   Z_guardsZTracingContextr   Z	fake_mode	shape_envr5   r5   r5   r9   _get_shape_env  rK  zFxGraphCache._get_shape_envr-  r.  Optional[CompiledFxGraph]c              	   C  s  t | }tj|sdS tt|D ]r}ttj||d}t	
|}W d   n1 s0w   Y  |j}|s>|  S t  }t |}tdd |D sRJ dd |D }	t|||	}
td| ||	|
 |
rt|||}|du szJ td	| |j |  S qdS )
z
        Lookup a compiled graph in the cache by key. On a hit, return the
        deserialized CompiledFxGraph object. On a miss, return None.
        Nr  c                 s  s    | ]}t |V  qd S r4   )r&   rN  r5   r5   r9   r     r   z-FxGraphCache._lookup_graph.<locals>.<genexpr>c                 S  s   g | ]}t |qS r5   )r'   rN  r5   r5   r9   rO        z.FxGraphCache._lookup_graph.<locals>.<listcomp>zCfx graph cache key %s evaluating guards for %s with values %s => %sTz*fx graph cache key %s post-load guards: %s)rI  rL  rW   rX   r   r0  listdirr   rY   r#  r   guards_exprrS  rQ  r   r<   Zevaluate_guards_expressionrE  rF  Zguards)r   r-  r   rX   r   r	  rW  rR  symintshintsr   checkr5   r5   r9   _lookup_graph  sB   
	
zFxGraphCache._lookup_graphcompiled_graphCompiledFxGraphc           	      C  sz   t |}d|_t }t|}|||_t|}t	| }t
j|s-t
j|dd t
j|t|}t|| dS )z=
        Store a serialized CompiledFxGraph on disk.
        NTrM   )r   compiled_artifactrI  rS  rQ  Zproduce_guards_expressionrW  r#  ru   rL  rW   rX   r   rZ   rY   r   r   )	r   r\  r-  Zdisk_compiled_graphrR  rX  r   r   rX   r5   r5   r9   _save_graph  s   


zFxGraphCache._save_graphcompile_fx_fnCallable[..., Any]r+  r,  r/  rb   c           	      C  s   ddl m} t|||}t }|tj||d td}|E t	||}|du rJt
d| td d  d	7  < | ||fi |}t||| nt
d
| td d  d	7  < |W  d   S 1 sfw   Y  dS )z
        Load a compiled graph from the cache. If a cached entry does not exist,
        compile the graph and save it to the cache.
        r   FileLock.locktimeoutNzfx graph cache miss for key %sZinductorZfxgraph_cache_missr   zfx graph cache hit for key %sZfxgraph_cache_hit)filelockrc  rH  r   rW   rX   rY   LOCK_TIMEOUTrI  r[  rE  rF  r   r_  )	r`  r+  r-  r/  rc  r   r   lockr\  r5   r5   r9   r     s   $zFxGraphCache.loadc                   C  s   t t  dS )z.
        Clear out the on-disk cache.
        N)shutilrmtreerI  rJ  r5   r5   r5   r9   clear*  s   zFxGraphCache.clearNr9  )r   rF   r;   rF   )r   r   r;   rM  )r;   r(   )r   rF   r-  r.  r;   rT  )r   rF   r\  r]  r-  r.  )r`  ra  r+  r,  r-  r.  r/  rb   )r   r   r   r   r   rJ  rL  rQ  rS  r[  r_  r   rl  r5   r5   r5   r9   rI  |  s$    5rI  c                   @  s   e Zd ZU dZdZded< dZded< dZded< dZded< dZ	d	ed
< e
edZded< e
edZded< e
edZded< e
edZded< e
edZded< dZded< dZded< dZded< d'ddZd(d"d#Zd)d%d&ZdS )*r]  zr
    Class holding a compiled FX graph. This is the object serialized on disk
    to support FxGraph caching.
    NOptional[Callable[..., Any]]r^  current_callableOptional[str]	cache_keyartifact_pathOptional[List[Tuple[int, str]]]cache_linemap)default_factoryzSet[str]device_typeszSet[int]device_idxsmutated_inputsmutated_input_idxszDict[str, torch.Tensor]	constantsz)Optional[List[Optional[Tuple[int, ...]]]]output_stridesrW  zOptional[bool]_boxed_callr	  r)   List[Optional[Tuple[int, ...]]]c                 C  sZ   || _ |j| _|j| _|j| _|j| _|j| _|j| _t|j	| _	|j
| _
|| _d | _d S r4   )r^  rp  
cache_pathrq  rs  ru  rv  rw  r3  rx  ry  rz  rW  )r   r^  r	  rz  r5   r5   r9   r   M  s   
zCompiledFxGraph.__init__r   r   r;   r   c                 C  s   |   |S r4   )get_current_callable)r   r   r5   r5   r9   __call___     zCompiledFxGraph.__call__ra  c                 C  s"   | j d u rttt| S | j S r4   )rn  r   r   _run_from_cacheweakrefproxyr   r5   r5   r9   r~  b  s   
z$CompiledFxGraph.get_current_callable)r^  rm  r	  r)   rz  r|  )r   r   r;   r   )r;   ra  )r   r   r   r   r^  r   rn  rp  rq  rs  r   r3  ru  rv  rw  rx  r>  ry  rz  rW  r{  r   r  r~  r5   r5   r5   r9   r]  2  s$   
 

r]  r\  r   r   c                 C  sP   | j d u r#ddlm} | jsJ | jsJ || j| j| j| jj| _ |  |S )Nr   )PyCodeCache)	r^  Z	codecacher  rp  rq  load_by_key_pathrs  ry  call)r\  r   r  r5   r5   r9   r  k  s   



r  c                  C  sH   t  rt S tt jjttfrtt jj} t	| S t jjf} t	| S r4   )
r   	is_fbcoder-   ccr   cppcxxr;  tuplecpp_compiler_search)searchr5   r5   r9   cpp_compiler  s   
r  r   r  c                 C  s   | D ]V}zG|d u r@t jdkrW qtdsW qddlm} t }|tj|dt	d}| t
 }W d    n1 s;w   Y  t|dg |W   S  tjttfyX   Y qw t )NlinuxZTORCH_INDUCTOR_INSTALL_GXXr   rb  zg++.lockre  	--version)rS   platformrW   getenvrg  rc  r   rX   rY   rh  install_gcc_via_conda
subprocesscheck_outputSubprocessErrorFileNotFoundErrorImportErrorr    ZInvalidCxxCompiler)r  r  rc  r   ri  r5   r5   r9   r    s*   


r  c               	   C  s   t jt d} t j| dd}t j|sBtd t jdd}|du r+t	
d}|durBtj|dd	|  d
ddddgtjd |S )z>On older systems, this is a quick way to get a modern compilerZgccbinzg++zDownloading GCC via condaZ	CONDA_EXEcondaNcreatez	--prefix=z--channel=conda-forgez--quietz-yz
python=3.8Zgxx)stdout)rW   rX   rY   r"   r   rE  infoenvironr   rj  whichr  
check_callPIPE)prefixZcxx_pathr  r5   r5   r9   r    s*   


r  c                   C     t tdt S )Nz(gcc|g\+\+)r<   rer  r  r5   r5   r5   r9   is_gcc     r  c                   C  r  )Nz(clang|clang\+\+)r  r5   r5   r5   r9   is_clang  r  r  c                  C  s*   t  } t| dgd}d| d v S )Nr  utf8ZAppler   )r  r  r  r   
splitlines)r  version_stringr5   r5   r9   is_apple_clang  s   r  c                   @  s   e Zd ZU ded< ded< ded< ded< dZd	ZdddZejfdddZ	dddZ
dddZdddZeddddZdS )VecISAr   
_bit_widthrF   _macro_arch_flagszDict[torch.dtype, int]_dtype_nelementsa~  
#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR)
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#endif

__attribute__((aligned(64))) float in_out_ptr0[16] = {0.0};

extern "C" void __avx_chk_kernel() {
    auto tmp0 = at::vec::Vectorized<float>(1);
    auto tmp1 = tmp0.exp();
    tmp1.store(in_out_ptr0);
}
zG
import torch
from ctypes import cdll
cdll.LoadLibrary("__lib_path__")
r;   c                 C     | j S r4   )r  r   r5   r5   r9   	bit_width     zVecISA.bit_widthr   r   c                 C  s
   | j | S r4   )r  )r   r   r5   r5   r9   	nelements  s   
zVecISA.nelementsc                 C  r  r4   )r  r   r5   r5   r9   build_macro  r  zVecISA.build_macroc                 C  r  r4   )r  r   r5   r5   r9   build_arch_flags   r  zVecISA.build_arch_flagsc                 C  s   t t| S r4   )ri   rF   r   r5   r5   r9   __hash__  r  zVecISA.__hash__Nr<   c           	      C  s,  t jjd ur
t jjS t  rdS ttjd\}}ddlm} t	 }|t
j||d td}|\ |d d d }tt||d	| d
}z%t||| tjtjdtjd|gtji t
jddtjid W n ty } zW Y d }~W d    d	S d }~ww 	 W d    dS 1 sw   Y  d S )NTr  r   rb  rd  re  soF)warning_allvec_isa-cZ__lib_path__
PYTHONPATH:stderrenv)r   r  Z
vec_isa_okr  r   r  	_avx_coderg  rc  r   rW   rX   rY   rh  shlexsplitcpp_compile_commandcompile_filer  r  rS   
executable_avx_py_loadrR   DEVNULLr  	Exception)	r   r   
input_pathrc  r   ri  output_pathZ	build_cmdr   r5   r5   r9   __bool__  s@   
	
$zVecISA.__bool__)r;   r   )r   r   r;   r   r9  r   )r   r   r   r   r  r  r  rO   floatr  r  r  r  r   r   r  r5   r5   r5   r9   r    s   
 



r  c                   @  H   e Zd ZU dZdZdZejdejdej	diZ
ddd	ZejZd
ed< dS )	VecAVX512i   z-DCPU_CAPABILITY_AVX512z0-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma       r;   rF   c                 C  r=   )NZavx512r5   r   r5   r5   r9   __str__3  r:   zVecAVX512.__str__Callable[[VecISA], Any]r  Nr9  r   r   r   r  r  r  rO   r  Zbfloat16Zfloat16r  r  r  r  r   r5   r5   r5   r9   r  ,     
 
r  c                   @  r  )VecAVX2   z-DCPU_CAPABILITY_AVX2z-mavx2 -mfma   r  r;   rF   c                 C  r=   )NZavx2r5   r   r5   r5   r9   r  @  r:   zVecAVX2.__str__r  r  Nr9  r  r5   r5   r5   r9   r  9  r  r  c                   @  r  )
VecZVECTORr  zO-DCPU_CAPABILITY_ZVECTOR -DCPU_CAPABILITY=ZVECTOR -DHAVE_ZVECTOR_CPU_DEFINITIONz-mvx -mzvectorr  r  r;   rF   c                 C  r=   )NZzvectorr5   r   r5   r5   r9   r  M  r:   zVecZVECTOR.__str__r  r  Nr9  r  r5   r5   r5   r9   r  F  r  r  c                   @  s@   e Zd ZU dZdZdZi ZdddZddd	Ze	j
Z
d
ed< dS )InvalidVecISAr   rJ   r;   rF   c                 C  r=   )NZINVALID_VEC_ISAr5   r   r5   r5   r9   r  Y  r:   zInvalidVecISA.__str__r<   c                 C  r=   r>   r5   r   r5   r5   r9   r  \  r:   zInvalidVecISA.__bool__r  r  Nr9  r   )r   r   r   r  r  r  r  r  r  r  r  r   r5   r5   r5   r9   r  S  s   
 

r  List[VecISA]c                  C  s   t jdkrg S t dkrt gS g } td }| }tD ]}t||v r-|r-| | q| W  d    S 1 s:w   Y  d S )Nr  s390xz/proc/cpuinfo)	rS   r  machiner  r   r"  supported_vec_isa_listrF   r=  )Zisa_listZ	_cpu_infoZ_cpu_info_contentisar5   r5   r9   valid_vec_isa_listi  s   


$r  c                  C  s^   t  rt S t } | stS t jjd u r| sJ | d S | D ]}t jj| kr,|  S qtS Nr   )r   r  r  r  invalid_vec_isar  Zsimdlenr  )Z_valid_vec_isa_listr  r5   r5   r9   pick_vec_isaz  s   r  Tcompile_onlyc                 C     | rdS dS )Nr  rJ   r5   )r  r5   r5   r9   get_compile_only  r  r  sharedc                 C  r  )Nz-shared -fPICrJ   r5   )r  r5   r5   r9   
get_shared  r  r  r  c                 C  r  )Nz-WallrJ   r5   )r  r5   r5   r9   get_warning_all_flag  r  r  c                   C  s   dt ttjj S )Nz-D_GLIBCXX_USE_CXX11_ABI=)rF   r   rO   Z_CZ_GLIBCXX_USE_CXX11_ABIr5   r5   r5   r9   get_glibcxx_abi_build_flags  r`   r  c                  C  s"   g d} t  r| d d| S )N)
-std=c++17z-Wno-unused-variablez-Wno-unknown-pragmasz%-Werror=ignored-optimization-argument )r  r=  rY   )flagsr5   r5   r9   	cpp_flags  s   

r  c                   C  r=   )Nz-DTORCH_INDUCTOR_CPP_WRAPPERr5   r5   r5   r5   r9   cpp_wrapper_flags  r:   r  c                  C  sz   t jjrdnd} | d7 } t jjs| d7 } t  r| S tjdkr$| d7 } nt dkr/| d7 } n| d	7 } t  s;| d
7 } | S )Nz-O0 -gz-O3 -DNDEBUGz" -ffast-math -fno-finite-math-onlyz -fno-unsafe-math-optimizationsdarwinz -Xclangppc64lez -mcpu=nativez -march=nativez	 -fopenmp)	r   aot_inductorZdebug_compiler  Zenable_unsafe_math_opt_flagr  rS   r  r  )Z
base_flagsr5   r5   r9   optimization_flags  s   


r  c                   C  r=   )Nz$-D C10_USING_CUSTOM_GENERATED_MACROSr5   r5   r5   r5   r9   use_custom_generated_macros  r:   r  c                  C  s.   t  rt } dd}d|  d| S dS )Nr  )z-D C10_USE_GLOGz-D C10_USE_MINIMAL_GLOGz'-D C10_DISABLE_TENSORIMPL_EXTENSIBILITYz-Wp,-fopenmp rJ   )r   r  r-   
openmp_librY   )r  Zpreprocessor_flagsr5   r5   r9   use_fb_internal_macros  s   r  c                   C  s   t  rdS dS )Nz	-nostdincrJ   )r   r  r5   r5   r5   r9   use_standard_sys_dir_headers  s   r  c                  C  sD   zd} t |  d}tt|dkW S  t jy!   Y dS w )Nzconda list llvm-openmp --jsonr  r   F)r  r  r  r   r  rt   loadsr  )commandoutputr5   r5   r9   is_conda_llvm_openmp_installed  s   r  Tuple[bool, str]c                  C  sT   zt ddg t g dd } tj| }|| fW S  t jy)   Y dS w )Nr  brew)r  z--prefixlibompr  )FrJ   )r  r  r   r   rW   rX   r   r  )libomp_pathomp_availabler5   r5   r9   homebrew_libomp  s   
r  Finclude_pytorchr  rQ   aot_mode$Tuple[List[str], str, str, str, str]c                 C  s  t  rdtjvrdtjvrtjt tjd< ddlm	} d}d}t
jdkr| s6|tks6|s6t jjr||tdg }||tdg }g }	t  se|	d	d
g7 }	|	dg7 }	|sd|	dg7 }	nS|	dg7 }	|r|tjt g7 }|rt|D ];\}
}|tjd rtj| dst|D ] \}}}d|v rtj||||
< |tj||
 d  nqq|| }|rt  r|tkrt| }d| d| d| d| dg}|r|r|d u rd}|d7 }|rtj j!d ur|	ddg7 }	nt  r
|	dg7 }	n|	g d7 }	| }n||tdg }|r-|tjt g7 }g }t
jdkrt"  }t#dd urztjt#ddd}tj|}|rp|tjt#dd |tjt#dd nt$%d |py|}|rg ndg}	|st#d d urt& }|rtjt#d d}|tjt#d d || t' j(d!krtjtj|d"rd#g}	|st) \}}|r|tj|d |tj|d nt  rdgndg}	t j*j+s|	d$g7 }	||j,g7 }t  rJ|t-  |t.  |t/  |t0  |t1  |t2  |t3  |t4  |t  |d g }|r[|r[t  r[g d%}dd&d' |D }d|d(d' |	D  }|||||fS ))N	CUDA_HOMEZ	CUDA_PATHr   cpp_extensionrJ   r  includeZLIBDIRrO   Z	torch_cpugompZtorch_pythonZompz/libcudart_static.azlibcudart_static.astubsr  z-D CPU_CAPABILITY=z-D CPU_CAPABILITY_z-D HAVE_Z_CPU_DEFINITIONz -D USE_CUDAZc10_hipZ	torch_hiprQ   )Zc10_cudarQ   Z
torch_cudar  Z
OMP_PREFIXzomp.hr%  z-environment variable `OMP_PREFIX` is invalid.ZCONDA_PREFIXx86_64zlibiomp5.dylibZiomp5Zc10)z-Wl,-Bstaticz-lcudart_staticz-Wl,-Bdynamicc                 S     g | ]}d | qS )-Lr5   r   pr5   r5   r9   rO    rU  z1get_include_and_linking_paths.<locals>.<listcomp>c                 S  r  )z-lr5   r  r5   r5   r9   rO    rU  )5r   r  rW   r  rX   r  r-   rQ   torch.utilsr	  rS   r  r  r  Zenable_kernel_profileinclude_paths	sysconfigr   Zlibrary_pathsget_config_varcpp_prefix_path	enumerate
startswithr   walkrY   r=  r  rF   upperr  rO   rP   r^   r  r  warningswarnr  unamer  r  r  Zabi_compatibleZTORCH_LIB_PATHZsleefZopenmpZ
cc_includeZlibgccZlibgcc_archZlibgcc_backwardglibcZlinux_kernel)r  r  rQ   r  r	  macrosr  ipathslpathslibsirX   rootdirsfilescapr  header_pathZ	valid_envZconda_lib_pathr  Zstatic_link_libsZ
lpaths_strZlibs_strr5   r5   r9   get_include_and_linking_paths  s   





	









r)  inputUnion[str, List[str]]r  use_absolute_pathc
              	   C  s  t ||||\}
}}}}t| tr| g} ddd |
D }d}t rV|r-|	s-| }|}ndd | D }tj|}t	 s?J |d7 }|d7 }dt
  }|d	t
  7 }n| }|}d}d|}td
ddg dt  d| dt| dt| dt  dt  d| d| d| d| d| d| d| dt  dt  dt  dt  dt| d| d S )Nr  c                 S  r  z-Ir5   r  r5   r5   r9   rO    rU  z'cpp_compile_command.<locals>.<listcomp>rJ   c                 S  s   g | ]}t j|qS r5   )rW   rX   r   )r   r#  r5   r5   r9   rO    s    z --rtlib=compiler-rtz -fuse-ld=lldz-Bz -Lz[ \n]+z
            z
            -o z	
        )r)  r   rF   rY   r   r  rW   rX   r   r  r-   Z	glibc_libr  subr  r  r  r  r  r  r  r  r  r  r   )r*  r  r  r  r  r  rQ   r  r  r,  r   r!  r"  r  r  Z
ipaths_strZclang_flagsZinp_nameZout_nameZlinker_pathsZinp_name_strr5   r5   r9   r    s   



	

r  cmdc              
   C  sF   t | } zt|  W d S  tjy" } zt| |j|d }~ww r4   )r  r  r  r  CalledProcessErrorr    CppCompileErrorr  )r/  r   r5   r5   r9   run_command_and_check  s   
r2  c                 C  s   |  drtj| S | dfS )zDReturns the path where the AOT Inductor compiled kernels are stored..sorJ   )endswithrW   rX   r  )rX   r5   r5   r9   split_aot_inductor_output_path  s   
r5  c                   @  sB   e Zd ZU e Zded< eejZedd
dZ	edddZ
dS )CudaKernelParamCachezDict[str, Dict[str, str]]rz   r   rF   paramsDict[str, str]r   r;   r?   c                 C  sL   t jjd u rdnd}t|||ttjjd d\}}||t < || j	|< d S )Nr   r   r   )r   r   )
rO   rP   r^   r   r5  r   r  r  r_   rz   )clsr   r7  r   Zbin_typerL   rX   r5   r5   r9   r3    s   

	zCudaKernelParamCache.setOptional[Dict[str, str]]c                 C  s   | j |d S r4   )rz   r   )r9  r   r5   r5   r9   r        zCudaKernelParamCache.getN)r   rF   r7  r8  r   rF   r;   r?   )r   rF   r;   r:  )r   r   r   r>  rz   r   r   rl  classmethodr3  r   r5   r5   r5   r9   r6    s   
 
r6  c                   @  s4   e Zd ZU e Zded< eejZedddZ	dS )AotCodeCacher8  rz   r	  r)   source_coderF   serialized_extern_kernel_nodesro  rQ   r<   r;   c              
     s"  t  }ttdd|||jd}d}d}t r.t }	|s)|jr)t }
d}d}n	t	 }
nd}	d}
t
tjj\}}t|d||d	\}}|| jvse|rVtj| j| |kse|rtj| j| |krd
dlm} t }|tj||d td}| t r|rtj|d
 d }t|d}|| W d    n1 sw   Y  |rtjjn	tj|d
 d }tj|stj|d
 d }t|||||jd|d}td| |rt|||  t|d nt | d2dd d fdd|j!" D }t|d|d\}}tj|d
 d }|r<|	 d tj| d!tj| }t|||  t|d n|	 d | d!| }t | td"| |
 d#| d!| }td$| t | d%| }td&| t | |rt#$d'd(tj|}nt#$d'd(|}g }|%|
 d)| d*|  |%|
 d)| d+|  |%|
 d)| d,|  td-d!| |D ]}t | qt||g||||j|d.}td/| |rt||g||  t|d0 nt | ntd1| || j|< W d    n	1 sw   Y  | j| S )3Nr#  o)r  rQ   r  FTldobjcopyr  )r   r   r   rb  rd  re  z.jsonr   r3  z.o)r*  r  r  rQ   r  r  r,  zaot compilation command: %si  r   r   r;   r   c                 S  sN   dd l }|  dkrdS |   }|| ||j|  }t	|j
S )Nr       )ctypesZnumelZuntyped_storagerG   castZdata_ptrPOINTERc_ubytenbytesr   r$  )r   rD  Zt_cpuZ	raw_arrayr5   r5   r9   	_to_bytesX  s   
z'AotCodeCache.compile.<locals>._to_bytesrC  c                 3  s    | ]} |V  qd S r4   r5   )r   ZtensorrI  r5   r9   r   h  s    
z'AotCodeCache.compile.<locals>.<genexpr>r  )r   z -r -b binary -o r  zaot constant binary command: %szC --rename-section .data=.lrodata,alloc,load,readonly,data,contents zaot constant obj command: %szrm z$aot constant bin removal command: %sz[\W]rL   z --redefine-sym _binary_z#_start=_binary_constants_bin_start z!_size=_binary_constants_bin_size z_end=_binary_constants_bin_end z'aot constant binary redefine symbol: %s)r*  r  r  rQ   r  r,  zaot linkage command: %si  z.aot_inductor dynamic library already exist: %s)r   r   r;   r   )&r  r   r  r  r   r  r-   rA  Zobjcopy_fallbackrB  r5  r  r  r   rz   rW   rX   r  r   rg  rc  r   rY   rh  splitextr   r   rE  rF  r  r  chmodr2  ry  r   r  r.  r=  )r9  r	  r>  r?  rQ   picked_vec_isacpp_commandZfbcode_aot_cpu_rer,  Z
ld_commandZobjcopy_commandZspecified_output_pathZspecified_so_namer   r  rc  r   ri  Zoutput_jsonr   Z	output_soZoutput_or/  Zaot_constantsZ
consts_keyZconsts_pathZconsts_obodyZsymbol_listr5   rJ  r9   compile  s   





	

$



zzAotCodeCache.compileN)
r	  r)   r>  rF   r?  ro  rQ   r<   r;   rF   )
r   r   r   r>  rz   r   r   rl  r<  rP  r5   r5   r5   r9   r=    
   
 
r=  c                  C  sR   t tjd } |  }| }t|d\}}W d    |S 1 s"w   Y  |S )Nzcodegen/cpp_prefix.hrB  )r   r  r   r   r"  r   )rX   r   r   rL   filenamer5   r5   r9   r    s   


r  c                  C  s.   t  } t rdtj|  dS d|  dS )Nz
#include "")r  r   r  rW   rX   r   )rR  r5   r5   r9   
cpp_prefix  s   rT  r  r  	List[str]c              
   C  s  t | tr| gn| }dd |D }zt rt }tj|}tj|}tjt	j
jjd}t L}	t|tj|	| t||D ]\}
}t|
tj|	| qDtj|	d}t|| t||	|}tj|rrt| t|| W d    n1 sw   Y  W d S W d S tj|tjd W d S  tjy } z"|jd}d|v pd|v }|rtjdkrd	}||7 }t|||d }~ww )
Nc                 S  s$   g | ]}t  rtj|n|qS r5   )r   r  rW   rX   r   )r   ipr5   r5   r9   rO    s    z compile_file.<locals>.<listcomp>r
  )r  rh   z'omp.h' file not foundr   r  a  

OpenMP support not found. Please try one of the following solutions:
(1) Set the `CXX` environment variable to a compiler other than Apple clang++/g++ that has builtin OpenMP support;
(2) install OpenMP via conda: `conda install llvm-openmp`;
(3) install libomp via brew: `brew install libomp`;
(4) manually setup OpenMP and set the `OMP_PREFIX` environment variable to point to a path with `include/omp.h` under it.) r   rF   r   r  r  rW   rX   r   rY   rO   utilsr	  _TORCH_PATHtempfileTemporaryDirectoryrj  r   zipcopytreer.   r   remover  r  STDOUTr0  r  r   rS   r  r    r1  )r  r  r/  Zinput_pathsZinput_filesr(  header_nameZoutput_nameZtorch_includes_pathZtmp_dirr  r   Zdest_include_pathZoutput_file_pathr   r  Zopenmp_problemZinstructionr5   r5   r9   r    sD   


(	r  zOptional[CDLL]_libgompc                   @  sB   e Zd ZU e Zded< eejZedddZe	dd
dZ
dS )CppCodeCacheDict[str, CDLL]rz   rX   rF   r;   r
   c              
   C  s   zt | W S  tyG } z5dt|v r*tjdr*t dat | W  Y d }~S dt|v rBt| dt	  dt	  d| d }~ww )Nr  z/usr/lib64/libgomp.so.1z(failed to map segment from shared objectz3.  The most common reason this may occur is if the zl folder is mounted with noexec (e.g., by default Docker mounts tmp file systems as noexec).  Please remount zi with exec enabled, or set another temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable.)
r	   LoadLibraryOSErrorrF   rW   rX   r   r`  rY  
gettempdir)rX   r   r5   r5   r9   _load_library  s"   
zCppCodeCache._load_libraryr>  c                 C  s   t  }ttdd|d}t|d|d\}}|| jvrnddlm} t }|tj	
||d td	}|4 |d d
 d }	tj	|	sQtt||	|d}
t||	|
 | |	| j|< || j| _W d    n1 siw   Y  | j| S )Nr#  r@  )r  r  r   r   rb  rd  re  r  r  )r*  r  r  )r  r   r  r   rz   rg  rc  r   rW   rX   rY   rh  r   r  r  r  rf  r   )r9  r>  rM  rN  r   r  rc  r   ri  r  r/  r5   r5   r9   r   "  s(   

zCppCodeCache.loadN)rX   rF   r;   r
   )r>  rF   r;   r
   )r   r   r   r>  rz   r   r   rl  rf  r<  r   r5   r5   r5   r9   ra    s   
 
ra  c                   @  s   e Zd ZU e Zded< e Zded< eejZe	ddddZ
e				d d!ddZe			d"d#ddZe	edd$ddZdS )%r  zDict[str, ModuleType]rz   z Dict[str, List[Tuple[Any, ...]]]linemapsrJ   r>  rF   r   r;   r   c                 C  s   t |d|dS NrK   rg  )r   )r9  r>  r   r5   r5   r9   r   @  r;  zPyCodeCache.writeNlinemaprr  attrsr   r   c                 C  s"   t |d|d\}}| ||||S ri  )r   r  )r9  r>  r   rj  rk  r   rX   r5   r5   r9   r   D  s   zPyCodeCache.loadr   rX   c                 C  s  |d u rg }|| j vrt|p}z
t| |d}W n ty7 } ztd| dt|j d| d d }~ww tt d| }||_	||_
t||j|j |tj|j< | j || tt| | j|< |d urx| D ]
\}	}
t||	|
 qmW d    n1 sw   Y  | j | S )NexeczFailed to import r8  r7  rI   )rz   r   rP  r"  r  rq   r2  r   r   r  r   rl  __dict__rS   modulesr   r;  r[  rh  r   setattr)r9  r   rX   rj  rk  r   r   r   modr4  rC  r5   r5   r9   r  O  s6   


zPyCodeCache.load_by_key_pathlinenor   Optional[List[Dict[str, Any]]]c                 C  sX   || j vrd S | j | \}}t||}|dkrd S ||d  }|s#d S d	dd}||S )
Nr   r   stack_tracerF   r;   List[Dict[str, Any]]c                 S  s"   d}t || }dd t|D S )Nz"File "(.+)", line (\d+), in (.+)\nc                 S  s"   g | ]\}}}|t ||d qS ))rR  linerE   )r   )r   r   lnr5   r5   r9   rO    s    zPPyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace.<locals>.<listcomp>)r  findallreversed)rs  regexmatchesr5   r5   r9   parse_stack_trace  s
   z<PyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace)rs  rF   r;   rt  )rh  r   )r9  rX   rq  r?  nodesr  entryr|  r5   r5   r9   stack_frames_for_codeq  s   



z!PyCodeCache.stack_frames_for_coderJ   )r>  rF   r   rF   r;   r   )rJ   NN)
r>  rF   r   rF   rj  rr  rk  r   r;   r   )NN)
r   rF   rX   rF   rj  rr  rk  r   r;   r   )rX   rF   rq  r   r;   rr  )r   r   r   r>  rz   r   rh  r   rl  r<  r   r   r  r   r   r  r5   r5   r5   r9   r  ;  s$   
 

!r  c                   @  s4   e Zd ZU e Zded< eejZedddZ	dS )CppWrapperCodeCacherb  rz   r>  rF   	func_namer   rQ   r<   r;   r
   c                 C  s  d| }t |}tj|st| d}tj|| d| }td| || jvrtd| ddl	m
}	 t }
|	tj|
|d td	}| tj|std
| t }t }t }t }tt |d\}}}}}t }t }| d| d| d| d| d| d| }| d| d| d}tjjj|||g|g|g|g|dd}td| n.td| tj||}|d usJ tj|}t|jtj sJ |j!| td| || j|< W d    n1 sw   Y  | j| S )NZinline_extension_r  rI   zCpp wrapper code path %szCpp wrapper cache miss for %sr   rb  rd  re  zCpp wrapper building %s)r  rQ   r  z                     z -ffast-mathT)rE   Zbuild_directoryZcpp_sourcesZ	functionsextra_cflagsextra_ldflagsZextra_include_pathsZuse_pchzCpp wrapper done building %sz(Found target .so, cpp wrapper loading %szCpp wrapper done loading %s)"r]   rW   rX   r   rZ   rY   rE  rF  rz   rg  rc  r   rh  r  r  r  r  r)  r  r  r  rO   rW  r	  Zload_inline	importlibutilspec_from_file_locationmodule_from_specr   loaderr   Loaderexec_module)r9  r>  r  r   rQ   rE   r\   extfilepathrc  r   ri  Z
_cpp_flagsZ
_opt_flagsZ_sharedZ_warning_all_flagZ_ipathsZ_lpathsZ_libsZ_macrosZ_build_arch_flagsZ_use_custom_generated_macrosZ_cpp_wrapper_flagsr  r  rp  r&  r5   r5   r9   r     st   




4zCppWrapperCodeCache.loadN)
r>  rF   r  rF   r   rF   rQ   r<   r;   r
   )
r   r   r   r>  rz   r   r   rl  r<  r   r5   r5   r5   r9   r    rQ  r  c                   @  s   e Zd Zed	ddZdS )
TritonCodeCachekernel_namerF   r>  r;   r   c                 C  s   t |}t||S r4   )r  r   getattr)r9  r  r>  rp  r5   r5   r9   r     s   

zTritonCodeCache.loadNr  rF   r>  rF   r;   r   )r   r   r   r<  r   r5   r5   r5   r9   r    s    r  ro  c                   C  s\   t tjjrtjjS t tdrtddS t tdr,tjtdddS dS )NZCUDACXXrJ   r  zbin/nvccZnvcc)	r!   Z
nvcc_existr   rQ   Zcuda_cxxrW   r  rX   rY   r5   r5   r5   r9   _cuda_compiler  s   r  c                  C  s<   t jj} tj| dtj| dtj| dtj| dgS )Nr
  ztools/library/includeztools/library/srcztools/util/include)r   rQ   Zcutlass_dirrW   rX   rY   )Zcutlass_pathr5   r5   r9   _cutlass_include_paths  s   r  c                  C  s   ddl m}  g }t rDd}tj| |s!tj| dr!d}|d| |  |d| |d  |d |d |S td	)
Nr   r  Zlib64r%  r  r  z-lcudaz-lcudartzMUnsupported env, failed to find cuda libs! Currently only Linux is supported.)	r  r	  r$   rW   rX   r   Z_join_cuda_homer=  NotImplementedError)r	  r  Zextra_lib_dirr5   r5   r9   _cuda_lib_options  s(   

r  c                   C  s   g dS )N)z-fPICz-fno-strict-aliasingz-fvisibility=hiddenz-Wconversionr5   r5   r5   r5   r9   _nvcc_host_compiler_options  s   r  c               	   C  s   t  } | dkr
d} d|  d|  g}tjjr |d|  g7 }dddd	|  d
d| dtjjddg}tjjr@|g d tjj	rK|g d tjj
rV|ddg |S )NZ90Z90aZsm_Zcompute_Zlto_z-t=0z"-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1z-wz-gencode=arch=compute_z,code=[,]r  z--expt-relaxed-constexpr)z	-lineinfoz-gz-DCUTLASS_DEBUG_TRACE_LEVEL=1)z--keepz,--ptxas-options=--warn-on-local-memory-usagez --ptxas-options=--warn-on-spillsz--resource-usagez--source-in-ptxz--use_fast_mathz -DCUTLASS_USE_TANH_FOR_SIGMOID=1)r!   Zget_cuda_archr   rQ   Zenable_cuda_ltorY   Zcompile_opt_levelZenable_debug_infoextendZenable_ptxas_infoZuse_fast_math)archr   optionsr5   r5   r9   _nvcc_compiler_options  s6   		r  	src_filesdst_filedst_file_extc           
      C  s   t  }t }t }t }|dd |D  dd |D  | }d| }d}	|dkr;t  dd| d| d| }	n#|dkrV|d	 t  dd| d
| d| }	ntd| dt	d|	 |	S )Nc                 S  s(   g | ]}d |v rd| nd| qS )=z-Xcompiler z-Xcompiler=r5   )r   optr5   r5   r9   rO  I  s    z(cuda_compile_command.<locals>.<listcomp>c                 S  r  r-  r5   )r   rX   r5   r5   r9   rO  M  rU  r  rJ   r@  z -c -o r  z-sharedz -o zUnsupported output file suffix !zCUDA command: %s)
r  r  r  r  rY   r  r=  r  rE  rF  )
r  r  r  r  Zcuda_lib_optionsZnvcc_host_compiler_optionsZnvcc_compiler_optionsr  Zsrc_fileresr5   r5   r9   cuda_compile_command>  s0   
	$
$r  c                   @  sJ   e Zd ZdZdddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dS )
DLLWrapperz A wrapper for a dynamic library.lib_pathrF   c                 C  s   || _ t|| _d| _d S )NT)r  r	   rc  DLLis_open)r   r  r5   r5   r9   r   `  s   
zDLLWrapper.__init__c                 C  s   | j r|   d| _ d S d S r>   )r  _dlcloser   r5   r5   r9   closeh  s   
zDLLWrapper.closec                 C  sl   d }t  rtd }t|dstd}t|dr|j}ntd|d ur/tg|_|| jj d S t	
d d S )Ndlclosezlibc.soz&Unsupported env, failed to do dlclose!zKdll unloading function was not found, library may not be unloaded properly!)r$   r
   hasattrr  r  r   argtypesr  _handlerE  warning)r   Z	f_dlcloseZsymsr5   r5   r9   r  m  s   

zDLLWrapper._dlclosec                   s2   | j std| j t| j|  fdd}|S )NzCannot use closed DLL library: c                    s     |  }|rt d j d S )NzError in function: )rq   r   )r7   errmethodr5   r9   _wrapped_func  s   z-DLLWrapper.__getattr__.<locals>._wrapped_func)r  rq   r  r  r  )r   rE   r  r5   r  r9   __getattr__  s
   zDLLWrapper.__getattr__c                 C  r  r4   r5   r   r5   r5   r9   	__enter__  r:   zDLLWrapper.__enter__c                 G     |    d S r4   r  )r   r7   r5   r5   r9   __exit__  r  zDLLWrapper.__exit__c                 C  r  r4   r  r   r5   r5   r9   __del__  r  zDLLWrapper.__del__N)r  rF   )r   r   r   r   r   r  r  r  r  r  r  r5   r5   r5   r9   r  ]  s    
r  c                   @  sh   e Zd ZU ejG dd dZe Zded< e	ej
Z
dZeddd	ZedddZedddZdS )CUDACodeCachec                   @  s   e Zd ZU ded< ded< dS )zCUDACodeCache.CacheEntryrF   r  r  N)r   r   r   r   r5   r5   r5   r9   
CacheEntry  s   
 r  zDict[str, CacheEntry]rz   rH   r;   r   c                 C  s.   t tdgd|}t|| j|d\}}||fS )z
        Writes source code into a file with dst_file_ext as the file extension.
        Returns the hash key of source code, and the path to the file.
        Zdummy_inputZdummy_outputrg  )r   r  r   _SOURCE_CODE_SUFFIX)r9  r>  r  Zcuda_commandr   r  r5   r5   r9   r     s   
zCUDACodeCache.writer   c                 C  s
  |  ||\}}|| jvr|ddlm} t }|tj||d td}|O |dt	| j
  | }tj|sdt|g||d}	ztj|	tjtjd W n tjyc }
 zt|	|
j|
d}
~
ww t||| j|< W d   n1 sww   Y  | j| j||fS )z
        Compiles CUDA source_code into a file with dst_file_ext extension.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        r   rb  rd  re  Nr  r  )r   rz   rg  rc  r   rW   rX   rY   rh  r  r  r   r  r  r  r  r^  r  r0  r    ZCUDACompileErrorr  r  r  r  )r9  r>  r  r   r  rc  r   ri  r  r/  errorr5   r5   r9   rP    s0   


zCUDACodeCache.compileTuple[DLLWrapper, str, str]c                 C  s<   |dkrt d| d| | ||\}}}t|||fS )z
        Compiles source code and loads the generated .so file.
        Returns a tuple of DLLWrapper, hash_key, source_code_path
        r  zCOnly support loading a .so file for now. Requested file extension: z. Source code: )rq   rP  r  )r9  r>  r  Zdst_file_pathr   Zsource_code_pathr5   r5   r9   r     s   
zCUDACodeCache.loadN)r;   r   )r;   r   )r;   r  )r   r   r   dataclasses	dataclassr  r>  rz   r   r   rl  r  r<  r   rP  r   r5   r5   r5   r9   r    s   
 
r  c                  C  s&   t  D ]\} }| r|j  qd S r4   )r   r~   Workerrl   )rL   device_interfacer5   r5   r9   caching_device_properties  s
   
r  r  r>  r  r   re   r   c                 C  s4   t |j}|j|j t| |}|j|d d S )N)Zwarm_cache_only_with_cc)r   r2  r  Z
set_deviceindexr  r   
precompile)r  r>  r  re   r  kernelr5   r5   r9   _worker_compile  s   
r  r   c                 C  s   t | |}|  |S r4   )r  r   r  )r  r>  r  r5   r5   r9   _load_kernel  s   r  c                   @  s*   e Zd ZU ded< dd
dZdddZdS )TritonFuturer   r  r  rF   r>  futureFuture[Any]r;   r?   c                 C  s   || _ || _|| _d S r4   )r  r>  r  )r   r  r>  r  r5   r5   r9   r     s   
zTritonFuture.__init__c                 C  sv   t  }t| dr| jS | j  t| j| j }| _t  | }|dkr3td| d| j  t| j | `| `| `|S )Nr  2   z"Detected long compilation time of z seconds for kernel name )	r   r  r  r  resultr  r  r>  r#   )r   t0r  Zlatencyr5   r5   r9   r  	  s   



zTritonFuture.resultN)r  rF   r>  rF   r  r  r;   r?   )r;   r   )r   r   r   r   r   r  r5   r5   r5   r9   r    s   
 
r  c                   s&   d fdd}t |ddat  d S )Nr;   r?   c                     s*   	 t d  t krtt tj q)NTr   )r   rW   getppidkillr   signalZSIGKILLr5   	orig_ppidr5   r9   run 	  s
   z'_async_compile_initializer.<locals>.runT)targetdaemonr   )r   _watchdog_threadstart)r  r  r5   r  r9   _async_compile_initializer	  s   r  zOptional[Thread]r  c                   @  s   e Zd Zd*ddZeedd+ddZeedd,d
dZe	d*ddZ
e	d-ddZe	d.ddZ	d/d0ddZd1d!d"Zd#d$ Zd2d'd(Zd)S )3AsyncCompiler;   r?   c                 C  r3   r4   r5   r   r5   r5   r9   r   /	  r:   zAsyncCompile.__init__r   r   c                   C  s   t jdksJ tt jS Nr   )r   compile_threadsr   r5   r5   r5   r9   pool2	  s   
zAsyncCompile.poolr   c                  C  sX   t   tjdks
J t } ttj}ttj|t	t
| d}tjjd |jtjd |S )Nr   )Z
mp_contextZinitializer)Zexitpriority)r  r   r  rW   r   multiprocessingZget_contextZworker_start_methodr   r   r  r  ZFinalizeshutdownrS   maxsize)r  ctxr  r5   r5   r9   process_pool8	  s   	zAsyncCompile.process_poolc                 C  sd   t jdkrd S t  |  }t|dr|  ntt jD ]}|  qt|dr-|  t	  d S )Nr   _start_queue_management_thread_start_executor_manager_thread)
r   r  rA   r  r  r  r<  Z_adjust_process_countr  rD   )r9  r  rL   r5   r5   r9   	warm_poolN	  s   





zAsyncCompile.warm_pooltaskra  r   c                 C  s   t jdkr| S |  |S r  )r   r  r  submit)r9  r  r5   r5   r9   r  l	  s   
zAsyncCompile.submitfnseqr   c                   sB   t jdkst|dkrtt|S dd  fdd|D D S )Nr   c                 S  s   g | ]}|  qS r5   )r  )r   r   r5   r5   r9   rO  v	  rU  z$AsyncCompile.map.<locals>.<listcomp>c                   s   g | ]
}   |qS r5   )r  r  )r   r  r9  r  r5   r9   rO  v	  rP  )r   r  r  r;  map)r9  r  r  r5   r  r9   r  r	  s   zAsyncCompile.maprQ   r  rF   r>  
device_strUnion[TritonFuture, ModuleType]c                 C  s^   t   tjdkr*t|}t|| }||}|  	t
||||}t|||S t||S r  )rA   r   r  r   rO   re   rm   Zget_compute_capabilityr  r  r  r  r  )r   r  r>  r  r  re   r  r  r5   r5   r9   rc   x	  s   



zAsyncCompile.tritonr   c                   s    fdd}|  |S )Nc                     s   t  jS r4   )ra  r   r  r5   r>  r5   r9   r  	  r  zAsyncCompile.cpp.<locals>.taskr  )r   r>  r  r5   r  r9   r  	  s   
zAsyncCompile.cppc                   s    fdd}|  |S )Nc                     s   t  d S r  )r  r   r5   r  r>  r5   r9   r  	  s   zAsyncCompile.cuda.<locals>.taskr  )r   r>  r  r  r5   r  r9   rQ   	  s   
zAsyncCompile.cudascoperb   c                 C  s   t dd | D }t|dtjdd}tjdkrA| D ]#\}}tjr.t|ts.|	| t|t
tfr@| ||< |d qt  d S )Nc                 S  s"   g | ]\}}t |ttfr|qS r5   )r   r   r  )r   r   r   r5   r5   r9   rO  	  s    z%AsyncCompile.wait.<locals>.<listcomp>zInductor Compilationr   )totaldescdisabledelayr   )r  r   r,   r   Zdisable_progressr  Zverbose_progressr   r+   Zset_postfix_strr   r  r  updaterD   )r   r  Znum_kernelsZpbarr   r  r5   r5   r9   wait	  s(   



zAsyncCompile.waitNr   )r;   r   )r;   r   )r  ra  r;   r   )r  ra  r  r   r;   r   )rQ   )r  rF   r>  rF   r  rF   r;   r  )r>  rF   r;   r   )r  rb   r;   r?   )r   r   r   r   r   r   r   r  r  r<  r  r  r  rc   r  rQ   r  r5   r5   r5   r9   r  .	  s&    

r  r   r   )rE   rF   r;   rF   r9  )r   r   r;   rF   r  )r   r   r   rF   )r   rF   r   rF   r   rF   r;   r   )rJ   r   )r   r   r   rF   r   rF   )rJ   r   rJ   )r   r   r   rF   r   rF   r   rF   r   rF   r;   r   )rX   rF   r   r   r;   r?   )r   r   r;   r   )r  r   r;   r   r  )r+  r,  r-  r.  r/  rb   r;   rF   )r\  r]  r   r   r;   r   )r  rF   r;   rF   )r;   r  )r;   r  )T)r  r<   r;   rF   )r  r<   r;   rF   )r  r<   r;   rF   )r;   r  )
r  r<   r  r  rQ   r<   r  r<   r;   r  )r*  r+  r  rF   r  r<   r  r<   r  r<   r  r  rQ   r<   r  r<   r  r<   r,  r<   r;   rF   )r/  rF   )rX   rF   r;   r   )r  r+  r  rF   r/  rU  r;   r?   )r;   ro  )r;   rU  )r  rU  r  rF   r  rF   r;   rF   )
r  rF   r>  rF   r  r   re   r   r;   r?   r  )
__future__r   r   r  r  r   rr   r  r  rt   loggingr  rW   r   r#  r  r  r  r  rj  r  r  rS   r  rY  r   r  r  bisectr   concurrent.futuresr   r   r   r   rD  r   r	   r
   r   r   r   r   r   r   r   typesr   typingr   r   r   r   r   r   r   r   r   rO   Ztorch._dynamo.device_interfacer   r   Ztorch._dynamo.utilsr   Ztorch._inductorr   r    Ztorch._inductor.codegen.cudar!   Ztorch._inductor.utilsr"   r#   r$   Ztorch._prims_commonr%   Z%torch.fx.experimental.symbolic_shapesr&   r'   r(   Ztorch._inductor.graphr)   Z torch._inductor.select_algorithmr*   Z	torch.hubr+   r,   rX   abspathr  Z_HEREr  rX  r  Z	triton.fbr-   Ztriton.fb.buildr.   Ztorch._inductor.fb.utilsr/   r0   r1   r2   rh  rB   r@   rA   rD   	getLoggerr   rE  r]   r_   ra   r   r   r   r   r   r   r   r   r   r  r   r   r  r  r  r  r  Picklerr  r   r(  r)  r*  rH  rI  r]  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r)  r  r2  r5  r6  r=  r  rT  r  r`  r   ra  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r5   r5   r5   r9   <module>   sr   ,

	KT

!	D 78
_	 '9 ;30SL

	'=H	(