o
    I&i                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d d	l m!Z!m"Z" d d
l#m$Z$m%Z%m&Z& d dl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8 ddl+m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZH ddlImJZJ ddlKmLZLmMZMmNZN ddlOmPZP eQeRZSejTUeRdZVejTUeRdZWdd ZXdd ZYdd ZZG dd dej[j\Z]dS )     N)defaultdict)contextmanager)AnyCallableDefaultDictDictListOptionalSetTuple)get_decompositions)defakedynamo_timed)
LazyString)
FakeTensor)magic_methodsmethod_to_operator)has_free_symbolsShapeEnvSymTypes)no_dispatch   )configir)get_scheduling_for_deviceget_wrapper_codegen_for_deviceregister_backend_for_device)CppWrapperCodeGenCudaWrapperCodeGenWrapperCodeGen)CppWrapperCodeGenErrorLoweringExceptionMissingOperatorWithDecompMissingOperatorWithoutDecomp)ConstantFixedLayoutInputBuffer	Pointwise	Reduction
StorageBox	TensorBox)FALLBACK_ALLOW_LISTfallback_handler%fallback_node_due_to_unsupported_typelayout_constraints	loweringsmake_fallbackneeds_realized_inputsunsupported_output_tensor)SizeVarAllocator)convert_shape_to_inductorgather_originsget_sympy_Expr_dtype)VZ
perf_hintsoutput_codec              
   C   s\   t jt jt jt jt jt jt jt jt j	t j
h
}|r*|t j |t j |t j | |v S N)torchfloat32Zfloat64int64Zint32Zint16Zint8Zuint8boolZbfloat16Z	complex64addZfloat16Zfloat8_e4m3fnZfloat8_e5m2)dtypecudaZsupported_dtype rA   @C:\wamp64\www\opt\env\Lib\site-packages\torch/_inductor/graph.pysupported_dtype_of_cpp_wrapperB   s    rC   c                 C   sh   t | tjtjtjjjfsJ dt | tjjjrtjS t | tjr&t	| S | j
r,tjS | jr2tjS d S )Nzgget_constant_buffer_dtype only supports input of sympy.Symbol, sympy.Expr or sympy.core.numbers.Integer)
isinstancesympySymbolExprcorenumbersIntegerr:   r<   r6   
is_integerZis_floatr;   )Zconstant_bufferrA   rA   rB   may_get_constant_buffer_dtypeX   s   rL   c                 C   s   dd t D }| |v S )Nc                 S   s   h | ]}t |qS rA   )r   ).0mrA   rA   rB   	<setcomp>k       z"is_magic_method.<locals>.<setcomp>)r   )opZ	magic_opsrA   rA   rB   is_magic_methodj   s   rR   c                
       s6  e Zd ZU eej ed< dejfddZ	dejfddZ
dd Zd	d	d	d	d
d
e d	d	d
f
dejjdeeej  f fddZedefddZdd Zdd ZdejfddZedd ZdefddZdefddZdefd d!Ze fd"d#Zd$ej fd%d&Z!d'ee fd(d)Z"d*d+ Z#d,efd-d.Z$d^d/d0Z%d,ed1eej fd2d3Z&d4ef fd5d6Z' fd7d8Z(ed9ejdefd:d;Z)d<d= Z*d>d? Z+d@dA Z, fdBdCZ-dDdE Z.e/dFejj0fdGdHZ1dIejj0f fdJdKZ2dLdM Z3dNdO Z4dPdQ Z5dRdS Z6dTdU Z7edVdW Z8dXdY Z9dZd[ Z:d,efd\d]Z;  Z<S )_GraphLoweringgraph_outputsexc                 C   sx   | j rt| t| fS ddlm} |dt| jj }| j	||\}}}dd |D }dd |D }||fS )z
        Support dynamic shapes and dynamic strides by assigning variables
        to each dimension.  We duck-shape tensors, so if two tensors
        have the same size they get assigned the same symbolic variable.
        r   )ConstantSourceZ__inductor_unknown_tensor_c                 S   $   g | ]}t |tjr|jjn|qS rA   rD   r:   SymIntnodeexprrM   irA   rA   rB   
<listcomp>      $ z8GraphLowering.symbolic_sizes_strides.<locals>.<listcomp>c                 S   rW   rA   rX   r\   rA   rA   rB   r^      r_   )
reuse_shape_envr4   sizestrideZtorch._dynamo.sourcerV   len
_shape_envZ
var_to_valZ,create_symbolic_sizes_strides_storage_offset)selfrU   rV   sourcera   rb   _rA   rA   rB   symbolic_sizes_stridesr   s&   z$GraphLowering.symbolic_sizes_stridesc                 C   s,   dd |  D }dd | D }||fS )z+
        Primarily used to weights
        c                 S      g | ]}t |qS rA   rE   rJ   r\   rA   rA   rB   r^          z6GraphLowering.static_sizes_strides.<locals>.<listcomp>c                 S   ri   rA   rj   r\   rA   rA   rB   r^      rk   )ra   rb   )re   rU   ra   rb   rA   rA   rB   static_sizes_strides   s   z"GraphLowering.static_sizes_stridesc                 C   sP   t dd u rddlm} td|t t dd u r&ddlm} td|t d S d S )Ncpur   )CppSchedulingr@   )CUDACombinedScheduling)r   Zcodegen.cpprn   r   r   Z codegen.cuda_combined_schedulingro   )re   rn   ro   rA   rA   rB   init_backend_registration   s   z'GraphLowering.init_backend_registrationNFgmexample_inputsc                    s  t  | || _|	d ur|	n| j||d| _d| _|| _d| _|d u r+t }d| _	n|| _
d| _	|| _
t|| _i | _i | _t | _t | _d| _g | _i | _i | _t | _t | _t | _t | _t | _d | _g | _|
| _d | _|| _i | _ t | _!g | _"i | _#t$t%| _&t'' | _(d| _)|| _*|| _+|| _,d | _-| jr| . nt | _/dh| _0|| _1d| _2d| _3g | _4d| _5d| _6|7 | _8| 9  d S )N)is_inferencer   FTrS   zaten.convolution_backward ):super__init__rr   decide_layout_opt
layout_optnum_channels_last_convrs   Zextra_tracebackr   r`   rd   r3   sizevarsgraph_inputsgraph_inputs_originalsetdevice_typesdevice_idxsr@   buffers	constantsconstant_reprsremoved_buffersZremoved_inplace_buffersmutated_buffersZnever_reuse_buffersinplaced_to_removewrapper_codeextern_kernel_nodesextern_node_serializercurrent_nodenum_static_inputslistsZmutated_inputsZmutated_input_idxsname_to_bufferr   listname_to_userstimeZcreation_timenamecpp_wrapperaot_modegraph_id	schedulerfind_nodes_prefer_channels_lastnodes_prefer_channels_last_warned_fallbackuser_visible_outputs	cache_key
cache_pathcache_linemapZdisable_cudagraphsZdisable_cudagraphs_reason__copy__orig_gmrp   )re   rq   rr   Z	shape_envr   r   r   r   r   rx   r   rs   	__class__rA   rB   rv      st   



zGraphLowering.__init__returnc             
      s  t jsdS t jr
dS dd | jjD }t|}|dkrdS tjjr(tj	
 r(dS tdd |D r>tjjjr>tjj
 r>dS tt| jjd| krQtd	 dS td
d |D ratd dS dd  dd dd |rddlm} tt}|D ]g}tjj|\}}}	|r|dd#}
tj |j|i |	 W d   n1 sw   Y  W d   n1 sw   Y  |
 } |rd}n|rd}n	|rd}nd}||  |7  < q|td q|d}d}d}d}t| }|d | |d |  |d |  |d |  }||k}|std|| |S t fdd|D r-td  dS tfd!d|D r@td" dS tfd#d|D rStd$ dS dS )%zl
        Decide if we should enable layout optimization for this graph based on
        heuristics.
        FTc                 S   s"   g | ]}|j tjjjjkr|qS rA   )targetr:   opsatenconvolutiondefaultrM   nrA   rA   rB   r^   	  s    z3GraphLowering.decide_layout_opt.<locals>.<listcomp>r   c                 s   s6    | ]}d D ]}|j | jd jtdkV  qqdS )r   r   valrm   N)argsmetadevicer:   rM   r   idxrA   rA   rB   	<genexpr>  s    z2GraphLowering.decide_layout_opt.<locals>.<genexpr>i,  z*Skipped layout opt because only a few convc                 s   s.    | ]}d D ]}t |j| jd V  qqdS )r   r   N)r   r   r   r   rA   rA   rB   r   (  s    zeSee perf regression with dynamic shape. Follow up in https://github.com/pytorch/pytorch/issues/102670c                 S   s(   | j d dko| j d jd ddkS )Nr   r   r   r   ra   r   rA   rA   rB   
is_grouped2  s   (z3GraphLowering.decide_layout_opt.<locals>.is_groupedc                 S   sJ   | j d jd dd | j d jd dko$| j d jd ddkS )Nr   r   r      r   r   rA   rA   rB   is_in_out_channel5  s   0z:GraphLowering.decide_layout_opt.<locals>.is_in_out_channelc                 S   s4   | j d jd ddko| j d jd ddkS )Nr   r   r   @   r   r   rA   rA   rB   is_small_channel;  s   z9GraphLowering.decide_layout_opt.<locals>.is_small_channel)FlopCounterMode)displayNgroupedZsmallZin_outr   zConv inputs meta not foundg|?5^?gtV?g333333?guV?zhSkipped layout opt in inference because weighted flops indicate slowdown, default: %d, channels last: %dc                 3       | ]} |V  qd S r9   rA   r   )r   rA   rB   r         zFSkip layout opt because found grouped convolution with >1 in_channels!c                 3   r   r9   rA   r   )r   rA   rB   r     r   zBSkip layout opt because some convolutions have smaller out_channelc                 3   r   r9   rA   r   )r   rA   rB   r     r   z>Skip layout opt because all convolution channels are too small) r   Zlayout_optimizationZforce_layout_optimizationgraphnodesrc   r:   versionZhipr@   Zis_availableallbackendsmkldnnenabledr   logdebuganyZtorch.utils.flop_counterr   r   floatZ	_inductorZfx_utilsZget_fake_args_kwargsr7   	fake_moder   Zget_total_flopssumvalues)rq   rs   Z
conv_nodesZnconvr   Zflop_countsrZ   successr   kwargsZflop_counter_modeZcounted_flopsZ	node_typeZGROUPED_MULTIPLIERZDEFAULT_MULTIPLIERZIN_OUT_MULTIPLIERZSMALL_MULTIPLIERZtotal_flopsZweighted_flopsZdo_layout_optrA   )r   r   r   rB   rw      s   
	





	
zGraphLowering.decide_layout_optc                 C   s   t  }t| jjjD ]"}|jtjjj	j
kr|| q
|jD ]}||v r+||  nqq
| jjjD ]}||v rC|jD ]}|| q;q2|S )aC  
        The rule to decide if an node prefer channels last is simple.
        1. if it's input/output of a convolution
        2. if one of its user prefers channels last

        We have rule 1 because cudnn runs a faster convolution kernel for channels last inputs;
        Rule 2 is also important. It makes sure that indirect inputs to convolution also prefers
        channels last.

        Consider the scenario: conv -> batch-norm -> relu -> conv
        Without rule 2, batch-norm output may use a contiguous layout. That will cause 2 extra copies:
        1. the output of batch-norm should be channels last initially since its input is a conv's output.
           Forcing the batch-norm's output to be contiguous results in the first copy
        2. The second conv's input is initially contiguous. This layout is propagated from the batch-norm's output.
           We need convert it to channels last layout which results in the second copy.
        With rule 2, we makes sure all the tensors in the chain uses channels last layout. So both copies
        can be saved.
        )r}   reversedmoduler   r   r   r:   r   r   r   r   r>   users)re   Z
output_setr   userchildrA   rA   rB   r     s"   



z-GraphLowering.find_nodes_prefer_channels_lastc                 C   s*   || j vr| j | td| d S d S )NzUsing FallbackKernel: %s)r   r>   perf_hint_loginfore   r   rA   rA   rB   warn_fallback  s   
zGraphLowering.warn_fallbackr   c                 C   s.   | j |j |jd ur| j|j d S d S r9   )r~   r>   typeindexr   )re   r   rA   rA   rB   add_device_info  s   
zGraphLowering.add_device_infoc                 C   s   t jS r9   )r7   r   re   rA   rA   rB   r     s   zGraphLowering.fake_modebuffer_namec                 C   s,   || j v r
| j | S || jv r| j| S d S r9   )r   r{   )re   r   rA   rA   rB   
get_buffer  s
   



zGraphLowering.get_bufferc                 C   st   || j v r| j | jS || jv r| j|  S || jv r#| j|  S td|}|r3| |dS td| )Nz1(as_strided|reinterpret_tensor)\(([a-zA-Z0-9_]+),r   could not find )	r   r?   r   	get_dtyper{   rematchgroupKeyError)re   r   rN   rA   rA   rB   r     s   


zGraphLowering.get_dtypec                 C   s|   ddl m} || jv r| j|  S || jv r+| j| }tt|dd |r'dS | S || jv r7| j|  S t	d| )Nr   )MultiOutputLayoutZlayoutr   )
r   r   r   Znumelr   rD   getattr	get_numelr{   r   )re   r   r   bufrA   rA   rB   r     s   



zGraphLowering.get_numelc                    s   t  j| S r9   )ru   run)re   r   r   rA   rB   r     s   zGraphLowering.runbufferc                 C   sL   dt | j }| j| || j|< t|tjr| s$| |	  |S )Nr   )
rc   r   appendr   rD   r   ComputedBufferZis_zero_elementsr   
get_device)re   r   r   rA   rA   rB   register_buffer	  s   
zGraphLowering.register_bufferbuffer_namesc                 C   s   dd | }|| j|< |S )Nlist_rg   )joinr   )re   r   r   rA   rA   rB   register_list  s   
zGraphLowering.register_listc                    s    fdd  | d S )Nc                    s   t | ttfr| D ]} | q	t | tjrAt| dr0t | jtjr0t| jdr0t | jjtjs2d S |  D ]}j| 	|  q6d S d S )Ndata)
rD   r   tupler   IRNodehasattrr   Zget_read_namesr   r   )valuexZ	read_nameregisterre   rA   rB   r     s"   

z1GraphLowering.register_users_of.<locals>.registerrA   )re   Znode_outputrA   r   rB   register_users_of  s   zGraphLowering.register_users_ofr   c                 C   sD   t |tsJ | j| || jvrdS | j| D ]}|  qdS )z
        When a buffer is mutated we need to make sure all the reads to
        the old version are realized before the mutation happens.
        N)rD   strr   r>   r   realize)re   r   r   rA   rA   rB   mark_buffer_mutated,  s   

z!GraphLowering.mark_buffer_mutatedc              
      s@    fdd}||}t t|t j jg R  S )Nc                    s  j  D ]/\}} js4  | kr4  | kr4 j|jkr4 j|jkr4t |	 r4|  S q| d u rAdt
j  } | d  rLd|  } tdd| }|} d}| j v rl| d| } |d7 }| j v s\ j | < tt d j| < | S )NZconstantr   Z	constant_z[^a-zA-Z0-9_]rg   r   zutf-8)r   itemsZ	is_mkldnnra   rb   r?   r   r:   eqr   rc   isdigitr   subhashlibsha256reprencode	hexdigestr   )r   constant_namer   prefixZcntr   re   rA   rB   allocate;  s:   



z3GraphLowering.add_tensor_constant.<locals>.allocate)r*   creater   ConstantBufferr%   r   r?   rl   )re   r   r   r  rA   r  rB   add_tensor_constant:  s   z!GraphLowering.add_tensor_constantdevice_overridec                 C   sZ   | j | j|ks|du r|S | d|j |jpd }|| j vr+| j | || j |< |S )z
        We AOT copy constants to the devices they are needed on.
        If device_override doesn't match the constant's device, then
        copy it and return a different name.
        Nrg   r   )r   r   r   r   to)re   r   r  Zalt_namerA   rA   rB   r  b  s   
zGraphLowering.constant_namer   c           	   	      s   t  |||}t|tr|jj}|| j|< |S t|ttt	fr,t
|}|| j|< |S t|tjs6J ||jsA| |\}}n| |\}}tt|t|j|j||}|| j|< |jj| j|< | |j |S r9   )ru   placeholderrD   r   rZ   r[   r{   intr=   r   rE   Zsympifyr:   TensorZ_has_symbolic_sizes_stridesrl   rh   r*   r	  r&   r%   r   r?   r   r|   r   )	re   r   r   r   Zexampler[   sizesstridestensorr   rA   rB   r  o  s.   




zGraphLowering.placeholderc              
      sB  |t ju rt|d tttfrt |||S t|dr#||i |S |t	vrtt|t
jjs5J | d| dd }|tv rGt| n-tjrct|grQtnt}td|||| t| nt|grnt|||t|||ztdt	|  t	| |i |}|W S  ty } zt|||||jd d }~ww )Nr   Z_inductor_lowering_functionz is not an OpOverload.z"Creating implicit fallback for:
%sz  via %s)operatorgetitemrD   r   r   dictru   call_functionr   r/   r:   Z_opsZ
OpOverloadr   splitr+   r0   r   Zimplicit_fallbacksr   r"   r#   r   r   Zoperator_strr   	Exceptionr!   with_traceback__traceback__)re   r   r   r   	base_nameerrorouter   rA   rB   r    sL   



zGraphLowering.call_functiontc                 C   s   t | jdko| jd dkS )zM
        True if this is a small constant attr that will be inlined.
        r   r      )rc   shape)r!  rA   rA   rB   can_inline_constant  s   z!GraphLowering.can_inline_constantc                 C   s   t | j|}tjst|r| ||S t ; |jdkr-t|	 |j
|jW  d    S | |rJddlm} || |j
|jdW  d    S W d    n1 sTw   Y  | ||S )NrA   r   )r  )r?   r   )r   r   r   Zalways_keep_tensor_constantsr2   r  r   r#  r$   itemr?   r   r$  loweringr  tolist)re   r   r   r   r   r  rA   rA   rB   get_attr  s   

	zGraphLowering.get_attrc                 C      t  r9   AssertionErrorre   r   r   r   rA   rA   rB   call_module     zGraphLowering.call_modulec                 C   r)  r9   r*  r,  rA   rA   rB   call_method  r.  zGraphLowering.call_methodc           	   	      s\  t  |||}t|ttfsJ t|tdd |D s"J |dd |D | _| j	 D ]e\}}t|t
tjfsDJ dt| t|t
sJq/|  t|t
sUJ |j}t|tjs`J |}|j}t|trp| |krtj|| j|  z| j|}| j| | j|< W q/ ty   Y q/w q/|   td| j| jd ur| j d S d d S )Nc              
   s   s8    | ]}t |ttjtd tjtjtjj	j
tfV  qd S r9   )rD   r*   r   r$   r   r
  rE   rG   ZlogicZboolalgBooleanr  rM   r   rA   rA   rB   r     s    
z'GraphLowering.output.<locals>.<genexpr>c                 S   s   g | ]}t j|qS rA   )r   ExternKernelZrealize_inputr1  rA   rA   rB   r^     s    z(GraphLowering.output.<locals>.<listcomp>z'Unsupported inductor graph input type: zGForce channels last inputs for %d conv for the current graph with id %dr   )ru   outputrD   r   r   r   r   rT   r{   r   r*   rE   rG   r   r   r   r)   r&   get_nameZMutationLayoutZrealize_intor|   r   
ValueErrorfinalizer   r   ry   r   )	re   r   r   r   resultr   r   Zvalue_storage_boxindr   rA   rB   r3    sN   



zGraphLowering.outputc                 C   s   | j D ]}|  qd S r9   )r   Zdecide_layout)re   r   rA   rA   rB   r6  
  s   

zGraphLowering.finalizerZ   c                 c   s*    | j }z|| _ d V  W || _ d S || _ w r9   )r   )re   rZ   oldrA   rA   rB   set_current_node  s   zGraphLowering.set_current_noder   c                    sL  fdd}h}j dkr| \}}|t||O }tj| |  t j dkrRjt	j
urRtrR|d tjdd|i |}nRj dkryjtv ry|d tj g|R i |\}}| j||}n+tjr|d tjd	 tjrjd	 jj}nt }n
|d
 t }tjjjjtjjjjtjjjjg tdd jD }t fddjD }|s|rtjd	 tj rjd	 ! }	tj"#jd	 }
|
rt$|	rt%|	}t$|& dkr| j'v rj(| j)vr|stj*}tj+,||}t$t-j}|dkrt|t.rՈjD ]}|jt/v r|0  tjjj1jtjjj2jtjjj3jg}| j4sY|5tjjj6j tj7j8r|tjj9j:jtjj9j:j;tjj9j<j;tjj9j=jtjj9j>jtjj9j>j;tjjj?jtjj@jAjtjj@jAj;tjj@jBjg
7 }tj7jCr|tjjDjEjg7 }|j|v rtj+,|t%jd	 ! }|j dkrt|jFjFtGtHfr|I  q/|Jt$j t|t.r|K r|0  t|t.rt|jFtLr|jFjF}t|tGr|M tNjOkr|I  W d    n	1 sw   Y  W d    n	1 s"w   Y  W d    n	1 s2w   Y  t|t.rt|jFtjLrt|jFjFtjPrT|jFjF_QnKt|jFjFtjRr|jFjF_Qt|jFjFtjSr|t|jFjFjFtjPr||jFjFjF_Qn#t|jFjFtjTr|jFjFjUst|jFjFjVd tjRr|jFjFjVd _Q| W| |S )Nc                    s   t dt j|  d S )Nzlowering %s %s)r   r   r   Zformat_node)msgr   rA   rB   r     s   z%GraphLowering.run_node.<locals>.debugr  r,   F)Zadd_to_fallback_setr.   rR   r   rt   c                 s   s    | ]}|j d kV  qdS )r3  N)rQ   rM   r   rA   rA   rB   r   G  s    z)GraphLowering.run_node.<locals>.<genexpr>c                 3   s    | ]}|j  v V  qd S r9   )r   r<  )as_strided_opsrA   rB   r   H  s    

   r   r3  r   )XrQ   Zfetch_args_kwargs_from_envr5   r   r   Zcurrent_originsr:  r7   r   r  r  r-   r,   r.   r  rR   rD   r   r:   rY   rZ   r[   ru   run_noder   r   Z
as_stridedr   Zas_strided_Zas_strided_scatterr   r   r  rb   Z_prims_commonZis_non_overlapping_and_denserc   Zget_stride_orderget_sizer   r   r   ZNHWC_STRIDE_ORDERr2  Zrequire_stride_orderr}   r*   r1   Zrealize_hintZconvolution_backwardmmZ_int_mmrx   r   r   Z_CZ_has_mkldnnr   Z_convolution_pointwisebinaryZ_convolution_pointwise_Z _convolution_transpose_pointwiseZ_linear_pointwiseZmkldnn_rnn_layerZonednnZqconv2d_pointwiseZqlinear_pointwiseZhas_mklZmklZ_mkl_linearr   r'   r(   r   Z
mark_reuseZhas_exceeded_max_readsr)   Zinner_fn_str_lenr   Zrealize_bytes_thresholdZLoopsZorigin_nodeBufferr   ZMultiOutputindicesZinputsr   )re   r   r   Zoriginsr   r   r7  Z	is_outputZis_input_for_as_stridedr  ZdenseZstride_orderZ	num_usersr   Zneed_fixed_layoutcurrr   )r=  r   rB   r?    s   

 





















   	

zGraphLowering.run_nodec                 C   s   t jrtdtjdkrtdtj | j D ],}d }t|tr'|	 }nt|t
jt
jt
jjjfr8t|}t|| jsEtd| qd S )NzC++ codegen is disabledlinuxzUnsupported platform zUnsupported input dtype )r   Zdisable_cpp_codegenr    sysplatformr{   r   rD   r*   r   rE   rF   rG   rH   rI   rJ   rL   rC   r@   )re   r   r?   rA   rA   rB   !validate_can_generate_cpp_wrapper  s    


z/GraphLowering.validate_can_generate_cpp_wrapperc                 C   s   d| j v | _| jr|   | jrt | _d S t | _d S | j  }|d t	|dks6J d
d|t	|dk}|r@dn| }t|}|d usTJ d| d| | _d S )	Nr@   rm   r   zDoes not support mixing {}+r   zDevice z not supported)r~   r@   r   rI  r   r   r   copydiscardrc   formatr   popr   )re   r~   Zonly_cpuZdevice_typeZwrapper_code_gen_clsrA   rA   rB   init_wrapper_code  s&   


zGraphLowering.init_wrapper_codec                    s   d| j v rNd| _|  j}dd  tjj  | jdusJ  fdd| jD }|| W d   n1 s7w   Y  ~d| _| j	
  | j
  |  S |  S )	ad  
        For CPU, the cpp wrapper codegen is done in one pass.
        For GPU, the cpp wrapper codegen is done in two steps: JIT-compile the model with python
        wrapper code and run it to generate autotuned kernel binaries in the first pass; and then
        generate cpp wrapper code and compile it to a dynamic library in the second pass.
        r@   Fc                 S   sP   t | tjtjfr| jjS t | trt| S t | tjs&J dt	t
|  | S )Nz&Unknown type when creating real inputs)rD   r:   rY   ZSymFloatrZ   hintr   r   r  r   r   )r   rA   rA   rB   materialize  s   
z;GraphLowering.codegen_with_cpp_wrapper.<locals>.materializeNc                    s   g | ]} |qS rA   rA   r1  rQ  rA   rB   r^     rP   z:GraphLowering.codegen_with_cpp_wrapper.<locals>.<listcomp>T)r~   r   compile_to_modulecallr:   utilsZ_python_dispatchZ_disable_current_modesrr   r   clearr   codegen)re   compiledZreal_inputsrA   rR  rB   codegen_with_cpp_wrapper  s   




z&GraphLowering.codegen_with_cpp_wrapperc                 C   sL   ddl m} |   || j| _ tj| j| j j | j 	  | j
| jS )Nr   	Scheduler)r   r[  rO  r   r7   r   Zdraw_orig_fx_graphr   r   rW  r   generaters   )re   r[  rA   rA   rB   rW    s   
zGraphLowering.codegenc                 C   sl   ddl m} || j}d}g }g }|jD ]}| }||7 }|||d f ||| f q|||fS )Nr   rZ  r   r>  )r   r[  r   r   Zget_read_write_buffers_sizesr   Zget_estimated_runtime)re   r[  r   total_bytesZnode_countsZnode_runtimesrZ   	num_bytesrA   rA   rB   count_bytes  s   


zGraphLowering.count_bytesc                 C   s   ddl m} | jr|  n|  \}}dd |D }||\}}|j|||| jd}|| _|| _	|| _
|jd us;J td|j td| td|j tjr]td|j tjd	 tj|j tjtj|jd
 d  |S )Nr   )PyCodeCachec                 S   s   g | ]	\}}||j fqS rA   )Zstack_trace)rM   line_norZ   rA   rA   rB   r^   ,  s    z3GraphLowering.compile_to_module.<locals>.<listcomp>)linemapattrszOutput code written to: %sOutput code: 
%szCompiled module path: )filer   z.debug)	codecacher`  r   rY  rW  writeZload_by_key_pathr   r   r   r   __file__r   r   output_code_logr   r   Zbenchmark_kernelprintrG  stderrr7   r8   rK  ospathsplitext)re   r`  coderb  keyrm  modrA   rA   rB   rS  %  s(   
zGraphLowering.compile_to_modulec                 C   s   | j r>ddlm} | jsJ d|  \}}td| d }t r4| j	r4| j
r4| 
| j	}td| |j| ||| jdS |  jS )Nr   )AotCodeCachez"AOT mode only supports C++ wrapperrd  z#Serialized Extern Kernel Nodes: 
%s)r@   )r   rf  rr  r   rY  ri  r   r   Z	is_fbcoder   r   compiler@   rS  rT  )re   rr  ro  rb  Zserialized_extern_kernel_nodesrA   rA   rB   compile_to_fnA  s.   

zGraphLowering.compile_to_fnc                 C   s   dd | j D S )Nc                 S   s,   g | ]}t |tjst |tjs| qS rA   )rD   r   ZNoneAsConstantBufferZShapeAsConstantBufferr4  )rM   rZ   rA   rA   rB   r^   _  s    

z2GraphLowering.get_output_names.<locals>.<listcomp>)rT   r   rA   rA   rB   get_output_names^  s   zGraphLowering.get_output_namesc                 C   s4   || j  v o| j |  dko| j |  jdkS )Nr   rm   )r{   keysr   r   r   r   rA   rA   rB   is_unspec_argf  s
   zGraphLowering.is_unspec_argr9   )=__name__
__module____qualname__r   r   r   __annotations__r:   r  rh   rl   rp   	frozensetfxZGraphModuler	   rv   staticmethodr=   rw   r   r   r   r   propertyr   r   r   r   r   r   r   rC  r   r   r   r   r  r  r  r  r$  r(  r-  r/  r3  r6  r   Noder:  r?  rI  rO  rY  rW  r_  rS  rt  ru  rw  __classcell__rA   rA   r   rB   rS   o   s|   
 #T &3
	
( +1 &(

rS   )^r   loggingr  rl  r   rG  r   collectionsr   
contextlibr   typingr   r   r   r   r   r	   r
   r   rE   r:   Ztorch._loggingZtorch.fxZtorch._decompr   Ztorch._dynamo.utilsr   r   r   Ztorch._subclasses.fake_tensorr   Ztorch.fx.experimental.sym_noder   r   Z%torch.fx.experimental.symbolic_shapesr   r   r   Ztorch.utils._mode_utilsr   rt   r   r   Zcodegen.commonr   r   r   Zcodegen.wrapperr   r   r   excr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r&  r+   r,   r-   r.   r/   r0   r1   r2   rz   r3   rU  r4   r5   r6   Zvirtualizedr7   	getLoggerrx  r   Z_loggingZgetArtifactLoggerr   ri  rC   rL   rR   r}  ZInterpreterrS   rA   rA   rA   rB   <module>   sJ    ($(	

