o
    I&i^                    @   sl  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZmZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ dd	l%m&Z&m'Z' dd
l(m)Z) ddl"m*Z*m+Z+ ddl#m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z: ddl;m<Z< e=e>Z?ej@Ae>dZBG dd dZCdd ZDG dd dZEd1ddZFejGjHjIejGjHjJejGjHjKejGjHjLdZMG dd dZNG dd deNZOG d d! d!eNZPG d"d# d#eNZQG d$d% d%eNZRG d&d' d'eRZSd2d)d*ZTejUG d+d, d,ZVeW ZXG d-d. d.ZYG d/d0 d0ZZdS )3    N)
AnyCounterDefaultDictDictListOptionalSequenceSetTupleUnion)dynamo_timed)get_metric_tableis_metric_table_enabled)free_unbacked_symbols)
has_triton   )commsconfigdependenciesirmetrics)get_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)StarDepWeakDep)ComputedBufferMultiOutputMultiOutputLayout)SimplifyIndexing)	cache_on_selfcmpfree_symbol_hasget_device_tflopsget_dtype_sizeget_gpu_dram_gbps
green_textred_textsympy_product)Vfusionc                   @   sH   e Zd ZU g dZeed< eedf ed< ddd	Zd
d Z	dd Z
dS )	WhyNoFuse)node1node2reasonargsr.   .r/   r,   BaseSchedulerNoder-   c                 C   s   || _ || _d S Nr,   r-   selfr,   r-    r5   DC:\wamp64\www\opt\env\Lib\site-packages\torch/_inductor/scheduler.py__init__>   s   
zWhyNoFuse.__init__c                 G   s   || _ || _t|  d S r1   )r.   r/   
fusion_logdebug)r4   r.   r/   r5   r5   r6   __call__B   s   zWhyNoFuse.__call__c                 C   s*   d| j   d| j  d| j| j  S )Nzcannot fuse z with : )r,   get_namer-   r.   r/   r4   r5   r5   r6   __str__G   s   
zWhyNoFuse.__str__Nr,   r0   r-   r0   )__name__
__module____qualname__	__slots__str__annotations__r
   r   r7   r:   r>   r5   r5   r5   r6   r+   7   s   
 
r+   c                 C   sB   t | trt| td} tj| dd}d|v rdt|d S |S )Nkey   )indent
    )
isinstancesetsortedrD   pprintpformattextwraprI   )objresultr5   r5   r6   rP   M   s   
rP   c                   @   s0   e Zd Zdd Zdd Zdd Zdd ZeZd	S )

OutputNodec                 C   s   |h| _ g | _d S r1   )unmet_dependenciesinverse_usersr4   depr5   r5   r6   r7   X   s   
zOutputNode.__init__c                 C      dS NFr5   r=   r5   r5   r6   is_reduction\      zOutputNode.is_reductionc                 C   rY   )Nr5   r5   r=   r5   r5   r6   get_alias_names_   r\   zOutputNode.get_alias_namesc                 C   rY   )NZOUTPUTr5   r=   r5   r5   r6   r<   b   r\   zOutputNode.get_nameN)r@   rA   rB   r7   r[   r]   r<   __repr__r5   r5   r5   r6   rT   W   s    rT   r,   r0   r-   c                 C   s(   |   s|  rt| |S t| |S r1   )
is_foreachForeachKernelSchedulerNodefuseFusedSchedulerNoder2   r5   r5   r6   ra   h   s   ra   )zextern_kernels.convolutionzextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmc                   @   s  e Zd ZdddejfddZdd Zdefd	d
ZdefddZ	dd Z
deeef fddZdd Zdd Zded fddZdee deeef fddZdd Zd d! Zd"d# Zd$ejfd%d&Zd'd( Zdee fd)d*Zdee fd+d,Zd-d. Zd/d0 Zd1d2 Zdefd3d4Zdefd5d6Z dee fd7d8Z!de"d  fd9d:Z#d;d< Z$d=d> Z%d?d@ Z&dAdB Z'dCdD Z(dEej)fdFdGZ*dHdI Z+dJdK Z,dLdM Z-dNdO Z.dXdQdRZ/de0fdSdTZ1de2fdUdVZ3dWS )Yr0   	scheduler	Schedulernodec                 C   sN   || _ || _g | _g | _g | _| |  t | _|  |  t | _	d| _
d S rZ   )rc   re   usersrV   
node_usersset_read_writesZget_read_writesrM   	ancestors
last_usagewritten)r4   rc   re   r5   r5   r6   r7   y   s   
zBaseSchedulerNode.__init__c                 C   s   t | j d|  dS )Nz(name=)typer@   r<   r=   r5   r5   r6   r^         zBaseSchedulerNode.__repr__returnc              	   C   s   |   }| dt| j dtt| ddj d| dt| jj | dt| j | dt| jj| j  | d	| j	 g}z	|| 
 g7 }W n tyZ   tjd
dd Y nw d| S )z#Longer form printout for trace logsr;   (re   Nrl   z
.writes = z.unmet_dependencies = z.met_dependencies = z	.users = zIgnoring error in debug_str()T)exc_inforJ   )r<   rn   r@   getattrrP   read_writeswritesrU   readsrf   debug_str_extra	Exceptionlogwarningjoinrstripr4   namelinesr5   r5   r6   	debug_str   s   (
zBaseSchedulerNode.debug_strc                 C   rY   )N r5   r=   r5   r5   r6   rw      r\   z!BaseSchedulerNode.debug_str_extrac                 C   s   t d| | j| jj d S )Nz(%s: unmet_dependencies = %s, writes = %s)ry   inforU   rt   ru   r=   r5   r5   r6   log_details   s   zBaseSchedulerNode.log_detailsrenamesc                 C      |  | j| d S r1   )rh   rt   renamer4   r   r5   r5   r6   update_mutated_names      z&BaseSchedulerNode.update_mutated_namesc                 C   r   r1   rh   rt   Z	with_readrW   r5   r5   r6   add_mutation_dep   r   z"BaseSchedulerNode.add_mutation_depc                 C   r   r1   r   rW   r5   r5   r6   add_fake_dep   r   zBaseSchedulerNode.add_fake_deprf   NodeUserc                 C   s\   i }|D ] }t |j|v r||t |j |t |j< q||t |j< qt| | _d S r1   )idre   mergelistvaluesrf   )r4   rf   rS   user5   r5   r6   	set_users   s    zBaseSchedulerNode.set_usersfuture_used_buffersmutation_real_namec                    s(   |   } fdd|D }|| | _d S )Nc                    s   h | ]}  ||qS r5   )get).0kr   r5   r6   	<setcomp>       z3BaseSchedulerNode.set_last_usage.<locals>.<setcomp>)used_or_aliased_buffer_namesrj   )r4   r   r   Zused_buffersr5   r   r6   set_last_usage   s   z BaseSchedulerNode.set_last_usagec                 C   
   | j  S r1   )re   r]   r=   r5   r5   r6   get_aliases      
zBaseSchedulerNode.get_aliasesc                 C   r   r1   )re   get_mutation_namesr=   r5   r5   r6   get_mutations   r   zBaseSchedulerNode.get_mutationsc                 C   s   t |  p|  S r1   )boolr   r   r=   r5   r5   r6   has_aliasing_or_mutation   s   z*BaseSchedulerNode.has_aliasing_or_mutationrwc                 C   s   || _ | j j| _|   d S r1   )rt   rv   rU   
prune_deps)r4   r   r5   r5   r6   rh      s   
z!BaseSchedulerNode.set_read_writesc                 C   s   | j jS r1   )rt   	op_countsr=   r5   r5   r6   r         zBaseSchedulerNode.op_countsc                 C   s   dd t | jj| jjD S )Nc                 S      h | ]}|j qS r5   r~   r   rX   r5   r5   r6   r      s    z6BaseSchedulerNode.used_buffer_names.<locals>.<setcomp>)	itertoolschainrt   rv   ru   r=   r5   r5   r6   used_buffer_names   s   z#BaseSchedulerNode.used_buffer_namesc                 C   sp   t  }t| jj| jjD ](}||j tj	j
|jr5tj	j
|j  }t|tjr5||jj  q|S r1   )rM   r   r   rt   rv   ru   addr~   r)   graphname_to_bufferr   
get_layoutrL   r   AliasedLayoutviewdatar<   )r4   Z
used_namesrX   layoutr5   r5   r6   r      s   z.BaseSchedulerNode.used_or_aliased_buffer_namesc                    s    fdd j D  _ d S )Nc                    s   h | ]}|j  jjvr|qS r5   )r~   rc   available_buffer_namesr   r=   r5   r6   r      
    z/BaseSchedulerNode.prune_deps.<locals>.<setcomp>rU   r=   r5   r=   r6   r      s   
zBaseSchedulerNode.prune_depsc                    s4   dd   fdd| j jD }| | j | d S )Nc                 S   s   t | to| jtjjv S r1   )rL   r   r~   r)   r   removed_buffers)rX   r5   r5   r6   should_prune   s   z7BaseSchedulerNode.prune_weak_deps.<locals>.should_prunec                       h | ]} |r|qS r5   r5   r   r   r5   r6   r      r   z4BaseSchedulerNode.prune_weak_deps.<locals>.<setcomp>)rt   rv   rh   remove_reads)r4   	to_remover5   r   r6   prune_weak_deps   s   z!BaseSchedulerNode.prune_weak_depsc                    s   t   jD ]}t|ts |j    d7  < q fddfddjD }|rAj| _j	| dS dS )a  
        Prunes weakdeps intended for mutation ordering
        on an upstream fused node if after fusion there is another dependency
        on the fused upstream node, making the weakdep redundant

        In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
        be incrementally removed, enabling other fusions, ensuring they are fused in order.
        r   c                    s:   t | tr | j   dk}| j k}|p|S dS )Nr   F)rL   r   r~   r<   )rX   Zis_redundantZis_self_dep)name_to_dep_countname_to_fused_noder4   r5   r6   r     s   
z<BaseSchedulerNode.prune_redundant_deps.<locals>.should_prunec                    r   r5   r5   r   r   r5   r6   r     r   z9BaseSchedulerNode.prune_redundant_deps.<locals>.<setcomp>N)
collectionsr   rU   rL   r   r~   r<   rh   rt   r   )r4   r   rX   Zdeps_to_pruner5   )r   r   r4   r   r6   prune_redundant_deps   s   	

z&BaseSchedulerNode.prune_redundant_depsc                 C   r   r1   re   r<   r=   r5   r5   r6   r<     r   zBaseSchedulerNode.get_namec                 C   s   |   S r1   r<   r=   r5   r5   r6   get_first_name  r   z BaseSchedulerNode.get_first_namec                 C   s
   |   hS r1   r   r=   r5   r5   r6   	get_names  r   zBaseSchedulerNode.get_namesc                 C   s   | gS r1   r5   r=   r5   r5   r6   	get_nodes!     zBaseSchedulerNode.get_nodesc                 C   r   r1   )re   
get_devicer=   r5   r5   r6   r   $  r   zBaseSchedulerNode.get_devicec                 C   rY   rZ   r5   r=   r5   r5   r6   r[   '  r\   zBaseSchedulerNode.is_reductionc                 C   rY   rZ   r5   r=   r5   r5   r6   is_template*  r\   zBaseSchedulerNode.is_templatec                 C   rY   rZ   r5   r=   r5   r5   r6   	is_extern-  r\   zBaseSchedulerNode.is_externc                 C   rY   rZ   r5   r=   r5   r5   r6   r_   0  r\   zBaseSchedulerNode.is_foreachread_depc                 C   rY   rZ   r5   r4   r   r5   r5   r6   can_inplace3  r\   zBaseSchedulerNode.can_inplacec                 C   rY   rZ   r5   r=   r5   r5   r6   has_side_effects6  r\   z"BaseSchedulerNode.has_side_effectsc                    s   j  sdS t tfr j  s j  rdS t tfs.t trt j tjtj	frt
jrttjtjjjjrDttjdddurddlm} t jjdd d}|D ]} jj|j}|rtjj| r|j dusqJ  fdd	|j D }t!|dkr|d
 j"r|d
 j  u rt|j # tj$tj%tj&fst|j tj'rt!|j  d
ks||j | j krt(tjdrtjj)*|+  +  ttjtjjjjrtjj,-|+  tjj,- +   j./|+  |+ tjj0 + <  dS qVdS dS dS dS dS )z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        N	mutationsr   )buffer_reuse_keyc                 S      | j S r1   r   xr5   r5   r6   <lambda>W      z9BaseSchedulerNode.decide_inplace_update.<locals>.<lambda>rF   c                    s"   g | ]}|j   jjvr|qS r5   )re   r<   rc   r   r   r   r=   r5   r6   
<listcomp>_  s    z;BaseSchedulerNode.decide_inplace_update.<locals>.<listcomp>r   r/   )1re   should_allocaterL   SchedulerNoder]   r   ExternKernelSchedulerNoder   	AllReduceInPlaceHintr   inplace_buffersr)   kerneltorch	_inductorcodegentritonZTritonKernelrs   Zcodegen.wrapperr   rN   rt   rv   rc   name_to_noder   r~   r   wrapper_codeZ	can_reuserf   lenr   r   r   ZMutationLayoutr   ZFallbackKernelhasattrr/   Zmake_inplacer<   r   r   rj   discardinplace_update_buffers)r4   r   Zordered_readsreadZ
input_nodeZremaining_usesr5   r=   r6   decide_inplace_update9  s   

	

z'BaseSchedulerNode.decide_inplace_updatec                 C   s   | j  sd S t| tfr!| j  s| j  r!tjj	| j  d S t
tjdrE|  tjjv rEtjj| jjtjj|    j | j  d S tjj	| j  d S )Nr/   )re   r   rL   r   r]   r   r)   r   r   Zcodegen_allocationr   r   r<   r   Zcodegen_inplace_reuserc   r   r=   r5   r5   r6   allocate  s(   

zBaseSchedulerNode.allocatec                 C   s"   | j D ]}t|jtr dS qdS )NFT)rf   rL   re   rT   )r4   r   r5   r5   r6   can_free  s
   
zBaseSchedulerNode.can_freeTc           	      C   s  t jsd S |r| jrd S | jj}g }|D ]_}|jdkrq|d |d d|j d|j }d|jv r?|d|jd   }|| d|jv rs|jd  }|	d	d
 }|d|
dd
dd
dd  |d |d qt|dkr|d S || d| _d S )Noutputr   z#pragma CMT ORIGIN:z#pragma CMT  Zseq_nrz seq_nr:stack_trace|{z{{}z}}rJ   \z#pragma CMT END ORIGINr   T)r   Zcomment_originrk   re   originsopappendtargetmetasplitreplacer   
writelines)	r4   bufferZ	only_oncer   Z	out_linesoZop_info_strr   Zstack_trace_last_liner5   r5   r6   codegen_originating_info  s@   











z*BaseSchedulerNode.codegen_originating_infoc                    s  t trdS t trt jtrdS t tr/tjj	t
 d t
 d  ntdtt}jjjjB D ]
}||j | q@dd jjD }dd jjD }fddt trzfd	d|D }|| }|| }d}||B D ]Z}tfd
d|| D }|tjjv rtjj| }	n|tjjv rtjj| }	nqdd  t |	jtrɈjj|	  j}
t fdd|
D }n |	}|t||t |	!  7 }q|S )aM  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size
        r   r       eAc                 S   r   r5   r   r   r5   r5   r6   r         zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<setcomp>c                 S   r   r5   r   r   r5   r5   r6   r      r  c                    s0    j j|  j}dd |D }t|t| dkS )Nc                 S   r   r5   re   r   userr5   r5   r6   r     r  zZBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized.<locals>.<setcomp>r   )rc   r   rf   r   rM   )bufsnodesrf   Zbuf_usesr=   r5   r6   is_materialized  s   zGBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materializedc                    s   h | ]
} |j s|qS r5   r	  r   )r
  r4   r5   r6   r     s
    c                    s   g | ]} qS r5   r5   r   )
node_numelr5   r6   r     s    zBBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<listcomp>c                 S   s   t jjt|  S r1   )r)   r   sizevars	size_hintr(   get_size)r  r5   r5   r6   get_buf_elems  r   zEBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.get_buf_elemsc                 3   s    | ]	} |j j V  qd S r1   r  r  )r  r5   r6   	<genexpr>   s    zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>)"rL   NopKernelSchedulerNoder   re   r   r   r)   r   r  r  r(   
get_rangesintr   defaultdictr   rt   rv   ru   r~   r   rb   sumr   graph_inputsr   r   rc   r   r<   rf   minr$   	get_dtype)r4   Zbuf_accessesrX   rv   ru   r   Z
node_bytesbuf_nameZbuf_accessed_elemsr  rf   Z	buf_elemsr5   )r  r
  r  r4   r6   get_read_write_buffers_sizes  sT   




z.BaseSchedulerNode.get_read_write_buffers_sizesc              	      s\  d}d}t | dr| js:t| ttfsJ dt| | js!J | jd js)dS | jd j }| jd j }n
| j }| j }d|j	jkrLdS zt
 }t|d }W n
 tya   Y dS w t| trt| jtjsyJ dt| jtt| jdd	d}|durdd
lm} ddlm} | b |ddM}ddlm   fdd| jjD }	| jj}
|
j|g|	R i | jj d}| }|  }|| | d }|| }t||W  d   W  d   S 1 sw   Y  W d   n1 sw   Y  nt| tst| jtr|  | S t| jtj r"t!| S t| jtj"r,dS dS )zB
        Returns estimated op runtime in nanoseconds (ns)
        Nre   ztype(self)=r   cudal    J)type(self.node)=r   r   )FakeTensorMode)FlopCounterModeF)displayr   ir_node_to_tensorc                    s   g | ]} |d dqS )F)Zguard_shaper5   )r   inputr!  r5   r6   r   U  s    
z;BaseSchedulerNode.get_estimated_runtime.<locals>.<listcomp>g      ?r  )#r   re   rL   rb   r`   rn   r	  r   r  devicer%   r#   rx   r   r   ExternKernelkernel_name_to_opr   rs   Ztorch._subclasses.fake_tensorr  Ztorch.utils.flop_counterr  r"  Zinputs	__class__Zprocess_kernelkwargsZget_total_flopsr  maxr   ZCollectiveKernelr   ZWait)r4   r   ZdtypeZgpu_memory_bandwidthZ	gpu_flopsr   r  r  Zflop_counter_modeZfake_inputsclsfactorZcounted_flopsZcounted_bytesZcompute_timeZtransfer_timer5   r!  r6   get_estimated_runtime*  sp   


"


Lz'BaseSchedulerNode.get_estimated_runtimeN)T)4r@   rA   rB   r   ZBufferr7   r^   rD   r   rw   r   r   r   r   r   r   r   r	   r   r   r   r   r   
ReadWritesrh   r   r   r   r   r   r   r<   r   r   r   r   r   r[   r   r   r_   	MemoryDepr   r   r   r   r   r  r  r  floatr,  r5   r5   r5   r6   r0   x   sT    


"V
)Tc                   @   s:   e Zd ZdefddZdd Zdd Zdejfd	d
Z	dS )r   rp   c                 C   s   |    dt| jdd  S )Nz.node.kernel = r   )r<   rs   re   r=   r5   r5   r6   rw   z  s   z)ExternKernelSchedulerNode.debug_str_extrac                 C   rY   NTr5   r=   r5   r5   r6   r   }  r\   z#ExternKernelSchedulerNode.is_externc                 C   s   t | jdo
| j S )Nr   )r   re   r   r=   r5   r5   r6   r     r   z*ExternKernelSchedulerNode.has_side_effectsr   c                 C   s   |   s|  r
dS |j| jjvrdS t| jtjj	j
tjj	jfs#dS t| jjdkrDtt| jj}| |  }tjj|dkS dS )NFr   r   )r   r   r~   rc   r   rL   re   r   r   r   r   r   r   rt   ru   nextiterZ	get_numelr)   r   r  simplify)r4   r   	write_depZ
numel_diffr5   r5   r6   r     s   z%ExternKernelSchedulerNode.can_inplaceN)
r@   rA   rB   rD   rw   r   r   r   r.  r   r5   r5   r5   r6   r   y  s
    r   c                   @   s   e Zd ZdS )r  N)r@   rA   rB   r5   r5   r5   r6   r    s    r  c                       s   e Zd Zdddeejejf f fddZdefddZ	d	d
 Z
dd Zdd Zdd Zdd Zdd Zdd Zdd ZdejfddZedee fddZdd Z  ZS ) r   rc   rd   re   c                    sv   t  || | \| _| _| || jf| _t|tj	r(| 
|  d S | 
tj| jg| jR ddi d S )N	normalizeT)superr7   Zsimplify_and_reorder_sizes_bodyr   grouprL   r   TemplateBufferrh   Znormalized_read_writesr   extract_read_writes)r4   rc   re   group_fnr'  r5   r6   r7     s"   zSchedulerNode.__init__rp   c                 C   s   |   }| d| jd  | d| jd  | d| j g}|  r1|| dt|    |  rC|| dt|    t| jt	j
r_|d| d	 |t| j d
 d|S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z.aliases = z.mutations = zclass z_loop_body:rK   rJ   )r<   r9  r7  r   r   rP   r   rL   r8  r   LoopBodyrQ   rI   r   r{   r}   r5   r5   r6   rw     s   
zSchedulerNode.debug_str_extrac                 C   r   r1   )r7  r=   r5   r5   r6   r    r   zSchedulerNode.get_rangesc                 C   s6   t | jtjtjfsJ dt| jt| j S Nr  )rL   re   r   r   r:  rn   r   Zget_reduction_typer=   r5   r5   r6   r[     s   zSchedulerNode.is_reductionc                 C   s   t | jtjS r1   )rL   re   r   r:  r=   r5   r5   r6   r        zSchedulerNode.is_templatec                 G   s   |    |   | | d S r1   )r   mark_runr   )r4   
index_varsr5   r5   r6   run  s   zSchedulerNode.runc                 C   s   |    d S r1   )r   r=   r5   r5   r6   rA       zSchedulerNode.mark_runc                 C   sH   | j }ttt|ttt|ksJ tttj|tj|}|S r1   )	r7  r  mapr   dictzipr   r   from_iterable)r4   rB  sizes
var_rangesr5   r5   r6   ranges_from_index_vars  s    

z$SchedulerNode.ranges_from_index_varsc              	   C   s   |  |}zCttt |. tj|  | j|  W d    n1 s'w   Y  W d    W d S W d    W d S 1 sAw   Y  W d S  tyW   t	
d| j  w )NzError in codegen for %s)rK  r)   Zset_ops_handlerr   Zget_ops_handlerr   Zset_current_noder8  rx   ry   fatalre   )r4   rB  rJ  r5   r5   r6   r     s   

VzSchedulerNode.codegenc                    s$   j \}  fdd}t||S )zH
        Get the memory dependencies in the non-reduction axis.
        c                    s    | dd  D S )Nc                 S   s   g | ]}t d qS )r   )sympyInteger)r   _r5   r5   r6   r         zCSchedulerNode.pointwise_read_writes.<locals>.fn.<locals>.<listcomp>)r8  )indexZreduction_sizesr4   r5   r6   fn  r   z/SchedulerNode.pointwise_read_writes.<locals>.fn)r7  r   r;  )r4   rI  rS  r5   rR  r6   pointwise_read_writes  s   
z#SchedulerNode.pointwise_read_writesr   c                 C   sz   |   s|  r
dS t| jjdkr;t|tjr;tt	| jj}t|tjs/J dt
||j|jko:|j|jkS dS )NFr   ztype(write_dep)=)r   r   r   rt   ru   rL   r   r.  r1  r2  rn   rQ  size)r4   r   r4  r5   r5   r6   r     s   zSchedulerNode.can_inplacec                 C   s   t  }t| jtjrQ| j D ]A}|jdkrP|jdkrPd|jv r'|jd dks5t	|j
dkrP|j
d dkrP|d|jv rA|jd nt	|j
dkrM|j
d	 nd
 q|S )NZcall_methodstoremode
atomic_add   rH   r~      r   r   )rM   rL   r8  r   r>  r   r   r   r(  r   r/   r   )r4   Zbuffers_store_as_atomic_addre   r5   r5   r6   _get_atomic_add_buffers  s   



z%SchedulerNode._get_atomic_add_buffersc                 C   s   ||   v S r1   )r[  r4   	check_bufr5   r5   r6   has_atomic_add  rD  zSchedulerNode.has_atomic_add)r@   rA   rB   r   r   r   r:  r7   rD   rw   r  r[   r   rC  rA  rK  r   rT  r   r.  r   r    r	   r[  r^  __classcell__r5   r5   r=  r6   r     s&    r   c                       s  e Zd ZdZededefddZdddee fd	d
Z	e
defddZdefddZe
dee fddZdefddZdee deeef f fddZe
dee fddZe
dee fddZdee fddZdd Ze
d d! Ze
d"d# Ze
d$d% Zd&d' Ze
d(d) Ze
d*d+ Zd,d- Zd.eeef fd/d0Zd1d2 Zd3ed4 fd5d6Z d7d8 Z!d9d: Z"d;e#j$fd<d=Z%d>d? Z&d@dA Z'  Z(S )Brb   z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r,   r-   c                 C   sP   |j |j u sJ t|ttfrt|ttfsJ | |j t| t|  S r1   )rc   rL   r   rb   r   r   )r*  r,   r-   r5   r5   r6   ra   $  s
    zFusedSchedulerNode.fuserc   rd   r	  c                    s   | _ | _d  _g  _g  _g  _t|dd dj _tj	dd |D   _
 tjdd |D   fddtj	d	d |D  D  jj  _td
d  j D  _tdd  j D  _d S )Nc                 S   s   t |  S r1   )r  r[   r   r5   r5   r6   r   4  s    z-FusedSchedulerNode.__init__.<locals>.<lambda>rF   c                 S   s   g | ]
}|j d ur|j qS r1   )ri   r   r5   r5   r6   r   6  s    z/FusedSchedulerNode.__init__.<locals>.<listcomp>c                 S      g | ]}|j qS r5   )rt   r   r5   r5   r6   r   :  r  c                       h | ]}|j   vr|qS r5   r~   r   r   r=   r5   r6   r   =  r   z.FusedSchedulerNode.__init__.<locals>.<setcomp>c                 S   r`  r5   r   r   r5   r5   r6   r   ?  r  c                 S   r`  r5   	min_orderr   r5   r5   r6   r   B  r  c                 S   r`  r5   )	max_orderr   r5   r5   r6   r   C  r  )r	  rc   re   rf   rV   rg   r)  r9  rM   unionri   rh   r   r-  
merge_listrt   ru   rU   r  rd  re  )r4   rc   r	  r5   r=   r6   r7   ,  s(   
zFusedSchedulerNode.__init__rp   c                 C   s   d dd | jD S )NrO  c                 S      g | ]}|  qS r5   r   r   r5   r5   r6   r   G      z/FusedSchedulerNode.get_name.<locals>.<listcomp>)r{   r	  r=   r5   r5   r6   r<   E     zFusedSchedulerNode.get_namec                 C      | j d  S Nr   )r	  r<   r=   r5   r5   r6   r   I  r@  z!FusedSchedulerNode.get_first_namec                 C      t jdd | jD  S )Nc                 S   rh  r5   )r   r   r5   r5   r6   r   N  ri  z0FusedSchedulerNode.get_names.<locals>.<listcomp>rM   rf  r	  r=   r5   r5   r6   r   L  rj  zFusedSchedulerNode.get_namesc                    s.    fddt  jD }td| dS )Nc                    s,   g | ]\}}    d | d|  qS )z.snodes[z] =
)r<   r   )r   ire   r=   r5   r6   r   Q  s    z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>rJ   rK   )	enumerater	  rQ   rI   r{   r|   )r4   r   r5   r=   r6   rw   P  s   
z"FusedSchedulerNode.debug_str_extrar   r   c                    s@   t  || t }t| jD ]}||| ||j qd S r1   )r6  r   rM   reversedr	  updaterj   )r4   r   r   re   r=  r5   r6   r   W  s   z!FusedSchedulerNode.set_last_usagec                 C   rm  )Nc                 S   rh  r5   )r   r   r5   r5   r6   r   f  ri  z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>rn  r=   r5   r5   r6   r   d  rj  z$FusedSchedulerNode.used_buffer_namesc                 C   rm  )Nc                 S   rh  r5   )r   r   r5   r5   r6   r   j  ri  zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>rn  r=   r5   r5   r6   r   h  rj  z/FusedSchedulerNode.used_or_aliased_buffer_namesc                 C   r   r1   r  r=   r5   r5   r6   r   l  r   zFusedSchedulerNode.get_nodesc                 C   s   t | j d|   dS )Nz(nodes=rl   rm   r=   r5   r5   r6   r^   o  ro   zFusedSchedulerNode.__repr__c                 C      t dd | jD S )Nc                 s       | ]}|  V  qd S r1   )r[   r   r5   r5   r6   r  t      z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>anyr	  r=   r5   r5   r6   r[   r     zFusedSchedulerNode.is_reductionc                 C   rs  )Nc                 s   rt  r1   )r   r   r5   r5   r6   r  x  ru  z1FusedSchedulerNode.is_template.<locals>.<genexpr>rv  r=   r5   r5   r6   r   v  rx  zFusedSchedulerNode.is_templatec                 C   s    | j D ]
}| r|  S qd S r1   )r	  r   r4   re   r5   r5   r6   get_template_nodez  s
   
z$FusedSchedulerNode.get_template_nodec                 C   s
   | j d S rl  )r9  r=   r5   r5   r6   r     r   zFusedSchedulerNode.get_devicec                 C   rs  )Nc                 s   rt  r1   )r   r   r5   r5   r6   r    ru  z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>rv  r=   r5   r5   r6   r     rx  z+FusedSchedulerNode.has_aliasing_or_mutationc                 C   s&   t  }| jD ]	}||  q|S r1   )r   r   r	  rr  r   )r4   r   re   r5   r5   r6   r     s   
zFusedSchedulerNode.op_countsc                    s   t  fdd|  D S )Nc                 3   s$    | ]}t |to| V  qd S r1   )rL   r   r^  )r   Zsub_schedule_node1r]  r5   r6   r    s    

z4FusedSchedulerNode.has_atomic_add.<locals>.<genexpr>)rw  r   r\  r5   r{  r6   r^    s   z!FusedSchedulerNode.has_atomic_addr   c                 C      t r1   NotImplementedErrorr   r5   r5   r6   r     r\   z'FusedSchedulerNode.update_mutated_namesc                 C   r|  r1   r}  r4   r~   r5   r5   r6   r     r\   z#FusedSchedulerNode.add_mutation_deprf   r   c                 C   r|  r1   r}  )r4   rf   r5   r5   r6   r     r\   zFusedSchedulerNode.set_usersc                 C   r|  r1   r}  r=   r5   r5   r6   r     r\   zFusedSchedulerNode.get_aliasesc                 C   r|  r1   r}  r=   r5   r5   r6   r     r\   z FusedSchedulerNode.get_mutationsr   c                 C   r|  r1   r}  r   r5   r5   r6   r     r\   zFusedSchedulerNode.can_inplacec                 C   r|  r1   r}  r=   r5   r5   r6   r     r\   zFusedSchedulerNode.allocatec                 C   r|  r1   r}  r=   r5   r5   r6   r     r\   zFusedSchedulerNode.can_free))r@   rA   rB   __doc__classmethodr0   ra   r   r   r7   r    rD   r<   r   r	   r   rw   r   r   r   r   r   r^   r[   r   rz  r   r   r   r^  r   r   r   r   r   r   r.  r   r   r   r_  r5   r5   r=  r6   rb     sT    





rb   c                       s   e Zd ZdZdd Zdd Zedd Zedd	 Z	
	
d ddde	e
 f fddZdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Z  ZS )!r`   z{Scheduler node which consists of a list of scheduler nodes that each operate on a
    distinct tensor in a list of tensors.c                 C   s    |  | jv r| j|   S d S r1   )r<   read_to_node)r4   producerr5   r5   r6   get_consumer_subnode_for  s   z3ForeachKernelSchedulerNode.get_consumer_subnode_forc                 C   s.   |j jD ]}|j| jv r| j|j   S qd S r1   )rt   rv   r~   r   )r4   consumerrdr5   r5   r6   get_producer_subnode_for  s
   z3ForeachKernelSchedulerNode.get_producer_subnode_forc                    s   t  |}  r/| r/t jt|jk}|s|d |o.t fddt j|jD S | rI| }|d urC|j |S |d dS   rc 	|}|d ur] j||S |d dS t
d)Nzforeach do not have same lengthc                 3   s"    | ]\}} j ||V  qd S r1   )rc   can_fuser   lrr  r5   r6   r    s
    
z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>z5candidate producer is not dep of any foreach consumerFz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r+   r_   r   r	  allrG  r  rc   r  r  AssertionError)r*  r  r  whyZforeach_matchconsumer_subnodeproducer_subnoder5   r  r6   r    s.   


z#ForeachKernelSchedulerNode.can_fusec           
      C   s  |  s
|  s
J d }d }|  r#|  r#dd t|j|jD }nY|  rP||}g }|}d }|jD ]}||u rIt||}|}|| q5|| q5n,|  r|||}	g }|}d }|jD ]}||	u rvt||}|}|| qb|| qb| |j|||S )Nc                 S   s   g | ]
\}}t ||qS r5   )rb   ra   r  r5   r5   r6   r     s    
z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>)	r_   rG  r	  r  rb   ra   r   r  rc   )
r*  r  r  prev_node_1prev_node_2fused_nodesr  re   new_noder  r5   r5   r6   ra     s>   



zForeachKernelSchedulerNode.fuseNrc   rd   nodesc           
         sZ  i  _ i  _|d u s|d u r4t || |D ]}|jjD ]}| j |j< q| D ]}| j|< q*qnj| _| _	d  _
g  _ tj|j|jg  fddt|j|jD  jj  _t|j|jg _t|j|jg _| ry|n|}| r|n|}	|j _ j|	j |j _|	 D ]}|	 j|< q|d  df _t  _d S )Nc                    ra  r5   rb  r   r=   r5   r6   r   )  s
    z6ForeachKernelSchedulerNode.__init__.<locals>.<setcomp>r   Zforeach)r  r   r6  r7   rt   rv   r~   r   rc   r	  re   rf   rh   r   r-  rg  rM   rf  rU   ru   r  rd  r)  re  r_   ri   rr  r   r9  r   )
r4   rc   r  r  r  re   r   r~   Zforeach_node
other_noder=  r=   r6   r7   
  sL   

z#ForeachKernelSchedulerNode.__init__c                 C   r|  r1   r}  r=   r5   r5   r6   rA  B  r\   z#ForeachKernelSchedulerNode.mark_runc                 C   s<   t | jtjsJ dt| j| j | j   d S r?  )rL   re   r   r   rn   Zget_store_functionZmake_loaderr=   r5   r5   r6   r   E  s   "z"ForeachKernelSchedulerNode.codegenc                 C   s   t S r1   r}  r=   r5   r5   r6   r   I  r\   z#ForeachKernelSchedulerNode.can_freec                 C   rY   r0  r5   r=   r5   r5   r6   r_   L  r\   z%ForeachKernelSchedulerNode.is_foreachc                 C   s
   t | jS )zReturns a list of nodes which comprise the foreach kernel, operating on corresponding elements of our input lists.
        These nodes may be vertically fused.)r   r	  r=   r5   r5   r6   get_subkernel_nodesO  s   
z.ForeachKernelSchedulerNode.get_subkernel_nodesc                 C   s   t tjdd | jD  S )ziReturns all nodes contained in this kernel, unpacking fused nodes into their constituent scheduler nodes.c                 S   rh  r5   )r   r   r5   r5   r6   r   V  ri  z8ForeachKernelSchedulerNode.get_nodes.<locals>.<listcomp>)r   r   r   r	  r=   r5   r5   r6   r   T  s   z$ForeachKernelSchedulerNode.get_nodesc                 C   rk  rl  )r	  r   r=   r5   r5   r6   r   X  r@  z)ForeachKernelSchedulerNode.get_first_namec                 C   s   | j D ]}|| qd S r1   )r	  r   )r4   r   re   r5   r5   r6   r   [  s   
z/ForeachKernelSchedulerNode.prune_redundant_deps)NN)r@   rA   rB   r  r  r  r  r  ra   r   r   r7   rA  r   r   r_   r  r   r   r   r_  r5   r5   r=  r6   r`     s.    

*8r`   r5   c                    s`   t j fdd}ttttd }t|dkr%fdd|D tjr.|j|d |S )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    c                    s     dks dkrt   dk dkS  fddD }fddD }tdd t||D }tdd t||D }||krIdS ||krOdS t  S )	Nr   c                       g | ]}|  qS r5   r5   r   sl)ar5   r6   r   l  ri  z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>c                    r  r5   r5   r  )br5   r6   r   m  ri  c                 s   s$    | ]\}}|d kp||k V  qdS r   Nr5   r   Zsl_aZsl_br5   r5   r6   r  q      
z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>c                 s   s$    | ]\}}|d kp||k V  qdS r  r5   r  r5   r5   r6   r  t  r  r   )r!   r  rG  )r  r  Zstride_len_aZstride_len_bZa_firstZb_firstrI  stride_lengths)r  r  r6   	index_cmpf  s   
z"pick_loop_order.<locals>.index_cmpr   c                       g | ]} | qS r5   r5   )r   pi)r  r5   r6   r     ri  z#pick_loop_order.<locals>.<listcomp>rF   )		functools
cmp_to_keyr   rq  ranger   r   Zpick_loop_orderssort)r  rI  Zpriority_idxr  orderr5   r  r6   pick_loop_order`  s   r  c                   @   s@   e Zd ZU eed< dZeed< dZeed< dd Zdd	d
Z	dS )r   re   Fr   is_weakc                 C   r   r1   r   r=   r5   r5   r6   r<     r   zNodeUser.get_nameotherrp   c                 C   s.   | j |j u sJ t| j | jo|j| jo|jS r1   )re   r   r   r  )r4   r  r5   r5   r6   r     s   

zNodeUser.mergeN)r  r   rp   r   )
r@   rA   rB   r0   rE   r   r   r  r<   r   r5   r5   r5   r6   r     s   
 r   c                       sh  e Zd Ze fddZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#ed$efd%d&Zd#ed$efd'd(Zd)d* Zd#ed$efd+d,Zd-d. Zd/d0 Zd1d2 Zd3d4 Zd5d6 Zd7d8 Zd9d: Z d;d< Z!d=e"fd>d?Z#d@e$j%fdAdBZ&d@e$j%fdCdDZ'dEdF Z(edGdH Z)dIdJ Z*  Z+S )Krd   c                    s  t    i  _i  _tt _g  _h tj	j
 tj	j  _ fdd|D  _ jtj	j   jD ]}|  q9dd  jD  _t  _i  _i  _         tjrht j    t jt j7  _tj ! j t j _"dd  jD  _ #     t$  _% &  tjr '  t( j _ )  tj * j tj + j  ,  d  _-t$  _.i  _/t0d1 fdd d S )	Nc                    s   g | ]}  |qS r5   )create_scheduler_noder   nr=   r5   r6   r     rP  z&Scheduler.__init__.<locals>.<listcomp>c                 S      i | ]}|  |qS r5   r   r  r5   r5   r6   
<dictcomp>      
z&Scheduler.__init__.<locals>.<dictcomp>c                 S   r  r5   r   r  r5   r5   r6   r    rP  Zgraph_statsc                      s    j  jt jdS )N)Zgraph_idZnum_nodes_before_fusionZnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r  r5   r=   r5   r6   r     s   z$Scheduler.__init__.<locals>.<lambda>)2r6  r7   backendsZ
fuse_cacher1  _post_grad_graph_counterr  r  r)   r   r  keys	constantsr   rr  r   r   rF  r   r   mutation_renamescompute_dependenciestopological_sort_scheduledead_node_eliminationr   Z reorder_for_compute_comm_overlapr   Zdecide_global_ordering_of_commscompute_ancestorsr   Zir_nodes_pre_fusionr   r9   Zir_pre_fusionr  create_foreach_nodesrM   logged_slow_fusion
fuse_nodescompute_node_usersZ$reorder_compute_and_comm_for_overlapcompute_last_usageZir_post_fusionZgraph_diagramdebug_draw_graphcurrent_devicebuffer_names_to_freeorigin_to_indexr   add_row)r4   r  re   r=  r=   r6   r7     s`   





	

zScheduler.__init__c                 C   s4   t jdddkrddlm} || jdd dS dS )z,Generate an image of the graph for debuggingZINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)Zprint_graph)osenvironr   r9   r  r  )r4   r  r5   r5   r6   r    s   zScheduler.debug_draw_graphc                 C   s4   t tjrt d| | jD ]}|  qd S d S )Nz%s:)ry   isEnabledForloggingINFOr   r  r   )r4   labelre   r5   r5   r6   debug_print_nodes  s   

zScheduler.debug_print_nodesc                 C   sp   |j d us	J d| rt| |S t|tjtjfr)| | j	}t
| ||S t|tjr4t| |S t|)Nz2All nodes passed to scheduling must have an origin)r   Zis_no_opr  rL   r   r   r:  get_backendr   r<  r   r%  r   r~  )r4   re   r<  r5   r5   r6   r    s   

zScheduler.create_scheduler_nodec                    s   t  g }j  tjj D ]1} fdd|D }|sq| fdd|D }t|}|	| |D ]}|j|< q9qfddj
D | _
d S )Nc                    s(   g | ]}| v rt j| ts|qS r5   )rL   r   r  r   r~   )kept_node_namesr4   r5   r6   r     s    z2Scheduler.create_foreach_nodes.<locals>.<listcomp>c                    s   g | ]} j | qS r5   )r   r  r=   r5   r6   r   !  rP  c                    s   g | ]
}|   vr|qS r5   r   )r   re   )removed_node_namesr5   r6   r   *  s    )rM   r   r  r)   r   listsr   rr  r`   r   r  )r4   Zfe_nodesnamesr	  Zfe_noder~   r5   )r  r  r4   r6   r    s*   





zScheduler.create_foreach_nodesc                    s  t tjD ]J}| }| D ]?}|v r@|v r@| }| }|| } D ]}| |u s:| |u r>||< q,q|v rK| |< q| |< qqfdd fdd dfdd	}i }	jD ]}
td|
j	 |
j	
 D ]}t|tjsJ ||	vr|
|	|< q||
j	 D ]}||	v sJ | d	|	 |
t|	|   qt|
 d
ksJ |
 D ]7}|}|||
 |
t| | D ] }| } |
 }||vr|
t| |||
dd qq|
jjD ]}t|t}||j|
|
|| q|
j |
 D ]}|
 j|< |
 j|< j||j|
 < qqntj D ]}td| ||t t| q<tjj!D ]9}
t|
t"j#rt$|
j%D ])}||	v ssJ | d	|	  |	| j	j}td|| ||t t| qaqSjD ]}|tjj&v r||t t| tjj'(| qdd t)tjj& D fddtjj'D tj_*jD ]}
|
+|
   qȈjD ]}
|
j,D ]
}|j	j-.|
 qݐqdS )zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                    s   | j v r j |  S | S r1   )r  r  )r   r4   r5   r6   r   H  s   
z.Scheduler.compute_dependencies.<locals>.renamec                    s~   | h}j |  }tt|jj}|jjD ](}|jj v r<t|tj	r<t|tj	r<|j
|j
kr<|j|jkr<| |j q|S r1   )r   r1  r2  rt   ru   rv   r~   rL   r   r.  rQ  rU  rr  )	node_nameZreachable_namesre   r4  r   )dep_closurer4   r5   r6   r  M  s   


z3Scheduler.compute_dependencies.<locals>.dep_closureFc                    s    |   t||| d S r1   )r   r   )Zused_by_nameZ	user_noder   r  )name_to_usersr   r5   r6   add_user\  s   
z0Scheduler.compute_dependencies.<locals>.add_userzscheduling %sz not in r   T)r  zscheduling output %sz+scheduling output %s for unbacked symint %sc                 S      i | ]\}}||qS r5   r5   )r   rQ  r~   r5   r5   r6   r    r  z2Scheduler.compute_dependencies.<locals>.<dictcomp>c                    r  r5   r5   r  )	inp_namesr5   r6   r     s    z2Scheduler.compute_dependencies.<locals>.<listcomp>N)FF)/r   r  r   r  r<   r   r  ry   r9   re   Zget_unbacked_symbol_defsrL   rM  SymbolZget_unbacked_symbol_usesr   r   r   r   r   r   rt   rv   r~   r   r   r  r   r   r)   r   get_output_namesrT   Zgraph_outputsr   ZShapeAsConstantBufferr   shaper  Zmutated_inputsr   rp  Zmutated_input_idxsr   rf   rV   r   )r4   r,   Z
node1_nameZ
node2_nameZlist1Zlist2combinedrG   r  Zunbacked_symbol_to_origin_nodere   sZalt_namer  Z
other_nameZknown_dep_node_namesr   r  r  r~   r  r5   )r  r  r  r   r4   r6   r  .  s   











zScheduler.compute_dependenciesc           
      C   s   i }| j D ]}t|tr|jD ]}||| < q||| < q| j D ]}g |_g |_q"| j D ]}g }|jD ]}|j|v s>J ||j }|	| q5||_q.i }| j D ]}|jD ]}|
|g 	| qWqR| D ]\}}	|	|_qhd S r1   )r  rL   rb   r	  r<   rg   rV   rU   r~   r   
setdefaultitems)
r4   Zbuf_to_snodere   r   rV   rX   Zdep_nodeZnode_to_usersZinverse_userrf   r5   r5   r6   r    s0   








zScheduler.compute_node_usersc                    s   d}|rKg }| j D ]3}dtfdd |  o"t fdd|jD }|s+|| q	td|  t	j
j|  q	t| j t|k}|| _ |s| j D ]}|  qNdS )	z0
        Remove any nodes without users
        Tr  c                 S   s   | j p
|  tjjv S r1   )r  r<   r)   r   r   )r  r5   r5   r6   can_eliminate_user  r   z;Scheduler.dead_node_elimination.<locals>.can_eliminate_userc                 3   s    | ]} |V  qd S r1   r5   )r   ur  r5   r6   r    s    
z2Scheduler.dead_node_elimination.<locals>.<genexpr>zremoved dead node: %sN)r  r   r   r  rf   r   ry   r9   r<   r)   r   r   r   r   r   )r4   ZagainZupdated_nodesre   Zcan_eliminater5   r  r6   r    s$   


zScheduler.dead_node_eliminationc                    sb   t  t  g  fdd| jD ]}| D ]}| |< qq| jD ]}| q%| _dS )zD
        Ensure self.nodes is in topologically sorted order
        c                    sJ   | vr# |  t| jdd dD ]	} |j  q|  d S d S )Nc                 S   r   r1   r   )dr5   r5   r6   r   	  r   zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>rF   )r   rN   rU   r~   r   )r  rX   r   rS   seenvisitr5   r6   r    s   
z2Scheduler.topological_sort_schedule.<locals>.visitN)rM   rF  r  r   )r4   re   r~   r5   r  r6   r    s   




z#Scheduler.topological_sort_schedulec                 C   sr   i }| j D ]!}t }|jD ]}||j |||j O }q||| < ||_qt| j D ]
\}}||_||_	q,dS )z.
        Populate each node.ancestors
        N)
r  rM   rU   r   r~   r<   ri   rp  rd  re  )r4   Zname_to_ancestorsre   ri   rX   r  r5   r5   r6   r    s   

zScheduler.compute_ancestorsc                 C   sz   t dD ]6}t| j}td|d | |   t| j}td|d || ||ks/|dkr:td|d   dS qdS )zO
        Mutates self.nodes to combine nodes into FusedSchedulerNodes.
        
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r  r   r  r8   r9   fuse_nodes_once)r4   ro  Zold_lenZnew_lenr5   r5   r6   r  &  s$   


zScheduler.fuse_nodesc                 C   s>   t |dksJ |d  }| tj_|| _| |}||S )
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   )r   r   r)   r   rc   r  r  benchmark_fused_nodes)r4   r  r$  backendr5   r5   r6   r  ;  s   

zScheduler.benchmark_fused_nodesc           	   
      s  t jsdS | rdS | }|d  }|jdkrdS | }|| }tdd |D r/dS ddlm} z>| 	|\ t
 rJtd W dS | 	|\t
r^td	 W dS | 	|\t
rrtd
 W dS W n |y } zdt|v rW Y d}~dS  d}~ww ttjrÈ  k rtd| | t   d ntd| | t   d tdr  krf| jvr| jf td fdd   k S )z
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        Tr   cpuc                 s   s8    | ]}t |jd ot |jjdo|jjjdkV  qdS )r   scatter_moderX  N)r   re   r   r  r  r5   r5   r6   r  `  s    
z.Scheduler.speedup_by_fusion.<locals>.<genexpr>)CompilationErrorz>cannot fuse (benchmark): register spilling of the first kernelFz?cannot fuse (benchmark): register spilling of the second kernelz>cannot fuse (benchmark): register spilling of the fused kernelzLoop-carried variableNz9can fuse (benchmark): fusing %s with %s cause %sx speedupz.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdownZslow_fusionc                	      s       dS )N)Zkernel1_pathZkernel1_latencyZkernel2_pathZkernel2_latencyZfused_kernel_pathZfused_kernel_latencyZslow_down_ratior5   r5   Zms1Zms2Zms_fusedZpath1Zpath2Z
path_fusedr5   r6   r     s   
z-Scheduler.speedup_by_fusion.<locals>.<lambda>)r   Zbenchmark_fusionr   r   r   rn   rw  Ztriton.compiler.errorsr  r  mathisinfr8   r9   rD   r  r  DEBUGr   r&   r'   r   r  r   r   r  )	r4   r,   r-   Znode_list_1r$  Znode_list_2Znode_list_fusedr  er5   r  r6   speedup_by_fusionG  s   



zScheduler.speedup_by_fusionc                    s   t | j}|  D ]H\}}| j|  }| j|  }| ||rQ| ||sQ| ||s.q	t|| |	| |	| |
  | j fdd  D  q	t|dd d| _|   |   dS )a  
        Mutates self.nodes to combine nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuses(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        c                    s   i | ]}|   qS r5   r   r  Znode3r5   r6   r    rP  z-Scheduler.fuse_nodes_once.<locals>.<dictcomp>c                 S   r   r1   rc  r   r5   r5   r6   r     r   z+Scheduler.fuse_nodes_once.<locals>.<lambda>rF   N)rM   r  get_possible_fusionsr   r   r  will_fusion_create_cycler  ra   remover   rr  r   rN   r  r   )r4   r  r,   r-   r5   r  r6   r    s(   




zScheduler.fuse_nodes_oncec                 C   s   | j D ]}|| j qd S r1   )r  r   r   ry  r5   r5   r6   r     s   
zScheduler.prune_redundant_depsc           	         s  g  t   fdd}tt}jD ]}| D ]	}|| | qq| D ]}|| q*tj	rYtt}jD ]}t
|dd}|rM|| | q<| D ]}|| qR jjdd ttjrtdt   D ]}td| qqtd	  S )
z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        c                    s   t | D ]>\}}| |d d  D ]1}||f}|v rq| ||r, | q| s4| rA||rA ||f qqd S )Nr   )rp  r   r  r   r   r_   )r  Znode1_indexr,   r-   rG   Zpossible_fusionsr  r4   r5   r6   check_all_pairs  s   
z7Scheduler.get_possible_fusions.<locals>.check_all_pairsr9  NT)rG   reversez
found %d possible fusions:z%sr   )rM   r   r  r   r  r   r   r   r   aggressive_fusionrs   r  score_fusion_keyr8   r  r  r  r9   r   )	r4   r   Zbuffer_names_groupingre   r  Znode_groupingZgroup_groupingr9  r*   r5   r  r6   r    s4   






zScheduler.get_possible_fusionsc                    sh    fddt  | | B |j|jB   tfdd D }|r2t||d |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        c                    s^   t | tr-| vr-|  |   rdS t| j@ p,tfdd| j  D S dS )NFc                 3       | ]
} j | V  qd S r1   r   r  
found_pathr4   r5   r6   r    s
    
zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>)rL   rb   r   r   issubsetr   ri   rw  r  Zcombined_ancestorsZcombined_namesr  r4   visitedr5   r6   r    s   

z6Scheduler.will_fusion_create_cycle.<locals>.found_pathc                 3   r  r1   r  r  r  r5   r6   r    s    z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>zwill create cycle)rM   r   ri   rw  r+   )r4   r,   r-   cycler5   r	  r6   r    s   z"Scheduler.will_fusion_create_cycler,   r-   c                 C   s*   t t|j|j t|j|j }|dkS )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r)  absrd  re  )r4   r,   r-   proximity_scorer5   r5   r6   can_fusion_increase_peak_memory  s
   z)Scheduler.can_fusion_increase_peak_memoryc                    s   |u rdS t  |}t ttfr  s|d dS t|ttfr-| s-|d dS   s5| r;t |S |  j	@ rH|d dS t t
tfrmt|trmt|jtjrmt fdd|jj D rmdS | rw|d dS   r| s| stjs|d dS   }| }||kr|d	|| dS ~ |d
k}|rtjr  s| r|d dS   s| st  t|  tjkr|d dS   |j	@ r |sdS | |S  |r|d dS | |S )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        Fznode1 is extern or nopznode2 is extern or nopznode1 must go before node2c                 3   s*    | ]}|j v o j | V  qd S r1   )r  r^  )r   Znode2_used_bufr,   r4   r5   r6   r  _  s    

z%Scheduler.can_fuse.<locals>.<genexpr>z!templates can only fuse epiloguesztemplate epilogue not satisfiedzdevice mismatch (%s vs %s)r   zno shared datazexceeds max fusionzwill increase peak memory) r+   rL   r   r  r   r_   r`   r  r   ri   rb   r   r8  r   r>  rw  Zreads_name2exprr  r   r[   r   Zepilogue_fusionr   score_fusion_memoryr  r   r   Zmax_fusion_sizecan_fuse_verticalr  r  can_fuse_horizontal)r4   r,   r-   r  r$  Zdevice2Zno_shared_datar5   r  r6   r  6  s   


zScheduler.can_fusec           
      C   s   |  }t }t||}|jD ]E}|jjD ]>}|j|jkrSt|t|krSt|j	dsSt|j	dsS|j	|j	krSt
|jt
|jkrS|jdt
|j |jkrS|| qqdd |j| D }||@ ri|d dS |D ]}	|| j|	 j@ r||d  dS qkdS )	a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        tmpNc                 S   r   r5   r   r   r5   r5   r6   r     r  z.Scheduler.can_fuse_vertical.<locals>.<setcomp>zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r   rM   r+   rU   rt   ru   r~   rn   r"   rQ  r   rU  r   r   ri   )
r4   r,   r-   Znode1_namesZcomputed_depsr  r  cdZremaining_depsr~   r5   r5   r6   r    s6   




zScheduler.can_fuse_verticalc                 C   sb   |  ||}tt|j|j t|j|j  }| tjko"|dk| | ko-|dk||fS )a\  
        Assign a score (higher comes first) to the fusion of node1
        and node2.  When different fusions conflict with each other,
        this is the way we decide what order to run them in.

        Our current score is based on:
        - Estimate of the saved memory operations
        - Fusions closer together in original order
        r   )	r  r)  r  rd  re  r   r   Zepilogue_fusion_firstr[   )r4   r,   r-   Zmemory_scorer  r5   r5   r6   score_fusion  s   
zScheduler.score_fusionc                 C   s@   |j j|j jB |j j|j jB @ }dd |D }tdd |D S )zf
        The first term in our fusion score that estimates number of saved memory operations.
        c                 S   s   h | ]}|  s|qS r5   )Zhas_unbacked_symbolsr   r5   r5   r6   r     s
    z0Scheduler.score_fusion_memory.<locals>.<setcomp>c                 s   rt  r1   )Znumbytes_hintr   r5   r5   r6   r    ru  z0Scheduler.score_fusion_memory.<locals>.<genexpr>)rt   rv   ru   r  )r4   r,   r-   Zcommon_memory_depsr5   r5   r6   r    s   zScheduler.score_fusion_memoryc                 C   s   |\}}|  ||S )z-
        Shim for list.sort(key=...)
        )r  )r4   r  r,   r-   r5   r5   r6   r    s   zScheduler.score_fusion_keyc                 C   sN   t  }tj D ]}|| qt| jD ]}||| j |	|j
 qdS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)rM   r)   r   r  r   rq  r  r   r   rr  rj   )r4   r   r  re   r5   r5   r6   r    s   zScheduler.compute_last_usagec                 C   s   t | jtjj tjjj D ]:}|| jv r'| j| }| r&tjj	|j
 q|tjjv rHtjj| j}t|tjr>| s@J tjj	|j q| j  dS )z*Free any buffers that are no longer neededN)rN   r  r)   r   r   r   Zfreedr   r   Zcodegen_freere   r  r   rL   r   Z
StorageBoxZis_input_bufferclear)r4   r~   re   Zstorager5   r5   r6   free_buffers  s$   

zScheduler.free_buffersc                    s   t jj}g  t jjD ]}j| j}|dusJ dd |D }||r) | q
fdd}tt|   D ];}|t jj	j
v rot jj	j
| }t|trT|drTq9t fdd|jD }|rg| t jj| q9| q9dS )	zr
        Any buffers that are both created and have a last use in the
        same kernel can be removed.
        Nc                 S   s   h | ]	}|j s| qS r5   )r  r<   r  r5   r5   r6   r     s    z8Scheduler.remove_kernel_local_buffers.<locals>.<setcomp>c                    s.   | t jjvo| t jjjvo|  jvo|  jvS r1   )r)   r   Zmust_keep_buffersr/   Zinput_buffersr  r   r  r=   r5   r6   remove_filter  s   z<Scheduler.remove_kernel_local_buffers.<locals>.remove_filterREMOVEDc                 3   s    | ]}| v V  qd S r1   r5   r  )names_to_remover5   r6   r  '  ru  z8Scheduler.remove_kernel_local_buffers.<locals>.<genexpr>)r)   r   Zstore_buffer_namesr   rf   r  r   r   filterr/   r   rL   rD   
startswithr  Zother_namesremove_inplace_bufferZinplaced_to_remover   remove_buffer)r4   Zfused_node_namesZout_bufrf   r  r~   r  r  r5   )r  r4   r6   remove_kernel_local_buffers  s,   


z%Scheduler.remove_kernel_local_buffersc                 C   s,   t d| dtjjj|< tjj| d S )Nzremove_buffer(%r)r  )ry   r9   r)   r   r/   Zoutput_buffersr   r   r  r5   r5   r6   r  .  s   zScheduler.remove_bufferc                 C   sD   t d| tjjj| j}|ddtjjj|< tjj	| d S )Nzremoving_inplace_buffer(%r)Z
in_out_ptrr  )
ry   r9   r)   r   r/   r   
inner_namer   r   r   )r4   r~   r!  r5   r5   r6   r  6  s   zScheduler.remove_inplace_bufferc                 C   s$   | j  D ]}|  q|   d S r1   )r  r   flushr  )r4   r  r5   r5   r6   r"  >  s   
zScheduler.flushscheduler_nodec                 C   s   t |tsJ ttdd |  |  W d    n1 s"w   Y  |j}t |tj	s9J dt
||tjj |   d S )NF)Zincrease_kernel_countztype(node)=)rL   r   r)   Zset_kernel_handlerr   r   r   re   r   r%  rn   r   r   r   r  )r4   r#  re   r5   r5   r6   codegen_extern_callC  s   
zScheduler.codegen_extern_callr$  c                 C   s   |j dks|jd usJ | dtj| t|j }|d u r(td|j  |j dkrOt sOtj	
|}|jdk rKtd|j d|j d|j td|| S )	Nr  z( should have been normalized in loweringzUnsupported device type:    zFound z which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability .zCannot find a working triton installation. More information on installing Triton can be found at https://github.com/openai/triton)rn   rQ  r)   r   Zadd_device_infor   RuntimeErrorr   r   r  Zget_device_propertiesmajorr~   minor)r4   r$  Zdevice_schedulingZdevice_propsr5   r5   r6   create_backendQ  s"   

zScheduler.create_backendc                 C   s$   || j vr| || j |< | j | S r1   )r  r*  )r4   r$  r5   r5   r6   r  h  s   

zScheduler.get_backendc                    sH   fdd  fdd|  D }|r"t|\}}tjj| d S d S )Nc                    s2   |  j vr j dd t| jjD   j |  S )Nc                 S   r  r5   r5   )r   ro  r  r5   r5   r6   r  p  rP  z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>)r  rr  rp  r   r  r  r=   r5   r6   	get_ordern  s   

z*Scheduler.enter_context.<locals>.get_orderc                    s&   g | ]}|j jD ]} ||fqqS r5   )re   r   )r   r  r  )r+  r5   r6   r   s  s   & z+Scheduler.enter_context.<locals>.<listcomp>)r   r)  r)   r   r   enter_context)r4   re   r   rO  lastr5   )r+  r4   r6   r,  m  s   zScheduler.enter_contextc                 C   s&  | j D ]}ztd| |  W n ty- } ztd|  W Y d }~nd }~ww | | t|ts|	 }|| j
ksI| sI| rM|   || j
kr|jdkrx| j
rf| j
jdkrftjj  |jd usoJ dtjj|j n| j
r| j
jdkrtjj  || _
| j|j | r| ^}}| ||| n4| r| | n*| r| || nt|ttfr| | |  nt|tsJ |!  t"j#rtjj$| t"j%j&r| |'  | j(|)  t|ts|	 }| |* r|   q|   d S )Nz5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0r  zdevice should have an index)+r  ry   r9   r<   r,  rx   r,  rL   r  r   r  r   r   r"  rn   r)   r   r   Zcodegen_device_guard_exitrQ  Zcodegen_device_guard_enterr  rr  rj   r   r  codegen_templater$  r_   Zcodegen_foreachrb   r   codegen_nodesr   r   Zdebug_check_inf_and_nanZgenerate_inf_and_nan_checkerr   Zdebug_sync_kernelcodegen_syncr   r   ready_to_flush)r4   re   r  r$  epiloguer5   r5   r6   r   x  sn   




zScheduler.codegenc                 C   sJ   |t jjv s|t jjv rdS | j| }|j }t|tj	r#|
  S dS rZ   )r)   r   r  r  r   re   r   rL   r   r   Zmaybe_guard_aligned)r4   r  re   r   r5   r5   r6   is_unaligned_buffer  s   


zScheduler.is_unaligned_buffer),r@   rA   rB   r   r7   r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r0   r  r  r  r  r  r  r  r  r   r  r  r"  r   r$  r   r$  r*  r  r,  r   r3  r_  r5   r5   r=  r6   rd     sV    S !`/%
^+'
@rd   c                   @   s   e Zd ZdedefddZdedefddZdd Zd	ed
ee fddZ	dee fddZ
dd ZdefddZdd Zdd ZdS )BaseSchedulingr,   r-   c                 C      t  )zO
        Check whether node1 and node2 can be vertically fused or not.
        r}  r3   r5   r5   r6   r       z BaseScheduling.can_fuse_verticalc                 C   r5  )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r}  r3   r5   r5   r6   r    r6  z"BaseScheduling.can_fuse_horizontalc                 C   r5  )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r}  )r4   rI  r5   r5   r6   r<    r6  zBaseScheduling.group_fntemplate_nodeepilogue_nodesc                 C   r5  )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r}  )r4   r7  r8  r5   r5   r6   r.    s   	zBaseScheduling.codegen_templater  c                 C   r5  )zD
        Generate a kernel given a list of pre-fused nodes.
        r}  r4   r  r5   r5   r6   r/    r6  zBaseScheduling.codegen_nodesc                 C   r5  )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r}  r=   r5   r5   r6   r0    r6  zBaseScheduling.codegen_syncrp   c                 C   rY   )z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr5   r=   r5   r5   r6   r1    s   zBaseScheduling.ready_to_flushc                 C   r5  )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r}  r=   r5   r5   r6   r"    r6  zBaseScheduling.flushc                 C   r5  )r  r}  r9  r5   r5   r6   r    s   z$BaseScheduling.benchmark_fused_nodesN)r@   rA   rB   r0   r  r  r<  r   r   r.  r/  r0  r   r1  r"  r  r5   r5   r5   r6   r4    s    
r4  r?   )r5   )[r   dataclassesr  r   r  r  r  rO   rQ   typingr   r   r   r   r   r   r   r	   r
   r   rM  r   Ztorch._dynamo.utilsr   Ztorch._inductor.metricsr   r   Z%torch.fx.experimental.symbolic_shapesr   Ztorch.utils._tritonr   r   r   r   r   r   r   Zcodegen.commonr   r   Zcomm_analysisr   r   r   r   r   r   r  r   utilsr    r!   r"   r#   r$   r%   r&   r'   r(   Zvirtualizedr)   	getLoggerr@   ry   Z_loggingZgetArtifactLoggerr8   r+   rP   rT   ra   opsZatenZconvolutionmmZbmmZaddmmr&  r0   r   r  r   rb   r`   r  	dataclassr   countr  rd   r4  r5   r5   r5   r6   <module>   s|    0,


	        
.(        -