import logging
import math
from typing import Dict, List, Optional, Tuple, TYPE_CHECKING, Union

import torch
from torch.distributed import is_available

__all__ = ["init_device_mesh", "DeviceMesh"]


if not is_available():
    import sys

    # Stub out the public names so that documentation builds and type checking
    # still work on platforms where torch.distributed is not available.
    class _DeviceMeshStub:
        pass

    def _init_device_mesh_stub():
        pass

    sys.modules["torch.distributed.device_mesh"].DeviceMesh = _DeviceMeshStub  # type: ignore[attr-defined]
    sys.modules[
        "torch.distributed.device_mesh"
    ].init_device_mesh = _init_device_mesh_stub  # type: ignore[attr-defined]

else:
    from torch.distributed.distributed_c10d import (
        _find_pg_by_ranks_and_tag,
        _get_default_group,
        _get_group_tag,
        get_rank,
        get_world_size,
        init_process_group,
        is_initialized,
        new_group,
        ProcessGroup,
    )

    logger = logging.getLogger(__name__)

    # Only import the numpy typing helper when type checking.
    if TYPE_CHECKING:
        try:
            from numpy.typing import ArrayLike
        except ImportError:
            logger.warning(
                "DeviceMesh requires numpy >= 1.21 to be installed for type checking"
            )

    class _MeshEnv:
        def __init__(self) -> None:
            self.mesh_stack: List["DeviceMesh"] = []
            self.child_to_parent_mapping: Dict["DeviceMesh", "DeviceMesh"] = {}

        def get_current_mesh(self) -> "DeviceMesh":
            if len(self.mesh_stack) == 0:
                raise RuntimeError("No device mesh is currently active!")
            return self.mesh_stack[-1]

        def create_child_mesh(
            self, device_mesh: "DeviceMesh", mesh_dim: int, mesh_dim_name: str
        ) -> "DeviceMesh":
            # Swap `mesh_dim` to the last dim, then reshape so that each row is
            # the 1-D list of ranks forming one child mesh along that dim.
            cur_rank = device_mesh.get_rank()
            pg_ranks_by_dim = device_mesh.mesh.swapdims(-1, mesh_dim).reshape(
                -1, device_mesh.mesh.size(mesh_dim)
            )

            for mesh_1d in pg_ranks_by_dim:
                sub_mesh = DeviceMesh(
                    device_mesh.device_type,
                    mesh_1d,
                    mesh_dim_names=(mesh_dim_name,),
                    _init_process_groups=False,
                )
                if cur_rank in mesh_1d:
                    res_sub_mesh = sub_mesh

            # Reuse the parent's process group for this dim instead of creating
            # a new one, and remember the parent mesh for later lookups.
            res_sub_mesh._dim_group_infos = [device_mesh._dim_group_infos[mesh_dim]]
            self.child_to_parent_mapping[res_sub_mesh] = device_mesh
            return res_sub_mesh

        def get_parent_mesh(self, device_mesh: "DeviceMesh") -> Optional["DeviceMesh"]:
            return self.child_to_parent_mapping.get(device_mesh, None)

        def get_parent_mesh_dim(self, device_mesh: "DeviceMesh") -> Optional[int]:
            """
            Return the index of the mesh dim in the parent mesh.
            The device_mesh passed in needs to be sliced out from a parent mesh.
            """
            parent_mesh = self.get_parent_mesh(device_mesh)
            child_mesh_dim_names = device_mesh.mesh_dim_names
            if parent_mesh and child_mesh_dim_names:
                assert (
                    len(child_mesh_dim_names) == 1
                ), "The child mesh can only be a 1D mesh."
                child_mesh_dim_name = child_mesh_dim_names[0]
                if parent_mesh.mesh_dim_names:
                    return parent_mesh._get_mesh_dim_by_name(child_mesh_dim_name)
            return None

        @staticmethod
        def num_devices_per_host(device_type: str) -> int:
            return _get_device_handle(device_type).device_count()

        @staticmethod
        def num_hosts(device_type: str) -> int:
            # The process group cannot tell us this directly, so infer it,
            # assuming homogeneous hardware.
            return get_world_size() // _MeshEnv.num_devices_per_host(device_type)
    _mesh_resources: _MeshEnv = _MeshEnv()

    def _get_device_handle(device_type: str = "cuda"):
        """
        Get the module corresponding to the device_type which is cuda or cuda-like device.
        For example, when the device_type is cuda, the module `torch.cuda` is returned.
        Return None when there is no corresponding module for device_type, otherwise
        return the corresponding module.
        """
        return getattr(torch, device_type, None)

    class DeviceMesh:
        """
        DeviceMesh represents a mesh of devices, where the layout of devices can be
        represented as an n-dimensional array, and each value of the n-dimensional
        array is the global rank id of that device in the default process group.

        DeviceMesh could be used to describe the layout of devices across the cluster,
        and serves as a proxy for communication among the device lists within the cluster.

        DeviceMesh can be used as a context manager.
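        For example, used as a context manager the mesh becomes the "current"
        mesh for the duration of the block (shown here with the 2 x 4 mesh from
        the example below)::

            >>> # xdoctest: +SKIP("no rank")
            >>> with DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) as mesh:
            ...     pass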

        .. note::
            DeviceMesh follows an SPMD programming model, which means the same PyTorch Python program
            runs on every process/rank in the cluster. Therefore, users must make sure the
            `mesh` array (which describes the layout of devices) is identical across all ranks.
            An inconsistent `mesh` will lead to a silent hang.

        Args:
            device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
            mesh (ndarray): A multi-dimensional array or an integer tensor describing the layout
                of devices, where the IDs are global IDs of the default process group.

        Returns:
            DeviceMesh: A :class:`DeviceMesh` object representing the device layout.

        The following program runs on each process/rank in an SPMD manner. In this example, we have 2
        hosts with 4 GPUs each.
        A reduction over the first dimension of mesh reduces across
        columns (0, 4), .., and (3, 7); a reduction over the second dimension
        of mesh reduces across rows (0, 1, 2, 3) and (4, 5, 6, 7).

        Example::
            >>> # xdoctest: +SKIP("no rank")
            >>> from torch.distributed.device_mesh import DeviceMesh
            >>>
            >>> # Initialize device mesh as (2, 4) to represent the topology
            >>> # of cross-host(dim 0), and within-host (dim 1).
            >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
        """

        device_type: str
        mesh: torch.Tensor
        mesh_dim_names: Optional[Tuple[str, ...]]

        def __init__(
            self,
            device_type: str,
            mesh: Union[torch.Tensor, "ArrayLike"],
            *,
            mesh_dim_names: Optional[Tuple[str, ...]] = None,
            _init_process_groups: bool = True,
        ) -> None:
            self.device_type = device_type
            self.mesh = (
                mesh.detach()
                if isinstance(mesh, torch.Tensor)
                else torch.tensor(mesh, dtype=torch.int)
            )
            self.mesh_dim_names = mesh_dim_names

            # Private fields used to pre-compute this DeviceMesh's hash.
            self._flatten_mesh_list = tuple(self.mesh.flatten().tolist())
            self._hash = hash((self._flatten_mesh_list, self.mesh.shape, id(self)))

            # Skip process group initialization for xla devices.
            if device_type != "xla":
                # Always try to create the default (world) process group, even if
                # it is not initialized already; the world group is needed to
                # locate the current global rank on the mesh.
                self._get_or_create_default_group()
                if _init_process_groups:
                    self._init_process_groups()
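        # Note on the hash/equality fields above (illustrative sketch, not a
        # behavioral guarantee): two meshes built from the same layout compare
        # equal through `__eq__` (same shape, same flattened rank list), while
        # `__hash__` is seeded with `id(self)`, so equal meshes generally do
        # not hash alike, e.g.:
        #
        #     a = DeviceMesh("cuda", [[0, 1], [2, 3]])
        #     b = DeviceMesh("cuda", [[0, 1], [2, 3]])
        #     assert a == b              # same layout
        #     # hash(a) != hash(b) in general, since the hash is id-seeded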
zDeviceMesh.__init__c                 C   s   t  }|st  t }| j |krtd| j  dt| j}|sK|rK| }||krC|| dkrCtd| d| d| j d|	t
 |  | jt
 k }|ddv s\J |ddkri|d  nd | _t S )	Nz=Mesh should not be bigger than default world size, but found z ranks!r   z8DeviceMesh only support homogeneous hardware, but found z ranks and  z	 devices!)r   r8   )r   r   r   r.   numelr&   r;   r2   r<   Z
set_devicer   Znonzeror1   rO   _coordinate_on_dimr   )r"   Zdefault_initializedZ
world_sizeZdevice_handler>   Zrank_coordsr   r   r   rU      s8   
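        # Worked example for the coordinate bookkeeping above (illustrative):
        # with mesh [[0, 1, 2, 3], [4, 5, 6, 7]], global rank 6 satisfies
        # (self.mesh == 6).nonzero() == [[1, 2]], so _coordinate_on_dim becomes
        # [1, 2] -- row 1 on the host dim, column 2 on the intra-host dim.
        # Ranks that do not appear in the mesh get None.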
        def _init_process_groups(self):
            # Group tag/ranks associated with each mesh dimension; each mesh
            # dimension should get exactly one sub-group per rank.
            dim_group_infos: List[Tuple[str, List[int]]] = []

            if self.mesh.ndim == 1 and self.mesh.numel() == get_world_size():
                # If the mesh spans the whole world, reuse the default process
                # group for dim 0, since new_group cannot duplicate the world group.
                dim_group_infos.append(
                    (
                        _get_group_tag(_get_default_group()),
                        list(range(get_world_size())),
                    )
                )
            else:
                # Create sub process groups based on the mesh argument specified.
                for dim in range(self.mesh.ndim):
                    # Swap the current dim to the last dim, then reshape to
                    # flatten out the other dims, so each row is one group.
                    pg_ranks_by_dim = self.mesh.swapdims(-1, dim).reshape(
                        -1, self.mesh.size(dim)
                    )
                    for dim_mesh in pg_ranks_by_dim:
                        subgroup_ranks = dim_mesh.tolist()
                        # Call new_group on every rank: all ranks must participate
                        # in subgroup construction, even non-members.
                        dim_group = new_group(ranks=subgroup_ranks)
                        # Only record the group that the current rank belongs to.
                        if self.get_rank() in subgroup_ranks:
                            if len(dim_group_infos) > dim:
                                raise RuntimeError(
                                    f"Each device mesh dimension should get only one process group, "
                                    f"but got {self.get_rank()} in {subgroup_ranks}!"
                                )
                            dim_group_infos.append(
                                (_get_group_tag(dim_group), subgroup_ranks)
                            )
            self._dim_group_infos = dim_group_infos

        def __enter__(self) -> "DeviceMesh":
            # Set this mesh as the current mesh in the mesh environment.
            _mesh_resources.mesh_stack.append(self)
            return self

        def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
            # Pop this mesh from the mesh environment.
            _mesh_resources.mesh_stack.pop()

        def __repr__(self) -> str:
            return f"DeviceMesh:({self.mesh.tolist()})"

        def __hash__(self):
            return self._hash

        def __eq__(self, other: object) -> bool:
            if not isinstance(other, DeviceMesh):
                return False
            if id(self.mesh) == id(other.mesh):
                return True
            return (
                self.mesh.shape == other.mesh.shape
                and self._flatten_mesh_list == other._flatten_mesh_list
            )

        def __getitem__(self, mesh_dim_name: str) -> "DeviceMesh":
            """
            Slice the current DeviceMesh based on the mesh_dim_name given to create a child
            DeviceMesh.

            Args:
                mesh_dim_name (str): the name of the mesh dimension of the parent DeviceMesh
                to create a child DeviceMesh for.
            Returns:
                A :class:`DeviceMesh` object

            The following program runs on each process/rank in an SPMD manner. In this example, we have 2
            hosts with 4 GPUs each.
            Calling mesh["tp"] on rank 0, 1, 2, 3 would return a 1D child DeviceMesh:([0, 1, 2, 3]).
            Calling mesh["tp"] on rank 4, 5, 6, 7 would return a 1D child DeviceMesh:([4, 5, 6, 7]).
            Calling mesh["dp"] on rank 0, 4 would return a 1D child DeviceMesh:([0, 4]).
            Calling mesh["dp"] on rank 1, 5 would return a 1D child DeviceMesh:([1, 5]).
            Calling mesh["dp"] on rank 2, 6 would return a 1D child DeviceMesh:([2, 6]).
            Calling mesh["dp"] on rank 3, 7 would return a 1D child DeviceMesh:([3, 7]).

            Example::
                >>> # xdoctest: +SKIP("no rank")
                >>> from torch.distributed.device_mesh import DeviceMesh
                >>>
                >>> # Initialize device mesh as (2, 4) to represent the topology
                >>> # of cross-host(dim 0), and within-host (dim 1).
                >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
            """
            if self.mesh.ndim == 1:
                raise RuntimeError(
                    f"Cannot slice a DeviceMesh with {self.mesh.ndim} dimension."
                )

            mesh_dim = self._get_mesh_dim_by_name(mesh_dim_name)
            submesh = _mesh_resources.create_child_mesh(self, mesh_dim, mesh_dim_name)
            return submesh

        def get_group(
            self, mesh_dim: Optional[Union[int, str]] = None
        ) -> Union[ProcessGroup, List[ProcessGroup]]:
            """
            Returns a list of ProcessGroups corresponding to the mesh dimensions, or
            returns a single ProcessGroup if mesh_dim is specified or the given mesh has
            only one mesh dimension.

            Args:
                mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
                of the mesh dimension. Default is None.

            Returns:
                A list of :class:`ProcessGroup` objects when `mesh_dim` is not specified for
                a DeviceMesh with more than 1 dimension; otherwise, returns a single
                :class:`ProcessGroup` object.
            """
            if not hasattr(self, "_dim_group_infos"):
                raise RuntimeError("DeviceMesh process groups not initialized!")

            if self.mesh.ndim == 1:
                return _find_pg_by_ranks_and_tag(*self._dim_group_infos[0])

            if mesh_dim is not None:
                if isinstance(mesh_dim, str):
                    mesh_dim = self._get_mesh_dim_by_name(mesh_dim)
                return _find_pg_by_ranks_and_tag(*self._dim_group_infos[mesh_dim])
            else:
                dim_groups = []
                for ith_dim in range(self.mesh.ndim):
                    dim_groups.append(
                        _find_pg_by_ranks_and_tag(*self._dim_group_infos[ith_dim])
                    )
                return dim_groups

        def size(self, mesh_dim: Optional[int] = None) -> int:
            return self.mesh.numel() if mesh_dim is None else self.mesh.size(mesh_dim)

        @property
        def ndim(self) -> int:
            return self.mesh.ndim

        @property
        def shape(self) -> Tuple[int, ...]:
            return tuple(self.mesh.shape)

        def get_rank(self) -> int:
            """
            Returns the current global rank.
            """
            return get_rank()

        def get_local_rank(self, mesh_dim: Optional[Union[int, str]] = None) -> int:
            """
            Returns the local rank of the given mesh_dim of the DeviceMesh.

            Args:
                mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
                of the mesh dimension. Default is None.

            Returns:
                An integer that denotes the local rank.

            The following program runs on each process/rank in an SPMD manner. In this example, we have 2
            hosts with 4 GPUs each.
            Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 0, 1, 2, 3 would return 0.
            Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 4, 5, 6, 7 would return 1.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 0, 4 would return 0.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 1, 5 would return 1.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 2, 6 would return 2.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 3, 7 would return 3.

            Example::
                >>> # xdoctest: +SKIP("no rank")
                >>> from torch.distributed.device_mesh import DeviceMesh
                >>>
                >>> # Initialize device mesh as (2, 4) to represent the topology
                >>> # of cross-host(dim 0), and within-host (dim 1).
                >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
            """
            if self.ndim > 1 and mesh_dim is None:
                raise RuntimeError(
                    f"Found the DeviceMesh have {self.mesh.ndim} dimensions",
                    "Optional kwarg `mesh_dim` needs to be specified when device_mesh.ndim > 1.",
                )
            elif mesh_dim is None:
                mesh_dim = 0

            mesh_dim_group = self.get_group(mesh_dim)
            assert isinstance(
                mesh_dim_group, ProcessGroup
            ), "We expect ProcessGroup before calling `get_rank`!"
            return get_rank(mesh_dim_group)

        def get_coordinate(self) -> Optional[List[int]]:
            """
            Return the relative indices of this rank relative to all
            dimensions of the mesh. If this rank is not part of the mesh, return None.
            """
            return self._coordinate_on_dim if self._coordinate_on_dim else None

        def _get_mesh_dim_by_name(self, mesh_dim_name: str) -> int:
            if self.mesh_dim_names is None or len(self.mesh_dim_names) == 0:
                raise KeyError(
                    "No `mesh_dim_names` found.",
                )
            if mesh_dim_name not in self.mesh_dim_names:
                raise KeyError(
                    f"Mesh dimension '{mesh_dim_name}' does not exist.",
                    f"Available mesh dimensions are: {self.mesh_dim_names}",
                )
            return self.mesh_dim_names.index(mesh_dim_name)

    def init_device_mesh(
        device_type: str,
        mesh_shape: Tuple[int, ...],
        *,
        mesh_dim_names: Optional[Tuple[str, ...]] = None,
    ) -> DeviceMesh:
        """
        Initializes a `DeviceMesh` based on `device_type`, `mesh_shape`, and `mesh_dim_names` parameters.

        This creates a DeviceMesh with an n-dimensional array layout, where `n` is the length of `mesh_shape`.
        If `mesh_dim_names` is provided, each dimension is labeled as `mesh_dim_names[i]`.

        .. note::
            `init_device_mesh` follows SPMD programming model, meaning the same PyTorch Python program
            runs on all processes/ranks in the cluster. Ensure `mesh_shape` (the dimensions of the nD array
            describing device layout) is identical across all ranks. Inconsistent `mesh_shape` may lead to hanging.

        .. note::
            If no process group is found, init_device_mesh will initialize distributed process group/groups
            required for distributed communication behind the scenes.

        Args:
            device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
            mesh_shape (Tuple[int]): A tuple defining the dimensions of the multi-dimensional array
                describing the layout of devices.
            mesh_dim_names (Tuple[str], optional): A tuple of mesh dimension names to assign to each dimension
                of the multi-dimensional array describing the layout of devices. Its length must match the length
                of `mesh_shape`. Each string in `mesh_dim_names` must be unique.

        Returns:
            DeviceMesh: A :class:`DeviceMesh` object representing the device layout.

        Example::
            >>> # xdoctest: +SKIP("no rank")
            >>> from torch.distributed.device_mesh import init_device_mesh
            >>>
            >>> mesh_1d = init_device_mesh("cuda", mesh_shape=(8,))
            >>> mesh_2d = init_device_mesh("cuda", mesh_shape=(2, 8), mesh_dim_names=("dp", "tp"))

        """
        if mesh_dim_names is not None:
            if len(set(mesh_dim_names)) != len(mesh_dim_names):
                raise RuntimeError(
                    "Each mesh_dim_name must be unique.",
                    f"Found repeated mesh_dim_name in mesh_dim_names {mesh_dim_names}",
                )

            if len(mesh_shape) != len(mesh_dim_names):
                raise RuntimeError(
                    "mesh_shape and mesh_dim_names should have same length!",
                    f"Found len(mesh_dim_names): {len(mesh_dim_names)} and len(mesh_shape):{len(mesh_shape)}.",
                )

        mesh = torch.arange(math.prod(mesh_shape)).view(mesh_shape)
        device_mesh = DeviceMesh(
            device_type=device_type,
            mesh=mesh,
            mesh_dim_names=mesh_dim_names,
        )

        return device_mesh
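    # End-to-end usage sketch (illustrative only; nothing here runs at import
    # time). Launched via torchrun with 8 ranks over 2 hosts x 4 GPUs:
    #
    #     from torch.distributed.device_mesh import init_device_mesh
    #
    #     mesh_2d = init_device_mesh("cuda", (2, 4), mesh_dim_names=("dp", "tp"))
    #     dp_mesh, tp_mesh = mesh_2d["dp"], mesh_2d["tp"]  # 1-D child meshes
    #     tp_group = tp_mesh.get_group()          # single ProcessGroup for that dim
    #     tp_rank = mesh_2d.get_local_rank("tp")  # position along the "tp" dim
    #     coord = mesh_2d.get_coordinate()        # e.g. [1, 2] on global rank 6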