Integrating with Micro Perf

Creating a Backend

Referring to the existing GPU implementation, create a folder for your hardware under ByteMLPerf/byte_micro_perf/backends/, containing the new hardware interface backend_xpu.py and a requirements.txt describing the required runtime environment. Note that when overriding functions such as build_tensor, _run_operation, and initialize_ccl, replace device-specific calls (e.g. torch.cuda.synchronize()) with their counterparts for your hardware.
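The resulting layout might look like the tree below (the XPU folder name is a placeholder for your hardware; only the two files named above are required):

```
byte_micro_perf/backends/
├── GPU/
│   ├── backend_gpu.py
│   └── requirements.txt
└── XPU/
    ├── backend_xpu.py        # new hardware interface
    └── requirements.txt      # runtime dependencies for this hardware
```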

backend_gpu.py
```python
import os
import math
import json
from typing import Any, List, Dict
from datetime import timedelta

from backends.backend import Backend
from backends.module_store import *

import torch
import torch.distributed as dist
import torch.distributed.distributed_c10d as dist_c10d


class BackendGPU(Backend):
    # Get the device name
    def get_device_name(self):
        return torch.cuda.get_device_name(0)

    # If a hardware spec file is provided, read memory info from it
    def get_backend_properties(self):
        self.memory_limit = int(
            torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
        )
        if os.path.exists(self.vendor_path):
            with open(self.vendor_path, 'r') as f:
                self.hw_info_dict = json.load(f)
                # if the vendor path does not exist, please set this param manually
                self.bandwidth_limit = self.hw_info_dict["内存参数"]["内存"]["内存带宽(GB/s)"]

    # Based on device memory capacity, pre-build multiple copies of the input
    # tensors and randomly pick one per run, to avoid L2 cache reuse.
    # If the operator needs extra input preprocessing, it can be done in its
    # process_inputs function, which returns the proper inputs.
    def build_tensor(self, input_shapes, dtype):
        dtype_size = torch.finfo(getattr(torch, dtype)).bits // 8
        size = sum([math.prod(shape) for shape in input_shapes])
        data_amount = size * 2 * dtype_size
        data_cnt = (self.memory_limit - 4) * 1024 ** 3 // data_amount
        data_cnt = min(data_cnt, self.iterations)

        input_tensors_list = []
        for _ in range(data_cnt):
            input_tensors = [
                torch.randn(shape).type(getattr(torch, dtype)).to(torch.device('cuda'))
                for shape in input_shapes
            ]
            input_tensors_list.append(input_tensors)

        rand_idx = torch.randint(0, data_cnt, (1,))
        if hasattr(self.op, "process_inputs"):
            input_tensors = self.op.process_inputs(*(input_tensors_list[rand_idx]))
            return input_tensors
        return input_tensors_list[rand_idx]

    # Override with the device's correct synchronization function
    def sync_xpu(self):
        torch.cuda.synchronize()
        return True

    # Configure distributed parameters such as master_addr and master_port,
    # and set the corresponding communication backend
    def initialize_ccl(self, rank, world_size):
        """
        initialize distributed process groups and relevant ENVs
        """
        torch.manual_seed(1)
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '49373'
        os.environ['LOCAL_RANK'] = str(rank)
        os.environ['RANK'] = str(rank)
        os.environ['WORLD_SIZE'] = str(world_size)

        torch.cuda.set_device(rank)

        # Call the init process
        timeout_seconds = int(os.environ.get("MEGATRON_NCCL_TIMEOUT_SECOND", 30))
        torch.distributed.init_process_group(
            backend="nccl",
            world_size=world_size,
            rank=rank,
            store=None,
            timeout=timedelta(seconds=timeout_seconds))
        self.setup_2d_group()
        print(f'DIST INFO: rank {rank}, world_size {world_size}', flush=True)
```
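As a starting point, a backend_xpu.py might look like the sketch below. The `torch.xpu` namespace (with synchronize(), set_device(), get_device_name()) and the "xccl" process-group backend are hypothetical placeholders; substitute the actual module and backend names your hardware's PyTorch extension provides.

```python
import os
from datetime import timedelta

import torch

from backends.backend import Backend


class BackendXPU(Backend):
    def get_device_name(self):
        # replace with the vendor API that reports the device name
        return torch.xpu.get_device_name(0)

    def sync_xpu(self):
        # replace torch.cuda.synchronize() with the device's sync call
        torch.xpu.synchronize()
        return True

    def initialize_ccl(self, rank, world_size):
        torch.manual_seed(1)
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '49373'
        os.environ['LOCAL_RANK'] = str(rank)
        os.environ['RANK'] = str(rank)
        os.environ['WORLD_SIZE'] = str(world_size)

        torch.xpu.set_device(rank)

        # use the vendor's communication backend instead of "nccl"
        torch.distributed.init_process_group(
            backend="xccl",
            world_size=world_size,
            rank=rank,
            timeout=timedelta(seconds=30))
        self.setup_2d_group()
```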

Workload Description

softmax.json
{ "operator": "softmax", #算子名 "iterations": 100, #迭代次数 "input_shape_list": [ [ 131072, 64 ], [ 131072, 1024 ] ], #算子输入形状,可同时评测多个不同的形状,注意不要超过内存容量进而OOM "dtype": [ "float32", "bfloat16", "half" ] #数据类型,可填写硬件支持的数据类型 }

Evaluation Metrics

  • Memory Size (MB): total amount of memory accessed by the operator's inputs
  • Group: number of groups a communication operator is split into
  • Bus bandwidth (GB/s): communication bandwidth of communication operators; None for compute operators
  • Algo bandwidth (GB/s): IO bandwidth of the operator
  • Bandwidth Utilization (%): memory bandwidth utilization of the operator (see the sketch after this list for how these metrics relate)
  • Avg latency (us): average latency over N iterations
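For intuition, the sketch below shows one plausible relationship between these metrics, assuming Algo bandwidth is the total IO volume divided by the average latency and utilization is measured against the bandwidth_limit read in get_backend_properties; the formulas and function name are illustrative assumptions, not the benchmark's exact code.

```python
def bandwidth_metrics(memory_size_mb: float,
                      avg_latency_us: float,
                      bandwidth_limit_gbps: float):
    """Illustrative-only derivation of the bandwidth metrics above."""
    bytes_moved = memory_size_mb * 1024 ** 2
    # Algo bandwidth: IO volume divided by average latency
    algo_bandwidth_gbps = bytes_moved / (avg_latency_us * 1e-6) / 1e9
    # Bandwidth Utilization: achieved vs. peak device bandwidth
    utilization_pct = algo_bandwidth_gbps / bandwidth_limit_gbps * 100
    return algo_bandwidth_gbps, utilization_pct

# e.g. 1024 MB moved in 1000 us on a device with 2000 GB/s peak:
# algo bandwidth ~= 1073.7 GB/s, utilization ~= 53.7 %
print(bandwidth_metrics(1024, 1000, 2000))
```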