Referring to the existing GPU implementation, create a folder for the target hardware under ByteMLPerf/byte_micro_perf/backends/, containing the new hardware interface backend_xpu.py and a requirements.txt describing its runtime environment.
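For reference, the resulting layout is roughly the following (GPU is the existing reference implementation; the XPU folder name is only a placeholder for your hardware's name):

    ByteMLPerf/byte_micro_perf/backends/
    ├── GPU/
    │   ├── backend_gpu.py
    │   └── requirements.txt
    └── XPU/                    # placeholder name for the new hardware
        ├── backend_xpu.py
        └── requirements.txt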
Note that when overriding functions such as build_tensor, _run_operation, and initialize_ccl, device-specific calls (e.g. torch.cuda.synchronize()) must be replaced with the corresponding hardware's equivalents. The GPU reference implementation is shown below:
import os
import math
import json
from typing import Any, List, Dict
from datetime import timedelta

from backends.backend import Backend
from backends.module_store import *

import torch
import torch.distributed as dist
import torch.distributed.distributed_c10d as dist_c10d


class BackendGPU(Backend):
    # Return the device name
    def get_device_name(self):
        return torch.cuda.get_device_name(0)

    # If a hardware-properties file is provided, read memory information from it
    def get_backend_properties(self):
        self.memory_limit = int(torch.cuda.get_device_properties(0).total_memory / (1024 ** 3))
        if os.path.exists(self.vendor_path):
            with open(self.vendor_path, 'r') as f:
                self.hw_info_dict = json.load(f)
                # if the vendor path does not exist, please set this param manually
                self.bandwidth_limit = self.hw_info_dict["内存参数"]["内存"]["内存带宽(GB/s)"]

    # Based on the device memory capacity, pre-allocate several sets of input tensors
    # and randomly pick one of them for each run, so results are not skewed by the L2 cache.
    # If the operator needs extra preprocessing of its inputs, it can do so in its
    # process_inputs function. Return suitable input tensors.
    def build_tensor(self, input_shapes, dtype):
        dtype_size = torch.finfo(getattr(torch, dtype)).bits // 8
        size = sum([math.prod(shape) for shape in input_shapes])
        data_amount = size * 2 * dtype_size
        data_cnt = (self.memory_limit - 4) * 1024**3 // data_amount
        data_cnt = min(data_cnt, self.iterations)

        input_tensors_list = []
        for _ in range(data_cnt):
            input_tensors = [
                torch.randn(shape).type(getattr(torch, dtype)).to(torch.device('cuda'))
                for shape in input_shapes
            ]
            input_tensors_list.append(input_tensors)

        rand_idx = torch.randint(0, data_cnt, (1,))
        if hasattr(self.op, "process_inputs"):
            input_tensors = self.op.process_inputs(*(input_tensors_list[rand_idx]))
            return input_tensors
        return input_tensors_list[rand_idx]

    # Override with the correct synchronization function for the device
    def sync_xpu(self):
        torch.cuda.synchronize()
        return True

    # Configure distributed parameters such as master_addr and master_port,
    # and set the corresponding communication backend
    def initialize_ccl(self, rank, world_size):
        """
        initialize distributed process groups and relevant ENVs
        """
        torch.manual_seed(1)
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '49373'
        os.environ['LOCAL_RANK'] = str(rank)
        os.environ['RANK'] = str(rank)
        os.environ['WORLD_SIZE'] = str(world_size)
        torch.cuda.set_device(rank)

        # Call the init process
        timeout_seconds = int(os.environ.get("MEGATRON_NCCL_TIMEOUT_SECOND", 30))
        torch.distributed.init_process_group(
            backend="nccl",
            world_size=world_size,
            rank=rank,
            store=None,
            timeout=timedelta(seconds=timeout_seconds))
        self.setup_2d_group()
        print(f'DIST INFO: rank {rank}, world_size {world_size}', flush=True)
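When porting the reference above to a new device, the device-specific calls are what change. The following is only an illustrative sketch of a backend_xpu.py; the torch_xpu module and the "xccl" process-group backend are hypothetical placeholders, so substitute whatever runtime and collective-communication library your hardware actually provides. build_tensor would likewise move tensors with the device's own string (e.g. torch.device('xpu')) instead of 'cuda'.

    import os
    from datetime import timedelta

    import torch.distributed as dist

    from backends.backend import Backend

    # "torch_xpu" and the "xccl" backend below are hypothetical placeholders;
    # replace them with the runtime and collective library shipped with your hardware.
    import torch_xpu


    class BackendXPU(Backend):
        def get_device_name(self):
            # counterpart of torch.cuda.get_device_name(0)
            return torch_xpu.get_device_name(0)

        def sync_xpu(self):
            # counterpart of torch.cuda.synchronize()
            torch_xpu.synchronize()
            return True

        def initialize_ccl(self, rank, world_size):
            os.environ['MASTER_ADDR'] = '127.0.0.1'
            os.environ['MASTER_PORT'] = '49373'
            os.environ['LOCAL_RANK'] = str(rank)
            os.environ['RANK'] = str(rank)
            os.environ['WORLD_SIZE'] = str(world_size)
            torch_xpu.set_device(rank)          # counterpart of torch.cuda.set_device(rank)
            dist.init_process_group(
                backend="xccl",                 # vendor collective backend instead of "nccl"
                world_size=world_size,
                rank=rank,
                timeout=timedelta(seconds=30))
            self.setup_2d_group()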
A workload configuration file describes the operator to be evaluated, for example:

{
    "operator": "softmax",      # operator name
    "iterations": 100,          # number of iterations
    "input_shape_list": [
        [131072, 64],
        [131072, 1024]
    ],                          # operator input shapes; several shapes can be evaluated at once,
                                # but take care not to exceed device memory and trigger an OOM
    "dtype": [
        "float32",
        "bfloat16",
        "half"
    ]                           # data types; list the types supported by the hardware
}
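Because of the OOM caveat above, it can help to estimate a workload's footprint before running it. The short sketch below is illustrative only; it reuses the sizing logic from build_tensor (total elements across all shapes, counted twice, times the dtype size) with the softmax shapes shown here:

    import math
    import torch

    # Footprint of one tensor set for the softmax workload above,
    # mirroring the "size * 2 * dtype_size" estimate used in build_tensor.
    input_shape_list = [[131072, 64], [131072, 1024]]

    for dtype in ["float32", "bfloat16", "half"]:
        dtype_size = torch.finfo(getattr(torch, dtype)).bits // 8
        elems = sum(math.prod(shape) for shape in input_shape_list)
        set_bytes = elems * 2 * dtype_size
        print(f"{dtype}: ~{set_bytes / 1024**2:.1f} MiB per tensor set")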