泛化Matmul工程结构说明
【免费下载链接】catlass本项目是CANN的算子模板库,提供NPU上高性能矩阵乘及其相关融合类算子模板样例。项目地址: https://gitcode.com/cann/catlass
1 工程结构
```
├── CMakeLists.txt
├── README.md
├── dynamic_optimized_matmul.cpp
├── impl
│   ├── kernel
│   │   ├── common_matmul_kernel.h
│   │   ├── ......
│   ├── scripts
│   │   ├── templates
│   │   │   ├── common_matmul_template.py
│   │   │   ├── ......
│   │   ├── utils
│   │   │   └── config.py
│   │   └── wrapper_code_gen.py
│   └── wrapper  # 自动生成
│       ├── common_matmul_kernel_half_layout00.cpp  # 自动生成
│       ├── common_matmul_kernel_half_layout01.cpp  # 自动生成
│       ├── common_matmul_kernel_half_layout10.cpp  # 自动生成
│       ├── common_matmul_kernel_half_layout11.cpp  # 自动生成
│       ├── ......
└── include
    ├── do_tiling_b16.h
    ├── dynamic_optimized_matmul.h
    ├── launch_map.h  # 自动生成
    ├── platform_info.h
    ├── select_kernel_b16.h
    ├── tiling_params.h
    └── utils.h
```

1.1 工程编译阶段
(1) 调用python脚本生成代码,具体包括调用各个模板的外围代码(即wrapper文件夹下的文件),以及launch_map.h(包含tilingKey与具体kernel启动函数、workspace大小计算函数之间的映射关系)。
例如common_matmul_kernel_half_layout00.cpp内容如下:
```cpp
#include "kernel/common_matmul_kernel.h"

void LaunchCommonMatmulKernelHalfLayout00(aclrtStream& stream, uint64_t fftsAddr,
    uint8_t* dA, uint8_t* dB, uint8_t* dC, uint8_t* dW, uint8_t* dTilingParams, TilingParams& tilingParams)
{
    using ArchTag = Catlass::Arch::AtlasA2;
    using ElementA = half;
    using ElementB = half;
    using ElementC = half;
    using LayoutA = Catlass::layout::RowMajor;
    using LayoutB = Catlass::layout::RowMajor;
    using LayoutC = Catlass::layout::RowMajor;
    LaunchCommonMatmulKernel<ArchTag, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC>(
        stream, fftsAddr, dA, dB, dC, dTilingParams, tilingParams);
}

size_t CommonMatmulKernelHalfLayout00GetWorkspaceSize(TilingParams& tilingParams)
{
    using ArchTag = Catlass::Arch::AtlasA2;
    using ElementA = half;
    using ElementB = half;
    using ElementC = half;
    using LayoutA = Catlass::layout::RowMajor;
    using LayoutB = Catlass::layout::RowMajor;
    using LayoutC = Catlass::layout::RowMajor;
    return CommonMatmulKernelGetWorkspaceSize<
        ArchTag, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC>(tilingParams);
}
```

生成的launch_map.h示例如下:
```cpp
#ifndef LAUNCH_MAP_H
#define LAUNCH_MAP_H

#include <unordered_map>
#include <string>
#include "acl/acl.h"
#include "tiling_params.h"

#define DECLARE_KERNEL_FUNC(kernelName) \
    void Launch##kernelName(aclrtStream&, uint64_t, uint8_t*, uint8_t*, uint8_t*, uint8_t*, uint8_t*, TilingParams&); \
    size_t kernelName##GetWorkspaceSize(TilingParams&);

DECLARE_KERNEL_FUNC(CommonMatmulKernelHalfLayout00)
DECLARE_KERNEL_FUNC(CommonMatmulKernelHalfLayout01)
DECLARE_KERNEL_FUNC(CommonMatmulKernelHalfLayout10)
DECLARE_KERNEL_FUNC(CommonMatmulKernelHalfLayout11)

std::unordered_map<uint64_t, void(*)(aclrtStream&, uint64_t, uint8_t*, uint8_t*, uint8_t*, uint8_t*, uint8_t*, TilingParams&)> launchKernelFuncMap = {
    { 0x0000000000000000, LaunchCommonMatmulKernelHalfLayout00 },
    { 0x0000000000000010, LaunchCommonMatmulKernelHalfLayout01 },
    { 0x0000000000000100, LaunchCommonMatmulKernelHalfLayout10 },
    { 0x0000000000000110, LaunchCommonMatmulKernelHalfLayout11 }
};

using GetWorkspaceFunc = size_t(*)(TilingParams& tilingParams);
std::unordered_map<uint64_t, GetWorkspaceFunc> getWorkspaceFuncMap = {
    { 0x0000000000000000, CommonMatmulKernelHalfLayout00GetWorkspaceSize },
    { 0x0000000000000010, CommonMatmulKernelHalfLayout01GetWorkspaceSize },
    { 0x0000000000000100, CommonMatmulKernelHalfLayout10GetWorkspaceSize },
    { 0x0000000000000110, CommonMatmulKernelHalfLayout11GetWorkspaceSize },
};

// only for print kernel Info
std::unordered_map<uint64_t, std::string> funcNameMap = {
    { 0x0000000000000000, "CommonMatmulKernelHalfLayout00" },
    { 0x0000000000000010, "CommonMatmulKernelHalfLayout01" },
    { 0x0000000000000100, "CommonMatmulKernelHalfLayout10" },
    { 0x0000000000000110, "CommonMatmulKernelHalfLayout11" }
};

#endif // LAUNCH_MAP_H
```

(2) 编译完成后产生两个文件,一个是二进制可执行文件output/bin/102_dynamic_optimized_matmul,一个是静态库文件output/shared_lib/lib/libdynamic_optimized_kernel.a。二进制文件会调用静态库文件。
1.2 运行流程
TilingKey内容如下:
```cpp
/*
 * Bit field layout description (little-endian):
 * -------------------------------------------------------------------------
 * | Bit Range | Size | Field Name            | Description                |
 * |-----------|------|-----------------------|----------------------------|
 * | 0-3       | 4    | layoutTagC            | Layout tag for C matrix    |
 * | 4-7       | 4    | layoutTagB            | Layout tag for B matrix    |
 * | 8-11      | 4    | layoutTagA            | Layout tag for A matrix    |
 * | 12-15     | 4    | paddingTagC           | Padding tag for C matrix   |
 * | 16-19     | 4    | paddingTagB           | Padding tag for B matrix   |
 * | 20-23     | 4    | paddingTagA           | Padding tag for A matrix   |
 * | 24-51     | 28   | reserveBit            | Reserved for future use    |
 * | 52-55     | 4    | dtype                 | Data type specification    |
 * | 56-63     | 8    | templateKernelSerial  | Template kernel serial ID  |
 * -------------------------------------------------------------------------
 */
union TilingKey {
    uint64_t value;
    struct {
        uint64_t layoutTagC : 4;            // 0-3
        uint64_t layoutTagB : 4;            // 4-7
        uint64_t layoutTagA : 4;            // 8-11
        uint64_t paddingTagC : 4;           // 12-15
        uint64_t paddingTagB : 4;           // 16-19
        uint64_t paddingTagA : 4;           // 20-23
        uint64_t reserveBit : 28;           // 24-51 May be used in the future
        uint64_t dtype : 4;                 // 52-55
        uint64_t templateKernelSerial : 8;  // 56-63
    } bits;
    ......
}
```

用DoTiling和SelectKernel后得到的信息设置TilingKey,根据TilingKey匹配对应的Matmul函数。
2 使用说明
```cpp
// 1.输入shape信息,构建tilingParams结构体。
TilingParams tilingParams{m, n, k, layoutTagA, layoutTagB, layoutTagC};

// 2.该函数包含两个阶段:
// (1)根据tilingParams中shape信息计算tiling参数。
// (2)根据shape信息和上一步得到的tiling参数进行模板选择。
DoTilingAndSelectKernel<fp16_t>(tilingParams, platformInfo);

// 3.打印tilingParams结构体参数。(可选)
PrintTilingParams<fp16_t>(tilingParams, platformInfo);

// 4.获取需要的workspace大小。
size_t workspaceSize = DynamicOptimizedMatmulGetWorkspace(tilingParams);

// 5.申请device侧空间。
ACL_CHECK(aclrtMalloc((void **)&dA, sizeA, ACL_MEM_MALLOC_HUGE_FIRST));
ACL_CHECK(aclrtMalloc((void **)&dB, sizeB, ACL_MEM_MALLOC_HUGE_FIRST));
ACL_CHECK(aclrtMalloc((void **)&dC, sizeC, ACL_MEM_MALLOC_HUGE_FIRST));
ACL_CHECK(aclrtMalloc((void **)&dTilingParams, sizeof(TilingParams), ACL_MEM_MALLOC_HUGE_FIRST));
if (workspaceSize > 0) {
    ACL_CHECK(aclrtMalloc((void **)&dW, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
}

// 6.拷贝TilingParams结构体数据到device空间。
ACL_CHECK(aclrtMemcpy(
    dTilingParams, sizeof(TilingParams), &tilingParams, sizeof(TilingParams), ACL_MEMCPY_HOST_TO_DEVICE));

// 7.获取fftsAddr。
uint64_t fftsAddr{0};
uint32_t fftsLen{0};
RT_CHECK(rtGetC2cCtrlAddr(&fftsAddr, &fftsLen));

// 8.执行Matmul计算。
ExecuteDynamicOptimizedMatmul(stream, fftsAddr, dA, dB, dC, dW, dTilingParams, tilingParams);
ACL_CHECK(aclrtSynchronizeStream(stream));

// 9.获取计算结果。
ACL_CHECK(aclrtMemcpy(hostC.data(), sizeC, dC, sizeC, ACL_MEMCPY_DEVICE_TO_HOST));
```

3 实现原理
3.1 DoTiling实现
这里DoTiling主要是根据shape(M、N、K、LayoutA、LayoutB)确定L1上的分块大小。
主要基于以下三个规则:
- 首先保证指令搬运带宽充分发挥。关键点为Stride方向的基本块参数需要512B对齐。
- 尽量做到负载均衡。
- 计算轮次尽量少。
3.2 SelectKernel实现
```cpp
bool PaddingMatmulB16Handler(TilingParams &params, PlatformInfo& platformInfo)
{
    uint8_t kernelSerial = 2;
    if (params.paddingTagA || params.paddingTagB || params.paddingTagC) {
        params.tilingKey.SetTilingKey(kernelSerial, params.layoutTagA, params.layoutTagB, 0,
            params.paddingTagA, params.paddingTagB, params.paddingTagC);
        return true;
    }
    return false;
}

bool CommonMatmulB16Handler(TilingParams &params, PlatformInfo& platformInfo)
{
    uint8_t kernelSerial = 0;
    uint32_t taskBlocks = CeilDiv(params.m, params.m1) * CeilDiv(params.n, params.n1);
    params.blockDim = taskBlocks > platformInfo.coreNum ? platformInfo.coreNum : taskBlocks;
    // kernelSerial, layoutTagA, layoutTagB, layoutTagC, paddingTagA, paddingTagB, paddingTagC, dtype(default 0).
    params.tilingKey.SetTilingKey(kernelSerial, params.layoutTagA, params.layoutTagB, 0, 0, 0, 0);
    return true;
}

using HandlerPtr = bool (*)(TilingParams& tilingParams, PlatformInfo& platformInfo);
HandlerPtr handlers[] = {
    SmallMatmulB16Handler,
    PaddingMultiCoreSplitkMatmulB16Handler,
    PaddingMatmulB16Handler,
    CommonMatmulB16Handler
};

for (auto handler : handlers) {
    if (handler(tilingParams, platformInfo)) {
        break;
    }
}
```

每个模板都设置了自己的适用条件。按特定顺序遍历各个模板:若当前shape满足某个模板的适用条件,就采用该模板进行计算;否则继续遍历下一个模板,直到找到能处理当前shape的模板。
【免费下载链接】catlass本项目是CANN的算子模板库,提供NPU上高性能矩阵乘及其相关融合类算子模板样例。项目地址: https://gitcode.com/cann/catlass
创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考