/*
* Broadcast B8 Performance Benchmark
*
* Purpose: Compare B8 vs B16 broadcast performance on C310 architecture
* for non-tail axis broadcast with small last dimension.
*
* Expected result: B8 should be faster or comparable to B16 (half data size),
* but due to GatherWrapper exclusion (broadcast_c310_impl.h:1168),
* B8 falls back to a naive loop and is significantly slower.
*/
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include "acl/acl.h"
#include "kernel_operator.h"
constexpr uint32_t CYCLENUMS = 1000;
// ============================================================================
// Kernel: Broadcast with BroadcastTiling API (the path used on C310)
// ============================================================================
// Benchmark kernel: stages src from GM to UB, runs one AscendC::Broadcast via
// the BroadcastTiling API (the code path under test on C310), then copies the
// result back to GM.
//   T    : element type under test (uint8_t for B8, half for B16)
//   RANK : rank of the shape arrays passed to Process()
template <typename T, uint32_t RANK>
class KernelBroadcastPerf {
public:
    __aicore__ inline KernelBroadcastPerf() {}
    // Bind GM buffers and allocate one UB buffer per queue.
    // srcSize / dstSize are element counts; both are rounded UP to a whole
    // number of data blocks so DataCopy moves block-aligned chunks.
    __aicore__ inline void Init(GM_ADDR src_gm, GM_ADDR dst_gm,
        uint32_t srcSize, uint32_t dstSize)
    {
        // elements per data block for this element type
        constexpr uint32_t alignSize = AscendC::GetDataBlockSizeInBytes() / sizeof(T);
        srcSizeAligned = (srcSize + alignSize - 1) / alignSize * alignSize;
        dstSizeAligned = (dstSize + alignSize - 1) / alignSize * alignSize;
        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(src_gm), srcSizeAligned);
        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(dst_gm), dstSizeAligned);
        pipe.InitBuffer(inQueue, 1, srcSizeAligned * sizeof(T));
        pipe.InitBuffer(outQueue, 1, dstSizeAligned * sizeof(T));
    }
    // One timed iteration: GM->UB copy, Broadcast(srcShape -> dstShape),
    // UB->GM copy. The EnQue/DeQue pairs insert the required pipeline
    // synchronization between copy-in, compute, and copy-out stages; do not
    // reorder them.
    __aicore__ inline void Process(const uint32_t srcShape[RANK],
        const uint32_t dstShape[RANK])
    {
        AscendC::LocalTensor<T> srcLocal = inQueue.AllocTensor<T>();
        AscendC::DataCopy(srcLocal, srcGlobal, srcSizeAligned);
        inQueue.EnQue(srcLocal);
        srcLocal = inQueue.DeQue<T>();
        AscendC::LocalTensor<T> dstLocal = outQueue.AllocTensor<T>();
        // Tiling is recomputed every call; that cost is part of what the
        // benchmark measures (it is the same path real operators use).
        AscendC::BroadcastTiling tiling;
        AscendC::GetBroadcastTilingInfo<T>(RANK, dstShape, srcShape, false, tiling);
        AscendC::Broadcast(dstLocal, srcLocal, dstShape, srcShape, &tiling);
        outQueue.EnQue<T>(dstLocal);
        inQueue.FreeTensor(srcLocal);
        dstLocal = outQueue.DeQue<T>();
        AscendC::DataCopy(dstGlobal, dstLocal, dstSizeAligned);
        outQueue.FreeTensor(dstLocal);
    }
private:
    AscendC::GlobalTensor<T> srcGlobal;
    AscendC::GlobalTensor<T> dstGlobal;
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueue;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueue;
    uint32_t srcSizeAligned;  // src element count rounded up to a data block
    uint32_t dstSizeAligned;  // dst element count rounded up to a data block
};
// ============================================================================
// Kernel entry points
// ============================================================================
// Entry point for the B8 (uint8_t) case:
// broadcast [outer, 1, last] -> [outer, bcast, last].
__global__ __vector__ void broadcast_b8(GM_ADDR src, GM_ADDR dst,
    uint32_t outerDim, uint32_t broadcastDim, uint32_t lastDim)
{
    const uint32_t shapeIn[3] = {outerDim, 1u, lastDim};
    const uint32_t shapeOut[3] = {outerDim, broadcastDim, lastDim};
    const uint32_t elemsIn = outerDim * lastDim;
    const uint32_t elemsOut = outerDim * broadcastDim * lastDim;
    KernelBroadcastPerf<uint8_t, 3> perfKernel;
    perfKernel.Init(src, dst, elemsIn, elemsOut);
    perfKernel.Process(shapeIn, shapeOut);
}
// Entry point for the B16 (half) reference case:
// broadcast [outer, 1, last] -> [outer, bcast, last].
__global__ __vector__ void broadcast_b16(GM_ADDR src, GM_ADDR dst,
    uint32_t outerDim, uint32_t broadcastDim, uint32_t lastDim)
{
    const uint32_t shapeIn[3] = {outerDim, 1u, lastDim};
    const uint32_t shapeOut[3] = {outerDim, broadcastDim, lastDim};
    const uint32_t elemsIn = outerDim * lastDim;
    const uint32_t elemsOut = outerDim * broadcastDim * lastDim;
    KernelBroadcastPerf<half, 3> perfKernel;
    perfKernel.Init(src, dst, elemsIn, elemsOut);
    perfKernel.Process(shapeIn, shapeOut);
}
// ============================================================================
// Host
// ============================================================================
// One benchmark configuration: broadcast [outer, 1, last] -> [outer, bcast, last].
struct BenchCase {
    const char *name;       // human-readable label printed with the result row
    uint32_t outerDim;      // leading (non-broadcast) axis
    uint32_t broadcastDim;  // middle axis, expanded from 1 to this size
    uint32_t lastDim;       // innermost (tail) axis, copied unchanged
};
// Runs one benchmark case: 10 warmup launches + CYCLENUMS timed launches of
// the B8 kernel, then the same for the B16 kernel, and prints the B8/B16 wall
// time ratio. Buffers are sized for the larger (half) element type and shared
// by both kernels; the B8 kernel simply uses the front portion of each buffer.
// Fix vs. original: aclrtMalloc results are now checked -- on failure the case
// is skipped instead of launching kernels with null device pointers, and a
// partially-allocated src buffer is freed.
void RunBenchmark(aclrtStream stream, BenchCase c)
{
    uint32_t dstElems = c.outerDim * c.broadcastDim * c.lastDim;
    size_t maxSrcBytes = (size_t)c.outerDim * c.lastDim * sizeof(half);
    size_t maxDstBytes = (size_t)dstElems * sizeof(half);
    uint8_t *srcDevice = nullptr;
    uint8_t *dstDevice = nullptr;
    if (aclrtMalloc((void **)&srcDevice, maxSrcBytes, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
        printf(" %-42s | SKIPPED: aclrtMalloc(src, %zu bytes) failed\n", c.name, maxSrcBytes);
        return;
    }
    if (aclrtMalloc((void **)&dstDevice, maxDstBytes, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
        printf(" %-42s | SKIPPED: aclrtMalloc(dst, %zu bytes) failed\n", c.name, maxDstBytes);
        aclrtFree(srcDevice);  // do not leak the src buffer
        return;
    }
    // Fill src with a non-zero pattern so broadcasts move real data.
    aclrtMemset(srcDevice, maxSrcBytes, 0x01, maxSrcBytes);
    // ---- B8 ----
    for (int i = 0; i < 10; ++i) {  // warmup: exclude first-launch overheads
        broadcast_b8<<<1, nullptr, stream>>>(srcDevice, dstDevice,
            c.outerDim, c.broadcastDim, c.lastDim);
    }
    aclrtSynchronizeStream(stream);
    auto startB8 = std::chrono::steady_clock::now();
    for (uint32_t i = 0; i < CYCLENUMS; ++i) {
        broadcast_b8<<<1, nullptr, stream>>>(srcDevice, dstDevice,
            c.outerDim, c.broadcastDim, c.lastDim);
    }
    aclrtSynchronizeStream(stream);  // drain the stream before reading the clock
    auto endB8 = std::chrono::steady_clock::now();
    double timeB8 = std::chrono::duration<double, std::milli>(endB8 - startB8).count();
    // ---- B16 ----
    for (int i = 0; i < 10; ++i) {  // warmup
        broadcast_b16<<<1, nullptr, stream>>>(srcDevice, dstDevice,
            c.outerDim, c.broadcastDim, c.lastDim);
    }
    aclrtSynchronizeStream(stream);
    auto startB16 = std::chrono::steady_clock::now();
    for (uint32_t i = 0; i < CYCLENUMS; ++i) {
        broadcast_b16<<<1, nullptr, stream>>>(srcDevice, dstDevice,
            c.outerDim, c.broadcastDim, c.lastDim);
    }
    aclrtSynchronizeStream(stream);
    auto endB16 = std::chrono::steady_clock::now();
    double timeB16 = std::chrono::duration<double, std::milli>(endB16 - startB16).count();
    // ratio > 1 means B8 is slower than B16 despite moving half the bytes.
    double ratio = timeB8 / timeB16;
    uint32_t iters = c.outerDim * c.broadcastDim;
    printf(" %-42s | iters=%6u | B8: %8.2f ms | B16: %8.2f ms | ratio: %5.2fx\n",
        c.name, iters, timeB8, timeB16, ratio);
    fflush(stdout);
    aclrtFree(srcDevice);
    aclrtFree(dstDevice);
}
// ============================================================================
// Main
// ============================================================================
int32_t main()
{
int32_t deviceId = 0;
aclInit(nullptr);
aclrtSetDevice(deviceId);
aclrtStream stream = nullptr;
aclrtCreateStream(&stream);
printf("=====================================================================\n");
printf(" Broadcast B8 vs B16 Performance Benchmark (C310)\n");
printf(" Pattern: [outer, 1, lastDim] -> [outer, bcast, lastDim]\n");
printf(" Iterations per case: %u\n", CYCLENUMS);
printf(" UB budget: B16 dst <= 48KB (24K half elements)\n");
printf("=====================================================================\n");
fflush(stdout);
// =========================================================================
// UB budget: keep B16 dst <= 48KB = 24576 half elements
// Formula: outer * bcast * lastDim <= 24576
//
// Code path thresholds (broadcast_c310_impl.h):
// B16 GatherWrapper: lastDim < 64 (VF_LEN_HALF for B16)
// B8 GatherWrapper: ALWAYS EXCLUDED
// Both naive loop: 64 <= lastDim < 128
// Both aligned L/S: lastDim >= 128 (block-aligned)
// =========================================================================
// =============================================
// TEST 1: lastDim sweep
// Fixed bcast=32. Adjust outer to stay within UB.
// outer = min(128, 24576 / (32 * lastDim))
// =============================================
printf("\n[TEST 1] lastDim sweep (bcast=32, outer adjusted for UB)\n");
printf(" Expect: ratio > 1 when lastDim < 64, ratio ~ 1 when lastDim >= 64\n");
fflush(stdout);
BenchCase test1[] = {
// lastDim < 64: B16 uses GatherWrapper, B8 uses naive loop
{"lastDim=1 [128,1,1]->[128,32,1]", 128, 32, 1}, // B16 dst=8KB
{"lastDim=2 [128,1,2]->[128,32,2]", 128, 32, 2}, // B16 dst=16KB
{"lastDim=3 [128,1,3]->[128,32,3]", 128, 32, 3}, // B16 dst=24KB
{"lastDim=4 [128,1,4]->[128,32,4]", 128, 32, 4}, // B16 dst=32KB
{"lastDim=8 [64,1,8]->[64,32,8]", 64, 32, 8}, // B16 dst=32KB
{"lastDim=16 [32,1,16]->[32,32,16]", 32, 32, 16}, // B16 dst=32KB
{"lastDim=32 [16,1,32]->[16,32,32]", 16, 32, 32}, // B16 dst=32KB
{"lastDim=48 [10,1,48]->[10,32,48]", 10, 32, 48}, // B16 dst=30KB
// --- B16 GatherWrapper boundary at lastDim=64 ---
{"lastDim=63 [8,1,63]->[8,32,63]", 8, 32, 63}, // B16 dst=32KB
{"lastDim=64 [8,1,64]->[8,32,64]", 8, 32, 64}, // B16 dst=32KB
// --- Both use naive loop or aligned path ---
{"lastDim=96 [4,1,96]->[4,32,96]", 4, 32, 96}, // B16 dst=24KB
{"lastDim=128 [4,1,128]->[4,32,128]", 4, 32, 128}, // B16 dst=32KB
{"lastDim=256 [2,1,256]->[2,32,256]", 2, 32, 256}, // B16 dst=32KB
};
for (auto &c : test1) {
RunBenchmark(stream, c);
}
// =============================================
// TEST 2: outer scaling (lastDim=4, bcast=8)
// Show degradation scales with data size.
// All fit UB: outer*32*3*2 = outer*192 bytes for B16
// Max outer for 48KB: 48*1024/192 = 256
// =============================================
printf("\n[TEST 2] outer scaling (lastDim=3, bcast=32)\n");
printf(" Expect: ratio grows as outer increases\n");
fflush(stdout);
BenchCase test2[] = {
{"outer=128 [128,1,4]->[128,8,4]", 128, 8, 4},
{"outer=256 [256,1,4]->[256,8,4]", 256, 8, 4},
{"outer=378 [378,1,4]->[378,8,4]", 378, 8, 4},
{"outer=512 [512,1,4]->[512,8,4]", 512, 8, 4},
};
for (auto &c : test2) {
RunBenchmark(stream, c);
}
for (auto &c : test3) {
RunBenchmark(stream, c);
}
printf("\n=====================================================================\n");
printf(" Summary:\n");
printf(" TEST 1: expect performance cliff at lastDim=64 boundary\n");
printf(" TEST 2: expect ratio grows with outer (more naive loop iters)\n");
printf(" TEST 3: expect ratio grows with bcast (more naive loop iters)\n");
printf(" Root cause: broadcast_c310_impl.h:1168\n");
printf(" sizeof(T) != sizeof(uint8_t) excludes B8 from GatherWrapper\n");
printf("=====================================================================\n");
fflush(stdout);
aclrtDestroyStream(stream);
aclrtResetDevice(deviceId);
aclFinalize();
return 0;
}
// call
// NOTE(review): excerpt of the proposed call site inside
// broadcast_c310_impl.h -- not compilable standalone here (stride/sizeI and
// the enclosing function are defined in that file). It dispatches the new
// srcStride1 == 0 broadcast fast path vs. the existing unaligned copy path.
if (stride[1] == 0) {
    BrcNlastLessThanVLUnalignedBrc<T>(dstUb, srcUb, sizeI[0], sizeI[1], sizeI[2], stride[0]);
} else {
    BrcNlastLessThanVLUnaligned<T>(dstUb, srcUb, sizeI[0], sizeI[1], sizeI[2], stride[0], stride[1]);
}
// add
// Proposed fast path: for each of size0 source rows, load one row of size2
// elements (size2 < vector length) and store it size1 times into dstUb --
// i.e. broadcast along the middle axis using VF unaligned load/store.
// NOTE(review): every StoreUnAlign call passes the same dstUb base --
// presumably the UnalignReg (ureg1) accumulates the running destination
// offset so successive stores append; confirm against the Reg::StoreUnAlign
// contract. Also ureg1 is consumed without any store-side "pre" init
// (only LoadUnAlignPre exists for ureg0) -- verify that is the intended
// idiom before merging.
template <typename T>
__simd_vf__ inline void BrcNlastLessThanVLUnalignedBrc(
    __ubuf__ T* dstUb, __ubuf__ T* srcUb, uint16_t size0, uint16_t size1, uint16_t size2, uint16_t srcStride0)
{
    Reg::RegTensor<T> srcReg;       // holds the current source row
    Reg::UnalignReg ureg0, ureg1;   // ureg0: load state, ureg1: store state
    for (uint16_t i = 0; i < size0; ++i) {
        // srcStride0 is the element stride between consecutive source rows
        auto srcUbT = srcUb + i * srcStride0;
        Reg::LoadUnAlignPre(ureg0, srcUbT);
        Reg::LoadUnAlign(srcReg, ureg0, srcUbT, size2);
        // replicate the row along the broadcast axis
        for (uint16_t j = 0; j < size1; ++j) {
            Reg::StoreUnAlign(dstUb, srcReg, ureg1, size2);
        }
    }
    // flush any residual partial block held in the store unalign register
    Reg::StoreUnAlignPost(dstUb, ureg1, 0);
}
// (captured web-page footer, commented out so the file compiles)
// For further actions, you may consider blocking this person and/or reporting abuse
// Top comments (0)