DEV Community

gaoshuer
gaoshuer

Posted on • Edited on

TestCase

/*
 * Broadcast B8 Performance Benchmark
 *
 * Purpose: Compare B8 vs B16 broadcast performance on C310 architecture
 *          for non-tail axis broadcast with small last dimension.
 *
 * Expected result: B8 should be faster or comparable to B16 (half data size),
 *                  but due to GatherWrapper exclusion (broadcast_c310_impl.h:1168),
 *                  B8 falls back to a naive loop and is significantly slower.
 */

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include "acl/acl.h"
#include "kernel_operator.h"

constexpr uint32_t CYCLENUMS = 1000;

// ============================================================================
// Kernel: Broadcast with BroadcastTiling API (the path used on C310)
// ============================================================================
template <typename T, uint32_t RANK>
class KernelBroadcastPerf {
public:
    __aicore__ inline KernelBroadcastPerf() {}

    // Bind GM buffers and reserve UB queue storage. Element counts are
    // rounded up to the data-block granularity for T so DataCopy moves
    // whole blocks.
    __aicore__ inline void Init(GM_ADDR src_gm, GM_ADDR dst_gm,
                                 uint32_t srcSize, uint32_t dstSize)
    {
        constexpr uint32_t elemsPerBlock = AscendC::GetDataBlockSizeInBytes() / sizeof(T);
        srcSizeAligned = ((srcSize + elemsPerBlock - 1) / elemsPerBlock) * elemsPerBlock;
        dstSizeAligned = ((dstSize + elemsPerBlock - 1) / elemsPerBlock) * elemsPerBlock;
        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(src_gm), srcSizeAligned);
        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(dst_gm), dstSizeAligned);
        pipe.InitBuffer(inQueue, 1, srcSizeAligned * sizeof(T));
        pipe.InitBuffer(outQueue, 1, dstSizeAligned * sizeof(T));
    }

    // Copy src GM->UB, run Broadcast on it, copy the result UB->GM.
    // The tiling is recomputed on every call, so it is part of the
    // measured path.
    __aicore__ inline void Process(const uint32_t srcShape[RANK],
                                    const uint32_t dstShape[RANK])
    {
        AscendC::LocalTensor<T> input = inQueue.AllocTensor<T>();
        AscendC::DataCopy(input, srcGlobal, srcSizeAligned);
        inQueue.EnQue(input);

        input = inQueue.DeQue<T>();
        AscendC::LocalTensor<T> output = outQueue.AllocTensor<T>();

        AscendC::BroadcastTiling tiling;
        AscendC::GetBroadcastTilingInfo<T>(RANK, dstShape, srcShape, false, tiling);
        AscendC::Broadcast(output, input, dstShape, srcShape, &tiling);

        outQueue.EnQue<T>(output);
        inQueue.FreeTensor(input);

        output = outQueue.DeQue<T>();
        AscendC::DataCopy(dstGlobal, output, dstSizeAligned);
        outQueue.FreeTensor(output);
    }

private:
    AscendC::GlobalTensor<T> srcGlobal;
    AscendC::GlobalTensor<T> dstGlobal;
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueue;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueue;
    uint32_t srcSizeAligned;  // element count, block-aligned
    uint32_t dstSizeAligned;  // element count, block-aligned
};

// ============================================================================
// Kernel entry points
// ============================================================================
// B8 path: broadcast [outer, 1, last] -> [outer, bcast, last] on uint8_t.
__global__ __vector__ void broadcast_b8(GM_ADDR src, GM_ADDR dst,
    uint32_t outerDim, uint32_t broadcastDim, uint32_t lastDim)
{
    uint32_t inShape[3]  = {outerDim, 1u, lastDim};
    uint32_t outShape[3] = {outerDim, broadcastDim, lastDim};

    KernelBroadcastPerf<uint8_t, 3> kernel;
    kernel.Init(src, dst, outerDim * lastDim, outerDim * broadcastDim * lastDim);
    kernel.Process(inShape, outShape);
}

// B16 path: broadcast [outer, 1, last] -> [outer, bcast, last] on half.
__global__ __vector__ void broadcast_b16(GM_ADDR src, GM_ADDR dst,
    uint32_t outerDim, uint32_t broadcastDim, uint32_t lastDim)
{
    uint32_t inShape[3]  = {outerDim, 1u, lastDim};
    uint32_t outShape[3] = {outerDim, broadcastDim, lastDim};

    KernelBroadcastPerf<half, 3> kernel;
    kernel.Init(src, dst, outerDim * lastDim, outerDim * broadcastDim * lastDim);
    kernel.Process(inShape, outShape);
}

// ============================================================================
// Host
// ============================================================================
// One benchmark shape: broadcast [outerDim, 1, lastDim] -> [outerDim, broadcastDim, lastDim].
// Fields are positional (aggregate-initialized in the test tables below).
struct BenchCase {
    const char *name;       // printable label shown in the results table
    uint32_t outerDim;      // leading, non-broadcast axis
    uint32_t broadcastDim;  // middle axis, expanded from 1 to this size
    uint32_t lastDim;       // innermost axis, unchanged (non-tail broadcast)
};

// Run one benchmark case: time CYCLENUMS launches of the B8 kernel and of the
// B16 kernel on the same shape, then print both wall-clock totals and their
// ratio (B8/B16; >1 means B8 is slower). Timing is host-side (steady_clock)
// around the launch loop with a stream sync at the end, so per-launch overhead
// is included; each timed loop is preceded by 10 warmup launches.
//
// Buffers are sized for the wider element type (half) so both kernels can
// share them. Fix over the original: aclrtMalloc results are checked — a
// failed allocation now skips the case instead of launching kernels on an
// invalid pointer.
void RunBenchmark(aclrtStream stream, BenchCase c)
{
    uint32_t dstElems = c.outerDim * c.broadcastDim * c.lastDim;
    size_t srcBytes = (size_t)c.outerDim * c.lastDim * sizeof(half);
    size_t dstBytes = (size_t)dstElems * sizeof(half);

    uint8_t *srcDevice = nullptr;
    uint8_t *dstDevice = nullptr;
    if (aclrtMalloc((void **)&srcDevice, srcBytes, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS ||
        aclrtMalloc((void **)&dstDevice, dstBytes, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
        fprintf(stderr, "  %s: aclrtMalloc failed, case skipped\n", c.name);
        if (srcDevice != nullptr) {
            aclrtFree(srcDevice);  // dstDevice cannot be allocated here
        }
        return;
    }
    // Fill src with a nonzero byte pattern; dst is left as-is (outputs are
    // overwritten, not validated — this is a pure timing benchmark).
    aclrtMemset(srcDevice, srcBytes, 0x01, srcBytes);

    // ---- B8: warmup, then timed loop ----
    for (int i = 0; i < 10; ++i) {
        broadcast_b8<<<1, nullptr, stream>>>(srcDevice, dstDevice,
            c.outerDim, c.broadcastDim, c.lastDim);
    }
    aclrtSynchronizeStream(stream);

    auto startB8 = std::chrono::steady_clock::now();
    for (uint32_t i = 0; i < CYCLENUMS; ++i) {
        broadcast_b8<<<1, nullptr, stream>>>(srcDevice, dstDevice,
            c.outerDim, c.broadcastDim, c.lastDim);
    }
    aclrtSynchronizeStream(stream);
    auto endB8 = std::chrono::steady_clock::now();
    double timeB8 = std::chrono::duration<double, std::milli>(endB8 - startB8).count();

    // ---- B16: warmup, then timed loop ----
    for (int i = 0; i < 10; ++i) {
        broadcast_b16<<<1, nullptr, stream>>>(srcDevice, dstDevice,
            c.outerDim, c.broadcastDim, c.lastDim);
    }
    aclrtSynchronizeStream(stream);

    auto startB16 = std::chrono::steady_clock::now();
    for (uint32_t i = 0; i < CYCLENUMS; ++i) {
        broadcast_b16<<<1, nullptr, stream>>>(srcDevice, dstDevice,
            c.outerDim, c.broadcastDim, c.lastDim);
    }
    aclrtSynchronizeStream(stream);
    auto endB16 = std::chrono::steady_clock::now();
    double timeB16 = std::chrono::duration<double, std::milli>(endB16 - startB16).count();

    double ratio = timeB8 / timeB16;
    uint32_t iters = c.outerDim * c.broadcastDim;  // outer-loop trips per launch

    printf("  %-42s | iters=%6u | B8: %8.2f ms | B16: %8.2f ms | ratio: %5.2fx\n",
           c.name, iters, timeB8, timeB16, ratio);
    fflush(stdout);

    aclrtFree(srcDevice);
    aclrtFree(dstDevice);
}

// ============================================================================
// Main
// ============================================================================
int32_t main()
{
    int32_t deviceId = 0;
    aclInit(nullptr);
    aclrtSetDevice(deviceId);
    aclrtStream stream = nullptr;
    aclrtCreateStream(&stream);

    printf("=====================================================================\n");
    printf("  Broadcast B8 vs B16 Performance Benchmark (C310)\n");
    printf("  Pattern: [outer, 1, lastDim] -> [outer, bcast, lastDim]\n");
    printf("  Iterations per case: %u\n", CYCLENUMS);
    printf("  UB budget: B16 dst <= 48KB (24K half elements)\n");
    printf("=====================================================================\n");
    fflush(stdout);

    // =========================================================================
    // UB budget: keep B16 dst <= 48KB = 24576 half elements
    // Formula: outer * bcast * lastDim <= 24576
    //
    // Code path thresholds (broadcast_c310_impl.h):
    //   B16 GatherWrapper:  lastDim < 64  (VF_LEN_HALF for B16)
    //   B8  GatherWrapper:  ALWAYS EXCLUDED
    //   Both naive loop:    64 <= lastDim < 128
    //   Both aligned L/S:   lastDim >= 128 (block-aligned)
    // =========================================================================

    // =============================================
    // TEST 1: lastDim sweep
    //   Fixed bcast=32. Adjust outer to stay within UB.
    //   outer = min(128, 24576 / (32 * lastDim))
    // =============================================
    printf("\n[TEST 1] lastDim sweep (bcast=32, outer adjusted for UB)\n");
    printf("  Expect: ratio > 1 when lastDim < 64, ratio ~ 1 when lastDim >= 64\n");
    fflush(stdout);
    BenchCase test1[] = {
        // lastDim < 64: B16 uses GatherWrapper, B8 uses naive loop
        {"lastDim=1   [128,1,1]->[128,32,1]",      128, 32,   1},  // B16 dst=8KB
        {"lastDim=2   [128,1,2]->[128,32,2]",      128, 32,   2},  // B16 dst=16KB
        {"lastDim=3   [128,1,3]->[128,32,3]",      128, 32,   3},  // B16 dst=24KB
        {"lastDim=4   [128,1,4]->[128,32,4]",      128, 32,   4},  // B16 dst=32KB
        {"lastDim=8   [64,1,8]->[64,32,8]",         64, 32,   8},  // B16 dst=32KB
        {"lastDim=16  [32,1,16]->[32,32,16]",       32, 32,  16},  // B16 dst=32KB
        {"lastDim=32  [16,1,32]->[16,32,32]",       16, 32,  32},  // B16 dst=32KB
        {"lastDim=48  [10,1,48]->[10,32,48]",       10, 32,  48},  // B16 dst=30KB
        // --- B16 GatherWrapper boundary at lastDim=64 ---
        {"lastDim=63  [8,1,63]->[8,32,63]",          8, 32,  63},  // B16 dst=32KB
        {"lastDim=64  [8,1,64]->[8,32,64]",          8, 32,  64},  // B16 dst=32KB
        // --- Both use naive loop or aligned path ---
        {"lastDim=96  [4,1,96]->[4,32,96]",          4, 32,  96},  // B16 dst=24KB
        {"lastDim=128 [4,1,128]->[4,32,128]",        4, 32, 128},  // B16 dst=32KB
        {"lastDim=256 [2,1,256]->[2,32,256]",        2, 32, 256},  // B16 dst=32KB
    };
    for (auto &c : test1) {
        RunBenchmark(stream, c);
    }

    // =============================================
    // TEST 2: outer scaling (lastDim=4, bcast=8)
    //   Show degradation scales with data size.
    //   All fit UB: outer*32*3*2 = outer*192 bytes for B16
    //   Max outer for 48KB: 48*1024/192 = 256
    // =============================================
    printf("\n[TEST 2] outer scaling (lastDim=3, bcast=32)\n");
    printf("  Expect: ratio grows as outer increases\n");
    fflush(stdout);
    BenchCase test2[] = {
        {"outer=128  [128,1,4]->[128,8,4]",        128, 8, 4},
        {"outer=256  [256,1,4]->[256,8,4]",        256, 8, 4},
        {"outer=378  [378,1,4]->[378,8,4]",        378, 8, 4},
        {"outer=512  [512,1,4]->[512,8,4]",        512, 8, 4},
    };
    for (auto &c : test2) {
        RunBenchmark(stream, c);
    }

    for (auto &c : test3) {
        RunBenchmark(stream, c);
    }

    printf("\n=====================================================================\n");
    printf("  Summary:\n");
    printf("  TEST 1: expect performance cliff at lastDim=64 boundary\n");
    printf("  TEST 2: expect ratio grows with outer (more naive loop iters)\n");
    printf("  TEST 3: expect ratio grows with bcast (more naive loop iters)\n");
    printf("  Root cause: broadcast_c310_impl.h:1168\n");
    printf("    sizeof(T) != sizeof(uint8_t) excludes B8 from GatherWrapper\n");
    printf("=====================================================================\n");
    fflush(stdout);

    aclrtDestroyStream(stream);
    aclrtResetDevice(deviceId);
    aclFinalize();
    return 0;
}


// call
        // Dispatch fragment (enclosing function not shown): stride[1] == 0
        // selects the new *Brc variant, which drops the second stride —
        // presumably meaning the same source row is reused for every output
        // row along the broadcast axis. TODO(review): confirm against the
        // code that produces `stride`.
        if (stride[1] == 0) {
            BrcNlastLessThanVLUnalignedBrc<T>(dstUb, srcUb, sizeI[0], sizeI[1], sizeI[2], stride[0]);
        } else {
            BrcNlastLessThanVLUnaligned<T>(dstUb, srcUb, sizeI[0], sizeI[1], sizeI[2], stride[0], stride[1]);
        }

// add
// Proposed kernel: for each of size0 outer rows, load one unaligned source
// row of size2 elements (< one vector length) and store it size1 times —
// i.e. broadcast the row along the middle axis.
//
// NOTE(review): `dstUb` is passed unchanged to every StoreUnAlign call and is
// never advanced across the size0*size1 store iterations. Unless StoreUnAlign
// advances the destination through ureg1's internal state, every store lands
// at the same address — confirm against the Reg unaligned-store API before
// merging.
// NOTE(review): ureg1 is used by StoreUnAlign before any explicit
// initialization is visible here — verify that is the intended protocol
// (LoadUnAlignPre initializes ureg0 for the load side only).
template <typename T>
__simd_vf__ inline void BrcNlastLessThanVLUnalignedBrc(
    __ubuf__ T* dstUb, __ubuf__ T* srcUb, uint16_t size0, uint16_t size1, uint16_t size2, uint16_t srcStride0)
{
    Reg::RegTensor<T> srcReg;
    Reg::UnalignReg ureg0, ureg1;

    for (uint16_t i = 0; i < size0; ++i) {
        auto srcUbT = srcUb + i * srcStride0;
        // Unaligned load of one size2-element source row into srcReg.
        Reg::LoadUnAlignPre(ureg0, srcUbT);
        Reg::LoadUnAlign(srcReg, ureg0, srcUbT, size2);
        // Repeat the row size1 times along the broadcast axis.
        for (uint16_t j = 0; j < size1; ++j) {
            Reg::StoreUnAlign(dstUb, srcReg, ureg1, size2);
        }
    }
    // Flush any residual partially-filled store word tracked by ureg1.
    Reg::StoreUnAlignPost(dstUb, ureg1, 0);
}

Enter fullscreen mode Exit fullscreen mode

Top comments (0)