
Diary/2021-1-25

Vitis and HBM

Checking on real hardware, plus a bit of literature survey, of the bandwidth and latency when accessing HBM from HLS.


Results measured on an Alveo U50 come out roughly as follows.
With no particular care taken (if anything, deliberately spreading the arrays across separate HBM banks, which drags things down), reads/writes run at about 30 MB/s / 500 MB/s.
Widening the data path and specifying burst_length raises this to about 490 MB/s / 7050 MB/s.
Keeping the original bit width but splitting the AXI masters with bundle gives roughly 1170 MB/s / 1060 MB/s for read/write,
and taking care of both the bit width and the bundles reaches around 10 GB/s.


Using uBench (on an Alveo U280), I can also confirm reads and writes of roughly 10 GB/s.

A thoroughly naive version

A Hello World-style piece of code like this.

miyo@dev-8800:~/vitis_au50/vadd_2$ cat vadd.cpp
extern "C" {
    void vadd(int count,
              int* a_0, int* b_0, int* c_0
              );
}

void vadd(int count,
          int* a_0, int* b_0, int* c_0
          )
{
#pragma HLS INTERFACE s_axilite port=count bundle=control
//
#pragma HLS INTERFACE m_axi     port=a_0 offset=slave
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi     port=b_0 offset=slave
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi     port=c_0 offset=slave
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
//
#pragma HLS INTERFACE s_axilite port=return bundle=control

    for(int i = 0; i < count; i++){
#pragma HLS PIPELINE
        c_0[i] = a_0[i] + b_0[i];
    }
}
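
For context, this kernel gets driven from a host program through XRT's OpenCL interface. The sketch below is my own minimal version in the style of the Vitis examples (cl2.hpp), not the exact host code used for the measurements; the buffer size, platform lookup, and omitted error handling are my assumptions.

#define CL_HPP_CL_1_2_DEFAULT_BUILD
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
#include <CL/cl2.hpp>
#include <fstream>
#include <iterator>
#include <vector>

int main(int argc, char** argv) {
    const int count = 16 * 1024 * 1024;                 // number of int elements (assumed)
    std::vector<int> a(count, 1), b(count, 2), c(count, 0);

    // Pick the Xilinx platform and its first accelerator device.
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    cl::Platform xilinx;
    for (auto& p : platforms)
        if (p.getInfo<CL_PLATFORM_NAME>() == "Xilinx") xilinx = p;
    std::vector<cl::Device> devices;
    xilinx.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
    cl::Device device = devices[0];

    cl::Context context(device);
    cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);

    // Load the xclbin produced by v++ (path passed as argv[1]).
    std::ifstream f(argv[1], std::ios::binary);
    std::vector<unsigned char> xclbin((std::istreambuf_iterator<char>(f)),
                                      std::istreambuf_iterator<char>());
    cl::Program::Binaries bins{{xclbin.data(), xclbin.size()}};
    cl::Program program(context, {device}, bins);
    cl::Kernel krnl(program, "vadd");

    // Ordinary buffers; which HBM bank each one lands in is decided by the
    // sp= lines in design.cfg at link time, not here.
    cl::Buffer buf_a(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,  count * sizeof(int), a.data());
    cl::Buffer buf_b(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,  count * sizeof(int), b.data());
    cl::Buffer buf_c(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, count * sizeof(int), c.data());

    krnl.setArg(0, count);
    krnl.setArg(1, buf_a);
    krnl.setArg(2, buf_b);
    krnl.setArg(3, buf_c);

    q.enqueueMigrateMemObjects({buf_a, buf_b}, 0);                      // host -> HBM
    q.enqueueTask(krnl);                                                // run vadd once
    q.enqueueMigrateMemObjects({buf_c}, CL_MIGRATE_MEM_OBJECT_HOST);    // HBM -> host
    q.finish();
    return 0;
}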

I wanted to put each array in a different HBM bank, so design.cfg looks like this.

platform=xilinx_u50_gen3x16_xdma_201920_3
debug=1
profile_kernel=data:all:all:all
 
[connectivity]
nk=vadd:1:vadd_1
sp=vadd_1.a_0:HBM[0]
sp=vadd_1.b_0:HBM[1]
sp=vadd_1.c_0:HBM[2]

Then I looked at the run in the profiler.


So code this naive really is this slow...


Writing a slightly smarter version
#include <ap_int.h>

extern "C" {
    void vadd(int count,
              ap_uint<512>* a_0, ap_uint<512>* b_0, ap_uint<512>* c_0
              );
}

void vadd(int count,
          ap_uint<512>* a_0, ap_uint<512>* b_0, ap_uint<512>* c_0
          )
{
#pragma HLS INTERFACE s_axilite port=count bundle=control
//
#pragma HLS INTERFACE m_axi     port=a_0 offset=slave
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi     port=b_0 offset=slave
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi     port=c_0 offset=slave
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
//
#pragma HLS INTERFACE s_axilite port=return bundle=control

    ap_uint<512> tmp_a_0, tmp_b_0, tmp_c_0;
    int x[16];
    int y[16];
    int z[16];

    int num = count / 4; // got this wrong; should be count / 16

#pragma HLS DATAFLOW
    for(int i = 0; i < num; i++){
#pragma HLS PIPELINE II=1
        tmp_a_0 = a_0[i];
        tmp_b_0 = b_0[i];
        for(int j = 0; j < 16; j++){
            tmp_c_0(j*32+31, j*32) = tmp_a_0.range(j*32+31,j*32) + tmp_b_0.range(j*32+31,j*32);
        }
        c_0[i] = tmp_c_0;
    }

}

Changed from the original vadd.c so that memory is read and written in 512-bit units.
The Number of Transfers value looks wrong because the num calculation is wrong.
Even so, the throughput improved a little.
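
For the record, each 512-bit word packs 512/32 = 16 ints, so the trip count should have been computed as in the later versions:

    // 16 ints fit in one 512-bit word, so the number of 512-bit transfers is:
    int num = count / (512 / 32);   // = count / 16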


Specifying the burst length
#include <ap_int.h>
 
extern "C" {
    void vadd(int count,
              ap_uint<512>* a_0, ap_uint<512>* b_0, ap_uint<512>* c_0
              );
}

void vadd(int count,
          ap_uint<512>* a_0, ap_uint<512>* b_0, ap_uint<512>* c_0
          )
{
#pragma HLS INTERFACE s_axilite port=count bundle=control
//
#pragma HLS INTERFACE m_axi     port=a_0 offset=slave max_read_burst_length=16
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi     port=b_0 offset=slave max_read_burst_length=16
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi     port=c_0 offset=slave max_write_burst_length=16
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
//
#pragma HLS INTERFACE s_axilite port=return bundle=control

    ap_uint<512> tmp_a_0, tmp_b_0, tmp_c_0;

    int x[16];
    int y[16];
    int z[16];

    int num = count / (512/32);

    for(int i = 0; i < num; i++){
#pragma HLS unroll factor=16
#pragma HLS PIPELINE II=1
        tmp_a_0 = a_0[i];
        tmp_b_0 = b_0[i];
        for(int j = 0; j < 16; j++){
            tmp_c_0(j*32+31, j*32) = tmp_a_0.range(j*32+31,j*32) + tmp_b_0.range(j*32+31,j*32);
        }
        c_0[i] = tmp_c_0;
    }

}

Specified max_{read,write}_burst_length and, while I was at it, tried unrolling.
The write side got quite a bit faster, I think. The read side issues more transactions and did not improve. Disappointing.
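
One standard way to make bursts easier for Vitis HLS to infer, which I did not try in the runs above, is to stage data through local buffers: a plain sequential read loop per input, a compute loop, and a plain sequential write loop, each with PIPELINE II=1. Below is an untested sketch of that pattern; vadd_tiled and TILE are names I made up, it keeps the same shared m_axi port as the code above, and it assumes the word count is a multiple of TILE.

#include <ap_int.h>

#define TILE 256  // hypothetical tile size, in 512-bit words

extern "C" void vadd_tiled(int count,
                           ap_uint<512>* a_0, ap_uint<512>* b_0, ap_uint<512>* c_0)
{
#pragma HLS INTERFACE s_axilite port=count bundle=control
#pragma HLS INTERFACE m_axi     port=a_0 offset=slave max_read_burst_length=16
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi     port=b_0 offset=slave max_read_burst_length=16
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi     port=c_0 offset=slave max_write_burst_length=16
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control

    int num = count / (512 / 32);          // number of 512-bit words

    for (int base = 0; base < num; base += TILE) {
        ap_uint<512> buf_a[TILE], buf_b[TILE], buf_c[TILE];

read_a: for (int i = 0; i < TILE; i++) {
#pragma HLS PIPELINE II=1
            buf_a[i] = a_0[base + i];      // plain sequential reads -> burst-friendly
        }
read_b: for (int i = 0; i < TILE; i++) {
#pragma HLS PIPELINE II=1
            buf_b[i] = b_0[base + i];
        }
compute: for (int i = 0; i < TILE; i++) {
#pragma HLS PIPELINE II=1
            for (int j = 0; j < 16; j++) {
#pragma HLS UNROLL
                buf_c[i].range(j*32+31, j*32) =
                    buf_a[i].range(j*32+31, j*32) + buf_b[i].range(j*32+31, j*32);
            }
        }
write_c: for (int i = 0; i < TILE; i++) {
#pragma HLS PIPELINE II=1
            c_0[base + i] = buf_c[i];      // plain sequential writes -> burst-friendly
        }
    }
}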


Splitting the ports

I was told that having everything share the same m_axi port, as in the original code, is no good, so I split them with bundle.

extern "C" {
    void vadd(int count,
              int* a_0, int* b_0, int* c_0
              );
}

void vadd(int count,
          int* a_0, int* b_0, int* c_0
          )
{
#pragma HLS INTERFACE s_axilite port=count bundle=control
//
#pragma HLS INTERFACE m_axi     port=a_0 offset=slave bundle=gmem0
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi     port=b_0 offset=slave bundle=gmem1
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi     port=c_0 offset=slave bundle=gmem2
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
//
#pragma HLS INTERFACE s_axilite port=return bundle=control

    for(int i = 0; i < count; i++){
#pragma HLS PIPELINE
        c_0[i] = a_0[i] + b_0[i];
    }
}

That made it faster!!

Taking care of both bit width and bundles

A version that widens the data path (512 bits for now) and also remembers to split the ports with bundle.

#include <ap_int.h>

extern "C" {
    void vadd(int count,
              ap_uint<512>* a_0, ap_uint<512>* b_0, ap_uint<512>* c_0
              );
}

void vadd(int count,
          ap_uint<512>* a_0, ap_uint<512>* b_0, ap_uint<512>* c_0
          )
{
#pragma HLS INTERFACE s_axilite port=count bundle=control
//
#pragma HLS INTERFACE m_axi     port=a_0 offset=slave bundle=gmem0
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi     port=b_0 offset=slave bundle=gmem1
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi     port=c_0 offset=slave bundle=gmem2
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
//
#pragma HLS INTERFACE s_axilite port=return bundle=control

    ap_uint<512> tmp_a_0, tmp_b_0, tmp_c_0;
    int x[16];
    int y[16];
    int z[16];

    int num = count / 16;

#pragma HLS DATAFLOW
    for(int i = 0; i < num; i++){
#pragma HLS PIPELINE II=1
        tmp_a_0 = a_0[i];
        tmp_b_0 = b_0[i];
        for(int j = 0; j < 16; j++){
            tmp_c_0(j*32+31, j*32) = tmp_a_0.range(j*32+31,j*32) + tmp_b_0.range(j*32+31,j*32);
        }
        c_0[i] = tmp_c_0;
    }
}

Both reads and writes got close to 10 GB/s!!

Taking care of both bit width and bundles, plus specifying burst_length

#include <ap_int.h>

extern "C" {
    void vadd(int count,
              ap_uint<512>* a_0, ap_uint<512>* b_0, ap_uint<512>* c_0
              );
}

void vadd(int count,
          ap_uint<512>* a_0, ap_uint<512>* b_0, ap_uint<512>* c_0
          )
{
#pragma HLS INTERFACE s_axilite port=count bundle=control
//
#pragma HLS INTERFACE m_axi     port=a_0 offset=slave max_read_burst_length=16 bundle=gmem0
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi     port=b_0 offset=slave max_read_burst_length=16 bundle=gmem1
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi     port=c_0 offset=slave max_write_burst_length=16 bundle=gmem2
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
//
#pragma HLS INTERFACE s_axilite port=return bundle=control

    ap_uint<512> tmp_a_0, tmp_b_0, tmp_c_0;

    int x[16];
    int y[16];
    int z[16];

    int num = count / (512/32);

    for(int i = 0; i < num; i++){
#pragma HLS unroll factor=16
#pragma HLS PIPELINE II=1
        tmp_a_0 = a_0[i];
        tmp_b_0 = b_0[i];
        for(int j = 0; j < 16; j++){
            tmp_c_0(j*32+31, j*32) = tmp_a_0.range(j*32+31,j*32) + tmp_b_0.range(j*32+31,j*32);
        }
        c_0[i] = tmp_c_0;
    }

}

In this case it seems there was no need to spell it out; 16 is already the default max_{read,write}_burst_length for m_axi ports, so these pragmas do not change anything.

uBench

Measured with ubench from uBench.
As far as I can tell from the code, it measures best-case ("champion") numbers: reads only, or writes only, streamed continuously.
For the record, the execution environment here is an Alveo U280.
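
To make concrete what I mean by a streaming, best-case measurement: the kernel shape I have in mind is something like the sketch below. This is my own guess, not uBench's actual code (the run below uses 2 ports of 512 bits; the sketch uses a single port for brevity), and read_bw is a made-up name.

#include <ap_int.h>

extern "C" void read_bw(ap_uint<512>* in, ap_uint<512>* sum_out, int num)
{
#pragma HLS INTERFACE m_axi     port=in      offset=slave bundle=gmem0
#pragma HLS INTERFACE s_axilite port=in      bundle=control
#pragma HLS INTERFACE m_axi     port=sum_out offset=slave bundle=gmem1
#pragma HLS INTERFACE s_axilite port=sum_out bundle=control
#pragma HLS INTERFACE s_axilite port=num     bundle=control
#pragma HLS INTERFACE s_axilite port=return  bundle=control

    ap_uint<512> sum = 0;
    for (int i = 0; i < num; i++) {
#pragma HLS PIPELINE II=1
        sum ^= in[i];       // sequential, read-only streaming access
    }
    *sum_out = sum;         // one write, so the reads are not optimized away
}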

~/uBench/ubench/off-chip_bandwidth/read/HBM/2ports_512bit$ make check TARGET=hw DEVICE=xilinx_u280-es1_xdma_201910_1 HOST_ARCH=x86 SYSROOT=/
g++ -I../../../../..//common/includes/xcl2 -pthread -I/opt/xilinx/xrt/include -I/tools/Xilinx/Vivado/2019.2/include -Wall -O0 -g -std=c++11 -fmessage-length=0 ../../../../..//common/includes/xcl2/xcl2.cpp src/host.cpp src/krnl_config.h  -o 'ubench'  -L/opt/xilinx/xrt/lib -lOpenCL -lpthread  -lrt -lstdc++ 
./ubench ./build_dir.hw.xilinx_u280-es1_xdma_201910_1/ubench.xclbin
Found Platform
Platform Name: Xilinx
INFO: Reading ./build_dir.hw.xilinx_u280-es1_xdma_201910_1/ubench.xclbin
Loading: './build_dir.hw.xilinx_u280-es1_xdma_201910_1/ubench.xclbin'
Trying to program device[0]: xilinx_u280-es1_xdma_201910_1
Device[0]: program successful!
Creating a kernel [krnl_ubench:{krnl_ubench_1}] for CU(1)
Execution time = 0.00300834
Payload Size: 3.8147e-06MB - Bandwidth = 6.80775GB/s
Execution time = 0.00419801
Payload Size: 3.8147e-06MB - Bandwidth = 9.75701GB/s
Execution time = 0.00746985
Payload Size: 3.8147e-06MB - Bandwidth = 10.9668GB/s
Execution time = 0.014406
Payload Size: 3.8147e-06MB - Bandwidth = 11.3731GB/s
Execution time = 0.029443
Payload Size: 3.8147e-06MB - Bandwidth = 11.1293GB/s
Execution time = 0.0565577
Payload Size: 3.8147e-06MB - Bandwidth = 11.5875GB/s
Execution time = 0.11245
Payload Size: 3.8147e-06MB - Bandwidth = 11.656GB/s
Execution time = 0.224376
Payload Size: 3.8147e-06MB - Bandwidth = 11.6833GB/s
Execution time = 0.44852
Payload Size: 3.8147e-06MB - Bandwidth = 11.6893GB/s
Execution time = 0.896489
Payload Size: 3.8147e-06MB - Bandwidth = 11.6965GB/s
Execution time = 1.79279
Payload Size: 3.8147e-06MB - Bandwidth = 11.6977GB/s
perf_analyze profile -i profile_summary.csv -f html
ERROR: Could not open file profile_summary.csv
Makefile:130: recipe for target 'check' failed
make: *** [check] Error 5

~/uBench/ubench/off-chip_bandwidth/write/HBM/2ports_512bit$ make check TARGET=hw DEVICE=xilinx_u280-es1_xdma_201910_1 HOST_ARCH=x86 SYSROOT=/
g++ -I../../../../..//common/includes/xcl2 -pthread -I/opt/xilinx/xrt/include -I/tools/Xilinx/Vivado/2019.2/include -Wall -O0 -g -std=c++11 -fmessage-length=0 ../../../../..//common/includes/xcl2/xcl2.cpp src/host.cpp src/krnl_config.h  -o 'ubench'  -L/opt/xilinx/xrt/lib -lOpenCL -lpthread  -lrt -lstdc++ 
./ubench ./build_dir.hw.xilinx_u280-es1_xdma_201910_1/ubench.xclbin
Found Platform
Platform Name: Xilinx
INFO: Reading ./build_dir.hw.xilinx_u280-es1_xdma_201910_1/ubench.xclbin
Loading: './build_dir.hw.xilinx_u280-es1_xdma_201910_1/ubench.xclbin'
Trying to program device[0]: xilinx_u280-es1_xdma_201910_1
Device[0]: program successful!
Creating a kernel [krnl_ubench:{krnl_ubench_1}] for CU(1)
Creating a kernel [krnl_ubench:{krnl_ubench_2}] for CU(2)
Execution time = 0.00226169
Payload Size: 7.62939e-06MB - Bandwidth = 9.05517GB/s
Execution time = 0.00379374
Payload Size: 7.62939e-06MB - Bandwidth = 10.7967GB/s
Execution time = 0.0079286
Payload Size: 7.62939e-06MB - Bandwidth = 10.3322GB/s
Execution time = 0.0145442
Payload Size: 7.62939e-06MB - Bandwidth = 11.2649GB/s
Execution time = 0.02898
Payload Size: 7.62939e-06MB - Bandwidth = 11.3071GB/s
Execution time = 0.058234
Payload Size: 7.62939e-06MB - Bandwidth = 11.2539GB/s
Execution time = 0.116943
Payload Size: 7.62939e-06MB - Bandwidth = 11.2082GB/s
Execution time = 0.230614
Payload Size: 7.62939e-06MB - Bandwidth = 11.3672GB/s
Execution time = 0.463979
Payload Size: 7.62939e-06MB - Bandwidth = 11.2998GB/s
Execution time = 0.92165
Payload Size: 7.62939e-06MB - Bandwidth = 11.3772GB/s
Execution time = 1.84582
Payload Size: 7.62939e-06MB - Bandwidth = 11.3616GB/s
perf_analyze profile -i profile_summary.csv -f html
ERROR: Could not open file profile_summary.csv
Makefile:130: recipe for target 'check' failed
make: *** [check] Error 5

~/uBench/ubench/off-chip_latency/HBM/32bit_per_access$ make check TARGET=hw DEVICE=xilinx_u280-es1_xdma_201910_1 HOST_ARCH=x86 SYSROOT=/
g++ -I../../../..//common/includes/xcl2 -pthread -I/opt/xilinx/xrt/include -I/tools/Xilinx/Vivado/2019.2/include -Wall -O0 -g -std=c++11 -fmessage-length=0 ../../../..//common/includes/xcl2/xcl2.cpp src/host.cpp src/krnl_config.h  -o 'ubench'  -L/opt/xilinx/xrt/lib -lOpenCL -lpthread  -lrt -lstdc++
./ubench ./build_dir.hw.xilinx_u280-es1_xdma_201910_1/ubench.xclbin
Found Platform
Platform Name: Xilinx
INFO: Reading ./build_dir.hw.xilinx_u280-es1_xdma_201910_1/ubench.xclbin
Loading: './build_dir.hw.xilinx_u280-es1_xdma_201910_1/ubench.xclbin'
Trying to program device[0]: xilinx_u280-es1_xdma_201910_1
Device[0]: program successful!
Creating a kernel [krnl_ubench:{krnl_ubench_1}] for CU(1)
Execution time = 0.00582883
Payload Size: 6.10352e-05MB - Latency = 0.109799GB/s
Execution time = 0.0113186
Payload Size: 0.00012207MB - Latency = 0.113088GB/s
Execution time = 0.0226948
Payload Size: 0.000244141MB - Latency = 0.112801GB/s
Execution time = 0.0449585
Payload Size: 0.000488281MB - Latency = 0.113883GB/s
Execution time = 0.0901793
Payload Size: 0.000976562MB - Latency = 0.113552GB/s
Execution time = 0.183912
Payload Size: 0.00195312MB - Latency = 0.111357GB/s
Execution time = 0.367893
Payload Size: 0.00390625MB - Latency = 0.111337GB/s
Execution time = 0.736642
Payload Size: 0.0078125MB - Latency = 0.111207GB/s
Execution time = 1.50362
Payload Size: 0.015625MB - Latency = 0.108963GB/s
Execution time = 3.21045
Payload Size: 0.03125MB - Latency = 0.102067GB/s
Execution time = 6.54912
Payload Size: 0.0625MB - Latency = 0.100068GB/s
Execution time = 13.2363
Payload Size: 0.125MB - Latency = 0.0990244GB/s
Execution time = 26.6355
Payload Size: 0.25MB - Latency = 0.0984188GB/s
Execution time = 53.4147
Payload Size: 0.5MB - Latency = 0.0981542GB/s
Execution time = 106.987
Payload Size: 1MB - Latency = 0.09801GB/s
perf_analyze profile -i profile_summary.csv -f html
ERROR: Could not open file profile_summary.csv
Makefile:130: recipe for target 'check' failed
make: *** [check] Error 5