トップ 差分 一覧 Farm ソース 検索 ヘルプ PDF RSS ログイン

Diary/2021-1-26

VitisとHBM(続)

複数HBMリージョンにアクセスしてみる.bundleに気をつけて,
27個のHBMリージョンにアクセス.
ちなみに,もう一組増やして30個利用しようとしたらP&Rで配線できないというエラーがでた.

// C-linkage prototype of the vadd kernel: one element count followed by
// nine (a_k, b_k, c_k) pointer triples, k = 0..8. Declaring it with C
// linkage keeps the symbol name unmangled ("vadd").
extern "C" void vadd(int count,
                     int* a_0, int* b_0, int* c_0,
                     int* a_1, int* b_1, int* c_1,
                     int* a_2, int* b_2, int* c_2,
                     int* a_3, int* b_3, int* c_3,
                     int* a_4, int* b_4, int* c_4,
                     int* a_5, int* b_5, int* c_5,
                     int* a_6, int* b_6, int* c_6,
                     int* a_7, int* b_7, int* c_7,
                     int* a_8, int* b_8, int* c_8);

/**
 * Element-wise integer addition over nine independent lanes.
 *
 * For every lane k in 0..8 and every index in [0, count):
 *     c_k[index] = a_k[index] + b_k[index]
 * A count of zero or less leaves every output array untouched.
 *
 * Interface pragmas: each of the 27 pointer arguments gets its own m_axi
 * bundle (gmem0 .. gmem26) and is also attached to the shared s_axilite
 * bundle "control", as are count and the return port.
 *
 * @param count  number of elements processed in each lane; every array is
 *               indexed from 0 to count - 1
 * @param a_0 .. a_8  first operand array of each lane
 * @param b_0 .. b_8  second operand array of each lane
 * @param c_0 .. c_8  result array of each lane
 */
void vadd(int count,
          int* a_0, int* b_0, int* c_0,
          int* a_1, int* b_1, int* c_1,
          int* a_2, int* b_2, int* c_2,
          int* a_3, int* b_3, int* c_3,
          int* a_4, int* b_4, int* c_4,
          int* a_5, int* b_5, int* c_5,
          int* a_6, int* b_6, int* c_6,
          int* a_7, int* b_7, int* c_7,
          int* a_8, int* b_8, int* c_8)
{
#pragma HLS INTERFACE s_axilite port=count bundle=control
// Lane 0: a_0 -> gmem0, b_0 -> gmem1, c_0 -> gmem2
#pragma HLS INTERFACE m_axi     port=a_0 offset=slave bundle=gmem0
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi     port=b_0 offset=slave bundle=gmem1
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi     port=c_0 offset=slave bundle=gmem2
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
// Lane 1: a_1 -> gmem3, b_1 -> gmem4, c_1 -> gmem5
#pragma HLS INTERFACE m_axi     port=a_1 offset=slave bundle=gmem3
#pragma HLS INTERFACE s_axilite port=a_1 bundle=control
#pragma HLS INTERFACE m_axi     port=b_1 offset=slave bundle=gmem4
#pragma HLS INTERFACE s_axilite port=b_1 bundle=control
#pragma HLS INTERFACE m_axi     port=c_1 offset=slave bundle=gmem5
#pragma HLS INTERFACE s_axilite port=c_1 bundle=control
// Lane 2: a_2 -> gmem6, b_2 -> gmem7, c_2 -> gmem8
#pragma HLS INTERFACE m_axi     port=a_2 offset=slave bundle=gmem6
#pragma HLS INTERFACE s_axilite port=a_2 bundle=control
#pragma HLS INTERFACE m_axi     port=b_2 offset=slave bundle=gmem7
#pragma HLS INTERFACE s_axilite port=b_2 bundle=control
#pragma HLS INTERFACE m_axi     port=c_2 offset=slave bundle=gmem8
#pragma HLS INTERFACE s_axilite port=c_2 bundle=control
// Lane 3: a_3 -> gmem9, b_3 -> gmem10, c_3 -> gmem11
#pragma HLS INTERFACE m_axi     port=a_3 offset=slave bundle=gmem9
#pragma HLS INTERFACE s_axilite port=a_3 bundle=control
#pragma HLS INTERFACE m_axi     port=b_3 offset=slave bundle=gmem10
#pragma HLS INTERFACE s_axilite port=b_3 bundle=control
#pragma HLS INTERFACE m_axi     port=c_3 offset=slave bundle=gmem11
#pragma HLS INTERFACE s_axilite port=c_3 bundle=control
// Lane 4: a_4 -> gmem12, b_4 -> gmem13, c_4 -> gmem14
#pragma HLS INTERFACE m_axi     port=a_4 offset=slave bundle=gmem12
#pragma HLS INTERFACE s_axilite port=a_4 bundle=control
#pragma HLS INTERFACE m_axi     port=b_4 offset=slave bundle=gmem13
#pragma HLS INTERFACE s_axilite port=b_4 bundle=control
#pragma HLS INTERFACE m_axi     port=c_4 offset=slave bundle=gmem14
#pragma HLS INTERFACE s_axilite port=c_4 bundle=control
// Lane 5: a_5 -> gmem15, b_5 -> gmem16, c_5 -> gmem17
#pragma HLS INTERFACE m_axi     port=a_5 offset=slave bundle=gmem15
#pragma HLS INTERFACE s_axilite port=a_5 bundle=control
#pragma HLS INTERFACE m_axi     port=b_5 offset=slave bundle=gmem16
#pragma HLS INTERFACE s_axilite port=b_5 bundle=control
#pragma HLS INTERFACE m_axi     port=c_5 offset=slave bundle=gmem17
#pragma HLS INTERFACE s_axilite port=c_5 bundle=control
// Lane 6: a_6 -> gmem18, b_6 -> gmem19, c_6 -> gmem20
#pragma HLS INTERFACE m_axi     port=a_6 offset=slave bundle=gmem18
#pragma HLS INTERFACE s_axilite port=a_6 bundle=control
#pragma HLS INTERFACE m_axi     port=b_6 offset=slave bundle=gmem19
#pragma HLS INTERFACE s_axilite port=b_6 bundle=control
#pragma HLS INTERFACE m_axi     port=c_6 offset=slave bundle=gmem20
#pragma HLS INTERFACE s_axilite port=c_6 bundle=control
// Lane 7: a_7 -> gmem21, b_7 -> gmem22, c_7 -> gmem23
#pragma HLS INTERFACE m_axi     port=a_7 offset=slave bundle=gmem21
#pragma HLS INTERFACE s_axilite port=a_7 bundle=control
#pragma HLS INTERFACE m_axi     port=b_7 offset=slave bundle=gmem22
#pragma HLS INTERFACE s_axilite port=b_7 bundle=control
#pragma HLS INTERFACE m_axi     port=c_7 offset=slave bundle=gmem23
#pragma HLS INTERFACE s_axilite port=c_7 bundle=control
// Lane 8: a_8 -> gmem24, b_8 -> gmem25, c_8 -> gmem26
#pragma HLS INTERFACE m_axi     port=a_8 offset=slave bundle=gmem24
#pragma HLS INTERFACE s_axilite port=a_8 bundle=control
#pragma HLS INTERFACE m_axi     port=b_8 offset=slave bundle=gmem25
#pragma HLS INTERFACE s_axilite port=b_8 bundle=control
#pragma HLS INTERFACE m_axi     port=c_8 offset=slave bundle=gmem26
#pragma HLS INTERFACE s_axilite port=c_8 bundle=control
// The return port is attached to the same s_axilite bundle.
#pragma HLS INTERFACE s_axilite port=return bundle=control
    // Each iteration handles one element of every lane, lane by lane
    // (both operands are read before that lane's result is written).
    for (int idx = 0; idx < count; ++idx) {
#pragma HLS PIPELINE
        const int sum_0 = a_0[idx] + b_0[idx];
        c_0[idx] = sum_0;
        const int sum_1 = a_1[idx] + b_1[idx];
        c_1[idx] = sum_1;
        const int sum_2 = a_2[idx] + b_2[idx];
        c_2[idx] = sum_2;
        const int sum_3 = a_3[idx] + b_3[idx];
        c_3[idx] = sum_3;
        const int sum_4 = a_4[idx] + b_4[idx];
        c_4[idx] = sum_4;
        const int sum_5 = a_5[idx] + b_5[idx];
        c_5[idx] = sum_5;
        const int sum_6 = a_6[idx] + b_6[idx];
        c_6[idx] = sum_6;
        const int sum_7 = a_7[idx] + b_7[idx];
        c_7[idx] = sum_7;
        const int sum_8 = a_8[idx] + b_8[idx];
        c_8[idx] = sum_8;
    }
}

こんなコードを用意.design.cfgは,

platform=xilinx_u50_gen3x16_xdma_201920_3
debug=1
profile_kernel=data:all:all:all
 
[connectivity]
nk=vadd:1:vadd_1
sp=vadd_1.a_0:HBM[0]
sp=vadd_1.b_0:HBM[1]
sp=vadd_1.c_0:HBM[2]
sp=vadd_1.a_1:HBM[3]
sp=vadd_1.b_1:HBM[4]
sp=vadd_1.c_1:HBM[5]
sp=vadd_1.a_2:HBM[6]
sp=vadd_1.b_2:HBM[7]
sp=vadd_1.c_2:HBM[8]
sp=vadd_1.a_3:HBM[9]
sp=vadd_1.b_3:HBM[10]
sp=vadd_1.c_3:HBM[11]
sp=vadd_1.a_4:HBM[12]
sp=vadd_1.b_4:HBM[13]
sp=vadd_1.c_4:HBM[14]
sp=vadd_1.a_5:HBM[15]
sp=vadd_1.b_5:HBM[16]
sp=vadd_1.c_5:HBM[17]
sp=vadd_1.a_6:HBM[18]
sp=vadd_1.b_6:HBM[19]
sp=vadd_1.c_6:HBM[20]
sp=vadd_1.a_7:HBM[21]
sp=vadd_1.b_7:HBM[22]
sp=vadd_1.c_7:HBM[23]
sp=vadd_1.a_8:HBM[24]
sp=vadd_1.b_8:HBM[25]
sp=vadd_1.c_8:HBM[26]

ビルドして実行.

という感じでHBMがたくさん並んだ.
リソース使用量はLUTとRegisterがそれぞれ6,453と9,417.BRAMとDSPは使ってない.

実行時のメモリバンド幅をみてみると,それぞれ1GBpsを超える性能がでてる.よかった.

512bit幅で読み書きするように修正.

   // 512-bit variant of the lane-0 loop: one loop iteration handles 16
   // 32-bit fields packed into a single ap_uint<512> word.
   // NOTE(review): a_0/b_0/c_0 are presumably ap_uint<512>* in this variant
   // (their declaration is not part of this excerpt) — confirm.
   // Integer division: when count is not a multiple of 16, the remaining
   // count % 16 elements are not covered by the loop below.
   const int num = count / 16;

   ap_uint<512> tmp_a_0, tmp_b_0, tmp_c_0;
   for(int i = 0; i < num; i++){

#pragma HLS PIPELINE II=1

       // Read one word from each input.
       tmp_a_0 = a_0[i];
       tmp_b_0 = b_0[i];
       // Add field by field; field j occupies bits [j*32+31 : j*32].
       for(int j = 0; j < 16; j++){
           tmp_c_0(j*32+31, j*32) = tmp_a_0.range(j*32+31,j*32) + tmp_b_0.range(j*32+31,j*32);
       }
       // All 16 fields of tmp_c_0 have been assigned; write the word back.
       c_0[i] = tmp_c_0;
   }

バンド幅は8GBps〜10.5GBpsという感じ.

リソース使用量は,LUTとレジスタが,それぞれ,49,778個と111,190個.
またBRAMを207個(15.4%相当)利用している.

複数のHBMリージョンを使う.

C++コードはかわらず,たとえば,

// C-linkage prototype of the single-lane vadd kernel (count plus one
// a_0/b_0/c_0 pointer triple); C linkage keeps the symbol name unmangled.
extern "C" void vadd(int count, int* a_0, int* b_0, int* c_0);

/**
 * Element-wise integer addition: c_0[index] = a_0[index] + b_0[index]
 * for every index in [0, count). A count of zero or less leaves c_0
 * untouched.
 *
 * Interface pragmas: a_0, b_0 and c_0 use the m_axi bundles gmem0, gmem1
 * and gmem2 respectively; all arguments and the return port are attached
 * to the s_axilite bundle "control".
 *
 * @param count  number of elements to process
 * @param a_0    first operand array
 * @param b_0    second operand array
 * @param c_0    result array
 */
void vadd(int count,
          int* a_0, int* b_0, int* c_0
          )
{
#pragma HLS INTERFACE s_axilite port=count bundle=control
// a_0 -> gmem0, b_0 -> gmem1, c_0 -> gmem2
#pragma HLS INTERFACE m_axi     port=a_0 offset=slave bundle=gmem0
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi     port=b_0 offset=slave bundle=gmem1
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi     port=c_0 offset=slave bundle=gmem2
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
// The return port is attached to the same s_axilite bundle.
#pragma HLS INTERFACE s_axilite port=return bundle=control

    // Both operands are read before the result is written.
    for (int idx = 0; idx < count; ++idx) {
#pragma HLS PIPELINE
        const int sum_0 = a_0[idx] + b_0[idx];
        c_0[idx] = sum_0;
    }
}

で,design.cfgで,

platform=xilinx_u50_gen3x16_xdma_201920_3
debug=1
profile_kernel=data:all:all:all
 
[connectivity]
nk=vadd:1:vadd_1
sp=vadd_1.a_0:HBM[0:1]
sp=vadd_1.b_0:HBM[2:3]
sp=vadd_1.c_0:HBM[4:5]

とかする.

という感じに複数のHBMリージョンをぶらさげることができる.