diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2018-01-25 18:43:57 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2018-01-25 18:43:57 +0300 |
commit | c2ae707d5b4c5e12bc3b99a09db37ba6b2261fcb (patch) | |
tree | bfdae1f311480b8bb98e3265dd25768356950616 | |
parent | 7d239425d030318281d32d152b2d0d20848a7602 (diff) | |
parent | fee148295545cc7cdfac4039314349d235e57204 (diff) |
Merge ../marian.master
-rw-r--r-- | contrib/fpga/kernels/OutputLayer.cl | 88 |
1 files changed, 77 insertions, 11 deletions
diff --git a/contrib/fpga/kernels/OutputLayer.cl b/contrib/fpga/kernels/OutputLayer.cl index 72030e41..cbe20c43 100644 --- a/contrib/fpga/kernels/OutputLayer.cl +++ b/contrib/fpga/kernels/OutputLayer.cl @@ -1,13 +1,79 @@ -#pragma once - -__kernel void square( - __global float* input, - __global float* output, - const unsigned int count) -{ - int i = get_global_id(0); - if(i < count) - output[i] = input[i] * input[i]; -} +#ifndef EMULATOR +#define EMULATOR 0 +#endif +#define VOCABSIZE 85120 //good multiple of 16 and 128 +#define LAYER_DIM 512 // assuming to be multiple of 16 + +#define P 16 //should be multiple 16 for B loading logic to work +#define TILECOUNT (VOCABSIZE / P); //VOCABSIZE will be a good multiple of P + +#define WLOADTIME (P * LAYER_DIM) >> 4 //using float16 +#define BLOADTIME P >> 4 //using float16 + +__attribute__((max_global_work_dim(0))) +__kernel void OutputLayer_float( + __global float * restrict W, + __global float * restrict X, + __global float * restrict B, + __global float * restrict Y, + unsigned batchsize, + ) +{ +#if EMULATOR == 1 + printf("OpenCL: OutputLayer_float, batchsize=%d \n",batchsize); +#endif + + __global volatile float16* restrict ddr_access_pointer; + __global volatile float16* restrict Wpointer_prev; + __global volatile float16* restrict Bpointer_prev; + + float Wlocal[P][LAYER_DIM]; + float Blocal[P]; + + + Wpointer_prev = (__global volatile float16 *)W; + Bpointer_prev = (__global volatile float16 *)B; + + for (unsigned tile=0; tile < TILECOUNT; tile++) { + ddr_access_pointer = (__global volatile float16 *)Wpointer_prev; + + unsigned wr_index=0; + //fetch W and B to local + for (unsigned i=0; i < (WLOADTIME + BLOADTIME); i++) { + + float16 temp_val = *ddr_access_pointer; + if (i < WLOADTIME) { + #pragma unroll + for (char u=0; u < 16; u++) { + Wlocal[wr_index >> 5][(wr_index & 0x1F)*16+u]=temp_val[u]; // good for LAYER_DIM 512 (512/16=32) + } + wr_index++; + } + else { + #pragma unroll + for (char u=0; u < 16; u++) { + Blocal[(wr_index*16+u]=temp_val[u]; // good for P as a multiple of 16 + } + wr_index++; + } + ddr_access_pointer++; + + if (i==(WLOADTIME-1)) { //we should keep track of W for the next batch + Wpointer_prev = ddr_access_pointer; + ddr_access_pointer = (__global volatile float16 *)Bpointer_prev; //would byte aligning be a problem? + wr_index = 0; + } + } + + //do the matrix multiplication of tile with X + + + + } + + +} + + |