Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <hieuhoang@gmail.com>2018-01-25 18:43:57 +0300
committerHieu Hoang <hieuhoang@gmail.com>2018-01-25 18:43:57 +0300
commitc2ae707d5b4c5e12bc3b99a09db37ba6b2261fcb (patch)
treebfdae1f311480b8bb98e3265dd25768356950616
parent7d239425d030318281d32d152b2d0d20848a7602 (diff)
parentfee148295545cc7cdfac4039314349d235e57204 (diff)
Merge ../marian.master
-rw-r--r--contrib/fpga/kernels/OutputLayer.cl88
1 files changed, 77 insertions, 11 deletions
diff --git a/contrib/fpga/kernels/OutputLayer.cl b/contrib/fpga/kernels/OutputLayer.cl
index 72030e41..cbe20c43 100644
--- a/contrib/fpga/kernels/OutputLayer.cl
+++ b/contrib/fpga/kernels/OutputLayer.cl
@@ -1,13 +1,79 @@
-#pragma once
-
-__kernel void square(
- __global float* input,
- __global float* output,
- const unsigned int count)
-{
- int i = get_global_id(0);
- if(i < count)
- output[i] = input[i] * input[i];
-}
+#ifndef EMULATOR
+#define EMULATOR 0
+#endif
+#define VOCABSIZE 85120 //good multiple of 16 and 128
+#define LAYER_DIM 512 // assuming to be multiple of 16
+
+#define P 16 //should be multiple 16 for B loading logic to work
+#define TILECOUNT (VOCABSIZE / P); //VOCABSIZE will be a good multiple of P
+
+#define WLOADTIME (P * LAYER_DIM) >> 4 //using float16
+#define BLOADTIME P >> 4 //using float16
+
+__attribute__((max_global_work_dim(0)))
+__kernel void OutputLayer_float(
+ __global float * restrict W,
+ __global float * restrict X,
+ __global float * restrict B,
+ __global float * restrict Y,
+ unsigned batchsize,
+ )
+{
+#if EMULATOR == 1
+ printf("OpenCL: OutputLayer_float, batchsize=%d \n",batchsize);
+#endif
+
+ __global volatile float16* restrict ddr_access_pointer;
+ __global volatile float16* restrict Wpointer_prev;
+ __global volatile float16* restrict Bpointer_prev;
+
+ float Wlocal[P][LAYER_DIM];
+ float Blocal[P];
+
+
+ Wpointer_prev = (__global volatile float16 *)W;
+ Bpointer_prev = (__global volatile float16 *)B;
+
+ for (unsigned tile=0; tile < TILECOUNT; tile++) {
+ ddr_access_pointer = (__global volatile float16 *)Wpointer_prev;
+
+ unsigned wr_index=0;
+ //fetch W and B to local
+ for (unsigned i=0; i < (WLOADTIME + BLOADTIME); i++) {
+
+ float16 temp_val = *ddr_access_pointer;
+ if (i < WLOADTIME) {
+ #pragma unroll
+ for (char u=0; u < 16; u++) {
+ Wlocal[wr_index >> 5][(wr_index & 0x1F)*16+u]=temp_val[u]; // good for LAYER_DIM 512 (512/16=32)
+ }
+ wr_index++;
+ }
+ else {
+ #pragma unroll
+ for (char u=0; u < 16; u++) {
+ Blocal[(wr_index*16+u]=temp_val[u]; // good for P as a multiple of 16
+ }
+ wr_index++;
+ }
+ ddr_access_pointer++;
+
+ if (i==(WLOADTIME-1)) { //we should keep track of W for the next batch
+ Wpointer_prev = ddr_access_pointer;
+ ddr_access_pointer = (__global volatile float16 *)Bpointer_prev; //would byte aligning be a problem?
+ wr_index = 0;
+ }
+ }
+
+ //do the matrix multiplication of tile with X
+
+
+
+ }
+
+
+}
+
+