Merge ../marian.master

author: Hieu Hoang <hieuhoang@gmail.com> 2018-01-25 18:43:57 +0300
committer: Hieu Hoang <hieuhoang@gmail.com> 2018-01-25 18:43:57 +0300
commit: c2ae707d5b4c5e12bc3b99a09db37ba6b2261fcb (patch)
tree: bfdae1f311480b8bb98e3265dd25768356950616
parent: 7d239425d030318281d32d152b2d0d20848a7602 (diff)
parent: fee148295545cc7cdfac4039314349d235e57204 (diff)
1 files changed, 77 insertions, 11 deletions
diff --git a/contrib/fpga/kernels/OutputLayer.cl b/contrib/fpga/kernels/OutputLayer.cl
index 72030e41..cbe20c43 100644
--- a/contrib/fpga/kernels/OutputLayer.cl
+++ b/contrib/fpga/kernels/OutputLayer.cl
@@ -1,13 +1,79 @@
-#pragma once
-
-__kernel void square(                                                    
-   __global float* input, 
-   __global float* output,
-   const unsigned int count)
-{                           
-   int i = get_global_id(0);
-   if(i < count)            
-       output[i] = input[i] * input[i];
-}                                      
+#ifndef EMULATOR
+#define EMULATOR 0
+#endif
 
+#define VOCABSIZE 85120  //good multiple of 16 and 128
+#define LAYER_DIM 512 // assuming to be multiple of 16
+
+#define P 16 //should be multiple 16 for B loading logic to work
+#define TILECOUNT (VOCABSIZE / P); //VOCABSIZE will be a good multiple of P 
+
+#define WLOADTIME (P * LAYER_DIM) >> 4 //using float16
+#define BLOADTIME P >> 4 //using float16
+
+__attribute__((max_global_work_dim(0)))
+__kernel void OutputLayer_float(
+				__global float * restrict W,
+				__global float * restrict X,
+				__global float * restrict B,
+				__global float * restrict Y,
+				unsigned batchsize,
+                  )
+{
+#if EMULATOR == 1
+    printf("OpenCL: OutputLayer_float, batchsize=%d \n",batchsize);
+#endif
+
+	__global volatile float16* restrict ddr_access_pointer;
+	__global volatile float16* restrict Wpointer_prev;
+	__global volatile float16* restrict Bpointer_prev;
+
+    float Wlocal[P][LAYER_DIM];
+    float Blocal[P];
+
+
+	Wpointer_prev = (__global volatile float16 *)W;
+	Bpointer_prev = (__global volatile float16 *)B;
+	
+	for (unsigned tile=0; tile < TILECOUNT; tile++) {
+		ddr_access_pointer = (__global volatile float16 *)Wpointer_prev;
+	
+		unsigned wr_index=0;
+		//fetch W and B to local
+		for (unsigned i=0; i < (WLOADTIME + BLOADTIME); i++) {
+	
+			float16 temp_val = *ddr_access_pointer;
+			if (i < WLOADTIME) {
+				#pragma unroll 
+				for (char u=0; u < 16; u++) {
+					Wlocal[wr_index >> 5][(wr_index & 0x1F)*16+u]=temp_val[u]; // good for LAYER_DIM 512 (512/16=32)
+				}
+				wr_index++;
+			}
+			else {
+				#pragma unroll 
+				for (char u=0; u < 16; u++) {
+					Blocal[(wr_index*16+u]=temp_val[u]; // good for P as a multiple of 16
+				}		
+				wr_index++;
+			}
+			ddr_access_pointer++;
+		
+			if (i==(WLOADTIME-1)) { //we should keep track of W for the next batch
+				Wpointer_prev = ddr_access_pointer;
+				ddr_access_pointer = (__global volatile float16 *)Bpointer_prev; //would byte aligning be a problem?
+				wr_index = 0;
+			}
+		}
+		
+		//do the matrix multiplication of tile with X
+		
+		
+		
+	}
+		
+
+}
+	
+
author	Hieu Hoang <hieuhoang@gmail.com>	2018-01-25 18:43:57 +0300
committer	Hieu Hoang <hieuhoang@gmail.com>	2018-01-25 18:43:57 +0300
commit	c2ae707d5b4c5e12bc3b99a09db37ba6b2261fcb (patch)
tree	bfdae1f311480b8bb98e3265dd25768356950616
parent	7d239425d030318281d32d152b2d0d20848a7602 (diff)
parent	fee148295545cc7cdfac4039314349d235e57204 (diff)