diff options
author | Ronan Collobert <ronan@collobert.com> | 2014-11-11 09:28:25 +0300 |
---|---|---|
committer | Ronan Collobert <ronan@collobert.com> | 2014-11-11 09:28:25 +0300 |
commit | f91b3a2c65c1228a6e7b03acdb8673e8efdd1570 (patch) | |
tree | b2f55761a91d146e34392537ab7ad362f89ea519 /benchmark | |
parent | 7c95906eb811370b2bd1025d6bc1d6d9a3e0bd0b (diff) |
added benchmark suite, as well as report analysis
Diffstat (limited to 'benchmark')
-rw-r--r-- | benchmark/README.md | 130 | ||||
-rw-r--r-- | benchmark/doc/tree1.png | bin | 0 -> 93178 bytes | |||
-rw-r--r-- | benchmark/doc/tree7.png | bin | 0 -> 504277 bytes | |||
-rw-r--r-- | benchmark/torch7capi.lua | 31 | ||||
-rw-r--r-- | benchmark/torch9ffi.lua | 148 | ||||
-rw-r--r-- | benchmark/torch9luaffi.lua | 159 |
6 files changed, 468 insertions, 0 deletions
diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..e4289f7 --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,130 @@ +argcheck benchmark +================== + +We show here an example of `argcheck` in a real-life case: wrapping a call +to the numerical library [TH](https://github.com/torch/TH), used in +[torch7](https://github.com/torch/torch7). + +The code does a simple loop over this particular function call. In torch7, +this looks like: +```lua + for i=1,N do + torch.add(y, x, 5) + torch.add(y, x, scale, y) + end +``` + +The add function of torch7 is non-trivial, because it has to handle cases +where one wants to add a tensor to a tensor or a value to a tensor. There +is also an optional scale argument. The function is also overloaded for 7 +different types of tensors (double, float, int...), which makes things even +more uneasy. We define the double overloading last, to study the worst case +performance. + +In the following, we compare: + - `torch7` (here ran with luajit). Torch7 uses the regular lua/C API. + - `torch9`, a FFI interface for [`luajit`](http://luajit.org), to the TH library achieved with `argcheck`. + - `torch9lua`, running [`lua`](http://www.lua.org) with [`libffi`](https://github.com/jmckaskill/luaffi) and `argcheck`. + - `C`, plain C calls to `TH` library. Contrary to other versions, _it does not include the overhead of multiple tensor types_. + +What we call `torch9` here is only a thin interface to `TH` with FFI, +limited to the purpose of this benchmark. The only thing it has to do with +the upcoming `torch9` is the way we use `argcheck` with FFI. + +We avoid garbage-collection side-effects by not allocating objects. + +## Call to argcheck + +We create a function `add()`, which is overloaded to handle various possible argument situations. 
+ +```lua +add = argcheck{ + overload = add, + {name="res", type="torch.DoubleTensor", opt=true}, + {name="src", type="torch.DoubleTensor"}, + {name="value", type="number"}, + call = + function(res, src, value) + res = res or DoubleTensor() + C.THDoubleTensor_add(res, src, value) + return res + end +} + +add = argcheck{ + overload = add, + {name="res", type="torch.DoubleTensor", opt=true}, + {name="src1", type="torch.DoubleTensor"}, + {name="value", type="number", default=1}, + {name="src2", type="torch.DoubleTensor"}, + call = + function(res, src1, value, src2) + res = res or torch.DoubleTensor() + C.THDoubleTensor_cadd(res, src1, value, src2) + return res + end +} +``` + +As you can see, there are many variations to handle. The generated code is +201 lines of `lua` code, only for the case of DoubleTensor. With all the 7 +types of tensor, it is 5250 lines of code! This code handles both ordered +arguments (as in `torch7`) and named argument calls. Named argument calls are +just syntactic sugar, but are slower (they imply creating argument +tables, and looping over them, which is not JIT-compiled in the current +`luajit` 2.1). + +The tree generated in the case of DoubleTensor alone is the following: +![](doc/tree1.png) +When it includes all the 7 types of tensors: +![](doc/tree7.png) + +## Running it + +We now compare our different setups with matrix sizes of size 2, 10, 100, +and 300 over 100,000,000, 10,000,000, 1,000,000 and 100,000 iterations +respectively. Running time is given in seconds. Experiments were performed +on a MacBook Pro 2.6GHz Quad-core i7, using one core. Overhead per call is +reported, in nano-seconds, computed with the first two columns (w.r.t. C +performance). 
+ +| | 2 | 10 | 100 | 300 | overhead | +|:--------------------------------|---------:|---------:|---------:|---------:|-----------:| +| C | 3.82s | 1.16s | 8.74s | 10.34s | 0ns | +| torch7 (luajit+C API) (jit=on) | 73.45s | 8.22s | 9.47s | 10.47s | 701ns | +| torch7 (luajit+C API) (jit=off) | 72.22s | 8.21s | 9.49s | 10.59s | 694ns | +| torch9 (luajit+ffi) (jit=on) | 3.80s | 1.14s | 8.82s | 10.30s | -1ns | +| torch9 (luajit+ffi) (jit=off) | 167.62s | 17.35s | 10.75s | 10.83s | 1619ns | +| torch9 (lua+luaffi) | 256.20s | 26.93s | 11.30s | 10.66s | 2550ns | + +### Comments + +Not surprisingly, the old lua/C API has quite some overhead when calling +short duration C code. + +`luajit` does an impressive job in calling C functions through FFI. It +stays on par with C performance, even when C operations are limited (small +matrix size). `argcheck` is viable even in interpreted mode with luajit, +with only a x2 overhead compared to the lua/C API. + +The Lua interpreter (with the luaffi library) has clearly more +overhead. `argcheck` might still be very usable (here 2.5µs per call, in a +pretty complicated setup), depending on your use-case. + +## Named arguments + +As mentioned earlier, named argument calls are expected to be slower. Here is +a comparison against ordered argument calls, using the same benchmark. In +our case, the overhead is about 1µs per call with luajit (note that with +jit off, the performance is similar, meaning luajit relies mainly on the +interpreter in that case). Our test case is pretty complicated, your +mileage might vary... 
+ +| | 2 | 10 | 100 | 300 | overhead | +|:-----------------------------------------|---------:|---------:|---------:|---------:|-----------:| +| torch9 (luajit+ffi) (jit=on) (ordered) | 3.80s | 1.14s | 8.82s | 10.30s | -1ns | +| torch9 (luajit+ffi) (jit=off) (ordered) | 167.62s | 17.35s | 10.75s | 10.83s | 1628ns | +| torch9 (lua+luaffi) (ordered) | 256.20s | 26.93s | 11.30s | 10.66s | 2550ns | +| torch9 (luajit+ffi) (jit=on) (named) | 110.24s | 11.81s | 9.85s | 10.29s | 1064ns | +| torch9 (luajit+ffi) (jit=off) (named) | 205.99s | 21.92s | 11.08s | 10.72s | 2049ns | +| torch9 (lua+luaffi) (named) | 486.19s | 49.48s | 13.87s | 10.66s | 4828ns | diff --git a/benchmark/doc/tree1.png b/benchmark/doc/tree1.png Binary files differnew file mode 100644 index 0000000..6a5b6d3 --- /dev/null +++ b/benchmark/doc/tree1.png diff --git a/benchmark/doc/tree7.png b/benchmark/doc/tree7.png Binary files differnew file mode 100644 index 0000000..b54fa78 --- /dev/null +++ b/benchmark/doc/tree7.png diff --git a/benchmark/torch7capi.lua b/benchmark/torch7capi.lua new file mode 100644 index 0000000..4694f2a --- /dev/null +++ b/benchmark/torch7capi.lua @@ -0,0 +1,31 @@ +require 'torch' + +local SZ = tonumber(arg[1]) +local N = tonumber(arg[2]) +local scale = tonumber(arg[3]) or 1 + +torch.manualSeed(1111) + +local x = torch.rand(SZ,SZ) +local y = torch.rand(SZ,SZ) + +print('x', x:norm()) +print('y', x:norm()) +print('running') + +local clk = os.clock() +if scale == 1 then + for i=1,N do + torch.add(y, x, 5) + torch.add(y, x, y) + end +else + for i=1,N do + torch.add(y, x, 5) + torch.add(y, x, scale, y) + end +end +print('time (s)', os.clock()-clk) + +print('x', x:norm()) +print('y', y:norm()) diff --git a/benchmark/torch9ffi.lua b/benchmark/torch9ffi.lua new file mode 100644 index 0000000..2a1745b --- /dev/null +++ b/benchmark/torch9ffi.lua @@ -0,0 +1,148 @@ +local argcheck = require 'argcheck' +local ffi = require 'ffi' +local class = require 'class' + +local SZ = tonumber(arg[1]) +local 
N = tonumber(arg[2]) +local scale = tonumber(arg[3]) or 1 +local dbg = arg[4] == '1' +local named = arg[5] == '1' + +if named then + print('warning: using named arguments!') +end + +ffi.cdef[[ + +typedef struct THLongStorage THLongStorage; +THLongStorage* THLongStorage_newWithSize2(long, long); +void THLongStorage_free(THLongStorage *storage); + +typedef struct THGenerator THGenerator; +THGenerator* THGenerator_new(); +void THRandom_manualSeed(THGenerator *_generator, unsigned long the_seed_); + +typedef struct THDoubleTensor THDoubleTensor; +THDoubleTensor *THDoubleTensor_new(void); +void THDoubleTensor_free(THDoubleTensor *self); +void THDoubleTensor_rand(THDoubleTensor *r_, THGenerator *_generator, THLongStorage *size); +void THDoubleTensor_add(THDoubleTensor *r_, THDoubleTensor *t, double value); +void THDoubleTensor_cadd(THDoubleTensor *r_, THDoubleTensor *t, double value, THDoubleTensor *src); +double THDoubleTensor_normall(THDoubleTensor *t, double value); + +]] + +local status, C = pcall(ffi.load, 'TH') +if not status then + error('please specify path to libTH in your (DY)LD_LIBRARY_PATH') +end + +local DoubleTensor = class('torch.DoubleTensor', ffi.typeof('THDoubleTensor&')) + +function DoubleTensor.new() + local self = C.THDoubleTensor_new() + self = ffi.cast('THDoubleTensor&', self) + ffi.gc(self, C.THDoubleTensor_free) + return self +end + +function DoubleTensor:norm(l) + l = l or 2 + return tonumber(C.THDoubleTensor_normall(self, l)) +end + +ffi.metatype('THDoubleTensor', getmetatable(DoubleTensor)) + +local _gen = C.THGenerator_new() +C.THRandom_manualSeed(_gen, 1111) + +local function rand(a, b) + local size = C.THLongStorage_newWithSize2(a, b) + local self = DoubleTensor() + C.THDoubleTensor_rand(self, _gen, size) + C.THLongStorage_free(size) + return self +end + +local add +local dotgraph + +for _, RealTensor in ipairs{--'torch.ByteTensor', 'torch.ShortTensor', 'torch.FloatTensor', +--'torch.LongTensor', 'torch.IntTensor', 'torch.CharTensor', 
+'torch.DoubleTensor'} do + + add = argcheck{ + chain = add, + {name="res", type=RealTensor, opt=true}, + {name="src", type=RealTensor}, + {name="value", type="number"}, + call = + function(res, src, value) + res = res or DoubleTensor() + C.THDoubleTensor_add(res, src, value) + return res + end + } + + add, dotgraph = argcheck{ + debug = dbg, + overload = add, + {name="res", type=RealTensor, opt=true}, + {name="src1", type=RealTensor}, + {name="value", type="number", default=1}, + {name="src2", type=RealTensor}, + call = + function(res, src1, value, src2) + res = res or torch.DoubleTensor() + C.THDoubleTensor_cadd(res, src1, value, src2) + return res + end + } + +end + +if dotgraph then + local f = io.open('argtree.dot', 'w') + f:write(dotgraph) + f:close() +end + +local x = rand(SZ, SZ) +local y = rand(SZ, SZ) + +print('x', x:norm()) +print('y', x:norm()) +print('running') + +if named then + local clk = os.clock() + if scale == 1 then + for i=1,N do + add{res=y, src=x, value=5} + add{res=y, src1=x, src2=y} + end + else + for i=1,N do + add{res=y, src=x, value=5} + add{res=y, src1=x, value=scale, src2=y} + end + end + print('time (s)', os.clock()-clk) +else + local clk = os.clock() + if scale == 1 then + for i=1,N do + add(y, x, 5) + add(y, x, y) + end + else + for i=1,N do + add(y, x, 5) + add(y, x, scale, y) + end + end + print('time (s)', os.clock()-clk) +end + +print('x', x:norm()) +print('y', y:norm()) diff --git a/benchmark/torch9luaffi.lua b/benchmark/torch9luaffi.lua new file mode 100644 index 0000000..161e592 --- /dev/null +++ b/benchmark/torch9luaffi.lua @@ -0,0 +1,159 @@ +local argcheck = require 'argcheck' +local ffi = require 'ffi' + +local env = require 'argcheck.env' + +local SZ = tonumber(arg[1]) +local N = tonumber(arg[2]) +local scale = tonumber(arg[3]) or 1 +local dbg = arg[4] == '1' +local named = arg[5] == '1' + +if named then + print('warning: using named arguments!') +end + +function env.istype(obj, typename) + if type(obj) == 'userdata' then 
+ if typename == 'torch.DoubleTensor' then + return true + else + return false + end + end + return type(obj) == typename +end + +ffi.cdef[[ + +typedef struct THLongStorage THLongStorage; +THLongStorage* THLongStorage_newWithSize2(long, long); +void THLongStorage_free(THLongStorage *storage); + +typedef struct THGenerator THGenerator; +THGenerator* THGenerator_new(); +void THRandom_manualSeed(THGenerator *_generator, unsigned long the_seed_); + +typedef struct THDoubleTensor THDoubleTensor; +THDoubleTensor *THDoubleTensor_new(void); +void THDoubleTensor_free(THDoubleTensor *self); +void THDoubleTensor_rand(THDoubleTensor *r_, THGenerator *_generator, THLongStorage *size); +void THDoubleTensor_add(THDoubleTensor *r_, THDoubleTensor *t, double value); +void THDoubleTensor_cadd(THDoubleTensor *r_, THDoubleTensor *t, double value, THDoubleTensor *src); +double THDoubleTensor_normall(THDoubleTensor *t, double value); + +]] + +local status, C = pcall(ffi.load, ffi.os == 'OSX' and 'libTH.dylib' or 'libTH.so') +if not status then + error('please specify path to libTH in your (DY)LD_LIBRARY_PATH') +end + +local DoubleTensor = {} + +function DoubleTensor_new() + local self = C.THDoubleTensor_new() + ffi.gc(self, C.THDoubleTensor_free) + return self +end + +function DoubleTensor:norm(l) + l = l or 2 + return tonumber(C.THDoubleTensor_normall(self, l)) +end + +DoubleTensor_mt = {__index=DoubleTensor, __new=DoubleTensor_new} +DoubleTensor = ffi.metatype('THDoubleTensor', DoubleTensor_mt) + +local _gen = C.THGenerator_new() +C.THRandom_manualSeed(_gen, 1111) + +local function rand(a, b) + local size = C.THLongStorage_newWithSize2(a, b) + local self = DoubleTensor() + C.THDoubleTensor_rand(self, _gen, size) + C.THLongStorage_free(size) + return self +end + +local add +local dotgraph + +for _, RealTensor in ipairs{'torch.ByteTensor', 'torch.ShortTensor', 'torch.FloatTensor', +'torch.LongTensor', 'torch.IntTensor', 'torch.CharTensor', +'torch.DoubleTensor'} do + + add = argcheck{ + 
{name="res", type=RealTensor, opt=true}, + {name="src", type=RealTensor}, + {name="value", type="number"}, + call = + function(res, src, value) + res = res or DoubleTensor() + C.THDoubleTensor_add(res, src, value) + return res + end + } + + add, dotgraph = argcheck{ + debug=dbg, + overload = add, + {name="res", type=RealTensor, opt=true}, + {name="src1", type=RealTensor}, + {name="value", type="number", default=1}, + {name="src2", type=RealTensor}, + call = + function(res, src1, value, src2) + res = res or torch.DoubleTensor() + C.THDoubleTensor_cadd(res, src1, value, src2) + return res + end + } + +end + +if dotgraph then + local f = io.open('argtree.dot', 'w') + f:write(dotgraph) + f:close() +end + +local x = rand(SZ, SZ) +local y = rand(SZ, SZ) + +print('x', x:norm()) +print('y', x:norm()) +print('running') + +if named then + local clk = os.clock() + if scale == 1 then + for i=1,N do + add{res=y, src=x, value=5} + add{res=y, src1=x, src2=y} + end + else + for i=1,N do + add{res=y, src=x, value=5} + add{res=y, src1=x, value=scale, src2=y} + end + end + print('time (s)', os.clock()-clk) +else + local clk = os.clock() + if scale == 1 then + for i=1,N do + add(y, x, 5) + add(y, x, y) + end + else + for i=1,N do + add(y, x, 5) + add(y, x, scale, y) + end + end + print('time (s)', os.clock()-clk) +end + +print('x', x:norm()) +print('y', y:norm()) |