enabling BFGS based optimization **without** curvature calculation
creating cache_file = train-sets/frank.dat.cache
Reading from train-sets/frank.dat
num sources = 1
Num weight bits = 20
learning rate = 10
initial_t = 1
power_t = 0.5
decay_learning_rate = 1
using l2 regularization
m = 7
Allocated 72M for weights and mem
## avg. loss 	der. mag. 	d. m. cond.	 wolfe1    	wolfe2    	mix fraction	curvature 	dir. magnitude	step size 	time      
 1 3.313292e+12	9.499654e+14	1.905397e+13	          	          	          	6.440241e+13	2.698544e+12	2.958580e-01	0.690
 3 4.946559e+11	1.587411e+13	2.431760e+11	 0.500000  	0.000000  	          	          	4.134247e+08	1.000000e+00	1.152     
 4 4.262257e+11	1.360521e+13	2.087762e+11	 0.963277  	0.926554  	          	          	9.978023e+10	1.000000e+00	1.711     
 5 5.509095e+09	1.685125e+10	8.165054e+08	 0.502727  	0.005672  	          	          	4.928468e+09	1.000000e+00	2.350     
 6 2.722915e+09	9.129735e+09	4.115941e+08	 0.848424  	0.697668  	          	          	2.678213e+10	1.000000e+00	3.318     
 7 5.137694e+06	6.706857e+06	1.018132e+05	 0.499525  	-0.000686 	          	          	2.755246e+05	1.000000e+00	4.523     
 8 5.995715e+06	3.517040e+07	5.294065e+05	 -0.644854 	-2.289846 	          	          	(revise x 0.5)	5.000000e-01	4.767
 9 5.019540e+06	2.805249e+06	4.237483e+04	 0.177600  	-0.644881 	          	          	1.362459e+04	1.000000e+00	6.061     
10 4.934764e+06	6.562551e+00	1.255939e-01	 0.499927  	-0.000122 	          	          	7.431789e-03	1.000000e+00	7.549     


finished run
number of examples = 100000
weighted example sum = 1e+05
weighted label sum = -1.577e+11
average loss = 7.556e+11
best constant = -1.577e+06
total feature number = 400000