using l2 regularization = 1
enabling BFGS based optimization **without** curvature calculation
Num weight bits = 20
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
m = 7
Allocated 72M for weights and mem
## avg. loss  der. mag.  d. m. cond.  wolfe1     wolfe2     mix fraction  curvature  dir. magnitude  step size
creating cache_file = train-sets/rcv1_small.dat.cache
Reading datafile = train-sets/rcv1_small.dat
num sources = 1
 1 0.69315    0.00186    0.34333                                          0.29016    135.67795       1.18327
 3 0.46631    0.00773    0.16072      0.558359   0.205344                            176.93877       1.00000
 4 0.36719    0.00415    0.06788      0.435725   -0.010674                           28.54624        1.00000
 5 0.33604    0.00047    0.01318      0.715574   0.454858                            32.82015        1.00000
 6 0.32041    0.00001    0.00476      0.679156   0.380024                            22.63070        1.00000
 7 0.31063    0.00005    0.00350      0.755260   0.517819                            55.27250        1.00000
 8 0.29882    0.00002    0.00095      0.666044   0.330579                            27.55031        1.00000
 9 0.29526    0.00000    0.00011      0.562531   0.101449                            2.45273         1.00000
10 0.29486    0.00000    0.00003      0.559816   0.115043                            0.80446         1.00000
11 0.29475    0.00000    0.00001      0.504319   0.006255                            0.09031         1.00000

finished run
number of examples = 200000
weighted example sum = 200000
weighted label sum = -12720
average loss = 0.199705
best constant = -0.0636
total feature number = 15587880
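The transcript above is Vowpal Wabbit's --bfgs mode (L-BFGS with an m-pair memory, here m = 7); each iteration row fills only the columns VW prints for that step type, which is why rows carry fewer fields than the nine-column header. As a point of reference, the following invocation is a plausible reconstruction, not the verbatim command behind this run: --mem 7 matches "m = 7", --l2 1.0 matches the regularization line, -b 20 matches "Num weight bits = 20", -c matches the cache-file lines, and the initial loss of 0.69315 ≈ ln 2 suggests --loss_function logistic; the pass count is an assumption, since the trace terminates on its own after iteration 11.

    vw -d train-sets/rcv1_small.dat -c --loss_function logistic --bfgs --mem 7 --l2 1.0 -b 20 --passes 20

The closing summary is internally consistent: the reported best constant equals weighted label sum / weighted example sum, i.e. -12720 / 200000 = -0.0636.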