diff options
author | Eric Whyne <ericwhyne@gmail.com> | 2013-10-24 19:12:05 +0400 |
---|---|---|
committer | Eric Whyne <ericwhyne@gmail.com> | 2013-10-24 19:12:05 +0400 |
commit | 73de4001d909a1253536ec8f644091b1c6ae1b63 (patch) | |
tree | e15d7b133850ca4efe50b1dd564dc84ca1e36421 /utl | |
parent | 9379deff7595fb147b55646065d02b3ddaaf0fac (diff) |
Added csv to binary classifier training data utility.
Diffstat (limited to 'utl')
-rwxr-xr-x | utl/convert_csv_to_vw_binary.py | 37 |
1 files changed, 37 insertions, 0 deletions
#!/usr/bin/python
# 2013 Eric Whyne http://www.datamungeblog.com/
# File: utl/convert_csv_to_vw_binary.py (mode 100755)
"""Convert CSV data to a very basic Vowpal Wabbit binary-classifier format.

Usage:
    convert_csv_to_vw_binary.py <infile.csv> <outfile.vw> <category index>
                                <positive regex> <negative regex>

One CSV column (selected by the category index) holds the class.  It is
removed from the record and mapped to the label ``1`` or ``-1`` by the two
regular expressions; every remaining column becomes a feature on the output
line.  The code deliberately avoids Python-3-only syntax so it keeps running
under the ``/usr/bin/python`` named in the shebang, whichever version that is.
"""
import csv
import re
import sys

USAGE = ('This script converts csv data to a very basic vowpal wabbit binary '
         'classifier training data format.\n\n'
         'Usage: %s <infile.csv> <outfile.vw> <category index> '
         '<positive regex> <negative regex>\n\n'
         'Category index and the regular expression options define where to '
         'look for the binary category and how to identify it.')

# Compiled once at import time instead of being looked up for every column.
WHITESPACE_RE = re.compile(r'\s')
# An optionally signed decimal with at least one digit.  The previous test,
# '^[0-9.]*$', also accepted '' and '1.2.3' (producing malformed features
# such as "f3:") and rejected negative numbers.
NUMBER_RE = re.compile(r'^[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)$')
# '|' starts a namespace and ':' separates a feature name from its value in
# the VW text format, so neither may appear inside a plain token.
VW_RESERVED_RE = re.compile(r'[|:]')


def convert_row(fields, category_index, pregex, nregex):
    """Return the Vowpal Wabbit training line for one CSV record.

    Args:
        fields: sequence of column strings for one record (not modified).
        category_index: position of the category column; negative values
            count from the end, as with ``list.pop``.
        pregex: pattern (string or compiled) identifying the positive class.
        nregex: pattern (string or compiled) identifying the negative class.
            It is tested first, so it wins when both patterns match.

    Returns:
        ``"<label> | <feature> <feature> ... \\n"`` where the label is ``1``
        or ``-1``.  Numeric columns become ``f<column>:<value>``; any other
        column is emitted as a bare token.  Column numbers are counted after
        the category column has been removed.

    Raises:
        ValueError: if the record has no column at ``category_index`` or the
            category matches neither pattern.
    """
    columns = list(fields)  # copy so the caller's row is left untouched
    record = ','.join(columns)
    if not -len(columns) <= category_index < len(columns):
        raise ValueError('Record has no column %d: %s'
                         % (category_index, record))
    category = columns.pop(category_index)

    if re.search(nregex, category):  # regex for negative category
        label = '-1'
    elif re.search(pregex, category):  # regex for positive category
        label = '1'
    else:
        raise ValueError('Regex did not match record: %s' % record)

    features = []
    for colnum, col in enumerate(columns):
        col = WHITESPACE_RE.sub('', col)  # remove all whitespace
        if not col:
            # An empty cell carries no information; writing "f3:" for it
            # would produce a line VW cannot parse.
            continue
        if NUMBER_RE.match(col):
            features.append('f%d:%s' % (colnum, col))
        else:
            features.append(VW_RESERVED_RE.sub('_', col))
    # Every feature keeps its trailing space, matching the historical output.
    return label + ' | ' + ''.join(feat + ' ' for feat in features) + '\n'


def main(argv=None):
    """Run the converter; ``argv`` defaults to ``sys.argv``.

    Returns 0 on success.  Any usage or data error terminates the process
    through ``sys.exit`` with a message on stderr and a non-zero status.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 6:
        sys.exit(USAGE % (argv[0] if argv else 'convert_csv_to_vw_binary.py'))

    in_path, out_path = argv[1], argv[2]
    try:
        category_index = int(argv[3])
    except ValueError:
        sys.exit('Category index must be an integer, got %r' % argv[3])
    try:
        # Compile up front so a bad pattern is reported before any output
        # is written, and so the patterns are not re-resolved per record.
        pregex = re.compile(argv[4])
        nregex = re.compile(argv[5])
    except re.error as err:
        sys.exit('Invalid regular expression: %s' % err)

    # "with" guarantees both files are closed (and the output flushed) even
    # when a bad record aborts the run part-way through.
    with open(in_path, 'r') as infile, open(out_path, 'w') as outfile:
        # csv.reader honours quoted fields, which a plain split(',') breaks.
        for row in csv.reader(infile):
            if not row:  # blank line, e.g. at the end of the file
                continue
            try:
                outfile.write(
                    convert_row(row, category_index, pregex, nregex))
            except ValueError as err:
                sys.exit(str(err))
    return 0


if __name__ == '__main__':
    sys.exit(main())