Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/vowpal_wabbit.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/utl
diff options
context:
space:
mode:
authorEric Whyne <ericwhyne@gmail.com>2013-10-24 19:12:05 +0400
committerEric Whyne <ericwhyne@gmail.com>2013-10-24 19:12:05 +0400
commit73de4001d909a1253536ec8f644091b1c6ae1b63 (patch)
treee15d7b133850ca4efe50b1dd564dc84ca1e36421 /utl
parent9379deff7595fb147b55646065d02b3ddaaf0fac (diff)
Added csv to binary classifier training data utility.
Diffstat (limited to 'utl')
-rwxr-xr-xutl/convert_csv_to_vw_binary.py37
1 files changed, 37 insertions, 0 deletions
diff --git a/utl/convert_csv_to_vw_binary.py b/utl/convert_csv_to_vw_binary.py
new file mode 100755
index 00000000..323a2aaf
--- /dev/null
+++ b/utl/convert_csv_to_vw_binary.py
@@ -0,0 +1,37 @@
#!/usr/bin/python
# 2013 Eric Whyne http://www.datamungeblog.com/
"""Convert CSV data to a basic Vowpal Wabbit binary-classifier training file.

Each CSV record becomes one output line of the form::

    <label> | <feature> <feature> ...

The label is ``1`` or ``-1``, chosen by matching two regular expressions
against the column at the category index.  The remaining columns become
features: plain decimal numbers are written as ``f<column>:<value>`` and
everything else is written as a bare token.

Usage: convert_csv_to_vw_binary.py <infile.csv> <outfile.vw>
       <category index> <positive regex> <negative regex>
"""
import csv
import re
import sys

USAGE = (
    'This script converts csv data to a very basic vowpal wabbit binary '
    'classifier training data format.\n\n'
    'Usage: %s <infile.csv> <outfile.vw> <category index> <positive regex> '
    '<negative regex>\n\n'
    'Category index and the regular expression options define where to look '
    'for the binary category and how to identify it.'
)

# Plain decimal number containing at least one digit.  Requiring a digit keeps
# empty fields, "." and strings such as "1.2.3" from being written as values.
NUMBER_RE = re.compile(r'^(?:[0-9]+\.?[0-9]*|\.[0-9]+)$')
WHITESPACE_RE = re.compile(r'\s')
# ':' separates a feature name from its value and '|' starts a namespace in
# the VW text format, so neither may appear inside a bare feature token.
VW_RESERVED_RE = re.compile(r'[:|]')


def format_feature(colnum, value):
    """Return the VW token for one CSV field, or None if the field is empty.

    colnum -- position of the field among the non-category columns; used to
              name numeric features ("f0", "f1", ...).
    value  -- raw field text.  All whitespace is removed before formatting.
    """
    value = WHITESPACE_RE.sub('', value)
    if not value:
        # Nothing to emit; the caller still advances the column number.
        return None
    if NUMBER_RE.match(value):
        return 'f%d:%s' % (colnum, value)
    return VW_RESERVED_RE.sub('_', value)


def convert_line(line, category_index, pregex, nregex):
    """Convert one CSV record to one VW training line.

    line           -- a single line of CSV text (trailing newline allowed).
    category_index -- index of the column holding the class; negative values
                      count from the end, as with list.pop().
    pregex         -- pattern (string or compiled) marking the positive class.
    nregex         -- pattern (string or compiled) marking the negative class.
                      It is tested first, so it wins if both patterns match.

    Returns the output line including its trailing newline, or None for a
    blank input line.  Raises ValueError if the record cannot be parsed, the
    category index is out of range, or neither pattern matches.
    """
    if not line.strip():
        return None
    try:
        # The csv module handles quoted fields that contain commas.
        fields = next(csv.reader([line]))
    except csv.Error as err:
        raise ValueError('Could not parse record %r: %s' % (line, err))
    try:
        category = fields.pop(category_index)
    except IndexError:
        raise ValueError('Category index %d is out of range for record %r'
                         % (category_index, line))

    if re.search(nregex, category):  # regex for negative category
        label = '-1'
    elif re.search(pregex, category):  # regex for positive category
        label = '1'
    else:
        raise ValueError('Regex did not match record %s' % line.rstrip('\r\n'))

    features = []
    for colnum, col in enumerate(fields):
        feature = format_feature(colnum, col)
        if feature is not None:
            features.append(feature)
    return label + ' | ' + ' '.join(features) + '\n'


def main(argv):
    """Run the converter with sys.argv-style arguments.

    Exits with a message on stderr and a non-zero status on bad arguments,
    an unreadable or unwritable file, or a record that cannot be converted.
    """
    if len(argv) != 6:
        sys.exit(USAGE % argv[0])
    try:
        category_index = int(argv[3])
    except ValueError:
        sys.exit('Category index must be an integer, got %r' % argv[3])
    try:
        pregex = re.compile(argv[4])
        nregex = re.compile(argv[5])
    except re.error as err:
        sys.exit('Invalid regular expression: %s' % err)

    try:
        # "with" closes both files even when a record aborts the run.
        with open(argv[1], 'r') as infile, open(argv[2], 'w') as outfile:
            for line in infile:
                outline = convert_line(line, category_index, pregex, nregex)
                if outline is not None:
                    outfile.write(outline)
    except (IOError, ValueError) as err:
        sys.exit(str(err))


if __name__ == '__main__':
    main(sys.argv)