Preprocess R data with Python

This script can easily be adapted to code, recode, or otherwise transform CSV files before loading them into R.

#!/usr/local/bin/python
#  transform a raw data file into a happy proto data frame
#

import csv

# Verbs that may occur in an item's sentence.  Each sentence is expected to
# contain exactly one of them; the first match in list order is used.
verbs = [
    'mentioned',
    'indicated',
    'proposed',
    'suggested',
    'recommended',
    'confessed',
    'stated',
    'announced',
    'muttered',
    'explained',
]


def _split_condition(code):
    """Split a condition code such as 'N12' into ('N', '12').

    Return None when the code is empty or starts with neither 'N' nor 'S'.
    """
    if code[:1] in ('N', 'S'):
        return code[0], code[1:]
    return None


def _find_verb(sentence):
    """Return the first entry of ``verbs`` contained in *sentence*, or None."""
    for candidate in verbs:
        if candidate in sentence:
            return candidate
    return None


def main(in_name='transposed.csv', out_name='reshaped.csv', n_subjects=192):
    """Reshape a transposed (one row per item) CSV into long format.

    The input must have a header row containing the columns 'V1' (a
    condition code: 'N' or 'S' followed by a length) and 'ResponseID'
    (the item's sentence), plus per-subject score columns named 'ID.1',
    'ID.2', ...  For every cell whose score consists only of digits, one
    row ``subject, score, shift, length, verb`` is written to the output.

    Rows that cannot be coded are skipped instead of inheriting values
    from the previous row: rows too short to hold the 'V1'/'ResponseID'
    cells, rows whose code is empty or does not start with 'N' or 'S',
    and rows whose sentence contains none of the known verbs.

    Requires Python 3 (files are opened with ``newline=''`` as the csv
    module expects).

    Args:
        in_name: path of the CSV file to read.
        out_name: path of the CSV file to write (overwritten).
        n_subjects: highest subject number to look for; columns 'ID.1'
            through 'ID.<n_subjects>' are used when present in the header.

    Returns:
        The number of data rows written (excluding the header row).

    Raises:
        ValueError: if the input has no 'V1' or 'ResponseID' column.
    """
    with open(in_name, newline='') as src:
        reader = csv.reader(src, delimiter=',')
        header = next(reader, [])

        # Resolve column positions once instead of calling header.index()
        # for every cell.  setdefault keeps the first occurrence of a
        # duplicated name, which is what list.index() returned.
        column = {}
        for position, name in enumerate(header):
            column.setdefault(name, position)

        missing = [name for name in ('V1', 'ResponseID') if name not in column]
        if missing:
            raise ValueError(
                'missing column(s) in %s: %s' % (in_name, ', '.join(missing)))

        code_col = column['V1']
        sentence_col = column['ResponseID']
        # Subjects without a column in this file are simply left out.
        subject_cols = [
            (subject, column['ID.%d' % subject])
            for subject in range(1, n_subjects + 1)
            if 'ID.%d' % subject in column
        ]

        written = 0
        with open(out_name, 'w', newline='') as dst:
            writer = csv.writer(dst, delimiter=',')
            writer.writerow(['subject', 'score', 'shift', 'length', 'verb'])

            for row in reader:
                if len(row) <= max(code_col, sentence_col):
                    # blank or truncated line: nothing to code
                    continue

                condition = _split_condition(row[code_col])
                if condition is None:
                    continue
                shift, length = condition

                verb = _find_verb(row[sentence_col])
                if verb is None:
                    continue

                for subject, position in subject_cols:
                    if position >= len(row):
                        # that cell is missing for this subject
                        continue
                    score = row[position]
                    # only numeric responses count as data points
                    if score.isdigit():
                        writer.writerow([subject, score, shift, length, verb])
                        written += 1

    return written


if __name__ == '__main__':
    main()
  posted in: python and data