Private
Public Access
1
0
Files
rowsandall/rowers/dataprep.py
Sander Roosendaal b0d6430e5b new user sample data
2017-06-16 08:46:13 +02:00

1542 lines
43 KiB
Python

# All the data preparation, data cleaning and data mangling should
# be defined here
from rowers.models import Workout, User, Rower,StrokeData
from rowingdata import rowingdata as rrdata
from rowers.tasks import handle_sendemail_unrecognized
from rowers.tasks import handle_zip_file
from rowingdata import rower as rrower
from rowingdata import main as rmain
from rowingdata import get_file_type,get_empower_rigging
from pandas import DataFrame,Series
from pytz import timezone as tz,utc
from django.utils import timezone
from time import strftime,strptime,mktime,time,daylight
from django.utils.timezone import get_current_timezone
thetimezone = get_current_timezone()
from rowingdata import (
TCXParser,RowProParser,ErgDataParser,TCXParserNoHR,
BoatCoachParser,RowPerfectParser,BoatCoachAdvancedParser,
MysteryParser,
painsledDesktopParser,speedcoachParser,ErgStickParser,
SpeedCoach2Parser,FITParser,fitsummarydata,
make_cumvalues,
summarydata,get_file_type,
)
from rowers.models import Team
from rowers.metrics import axes
from async_messages import messages as a_messages
import os
import zipfile
import pandas as pd
import numpy as np
import itertools
import math
from tasks import handle_sendemail_unrecognized
from django.conf import settings
from sqlalchemy import create_engine
import sqlalchemy as sa
import sys
import utils
from utils import lbstoN
from scipy.interpolate import griddata
import django_rq
queue = django_rq.get_queue('default')
queuelow = django_rq.get_queue('low')
queuehigh = django_rq.get_queue('default')
user = settings.DATABASES['default']['USER']
password = settings.DATABASES['default']['PASSWORD']
database_name = settings.DATABASES['default']['NAME']
host = settings.DATABASES['default']['HOST']
port = settings.DATABASES['default']['PORT']
database_url = 'mysql://{user}:{password}@{host}:{port}/{database_name}'.format(
user=user,
password=password,
database_name=database_name,
host=host,
port=port,
)
# Use SQLite local database when we're in debug mode
if settings.DEBUG or user=='':
# database_url = 'sqlite:///db.sqlite3'
database_url = 'sqlite:///'+database_name
# mapping the DB column names to the CSV file column names
columndict = {
'time':'TimeStamp (sec)',
'hr':' HRCur (bpm)',
'pace':' Stroke500mPace (sec/500m)',
'spm':' Cadence (stokes/min)',
'power':' Power (watts)',
'averageforce':' AverageDriveForce (lbs)',
'drivelength':' DriveLength (meters)',
'peakforce':' PeakDriveForce (lbs)',
'distance':' Horizontal (meters)',
'catch':'catch',
'finish':'finish',
'peakforceangle':'peakforceangle',
'wash':'wash',
'slip':'wash',
'workoutstate':' WorkoutState',
}
from scipy.signal import savgol_filter
import datetime
def get_latlon(id):
try:
w = Workout.objects.get(id=id)
except Workout.DoesNotExist:
return False
rowdata = rdata(w.csvfilename)
try:
latitude = rowdata.df.ix[:,' latitude']
longitude = rowdata.df.ix[:,' longitude']
except KeyError:
latitude = 0*rowdata.df.ix[:,'TimeStamp (sec)']
latitude = 0*rowdata.df.ix[:,'TimeStamp (sec)']
return [latitude,longitude]
def get_workouts(ids,userid):
goodids = []
for id in ids:
w = Workout.objects.get(id=id)
if int(w.user.user.id) == int(userid):
goodids.append(id)
return [Workout.objects.get(id=id) for id in goodids]
def filter_df(datadf,fieldname,value,largerthan=True):
try:
x = datadf[fieldname]
except KeyError:
return datadf
if largerthan:
mask = datadf[fieldname] < value
else:
mask = datadf[fieldname] >= value
datadf.loc[mask,fieldname] = np.nan
return datadf
def getsinglecp(df):
thesecs = df['TimeStamp (sec)'].max()-df['TimeStamp (sec)'].min()
if thesecs != 0:
maxt = 2*thesecs
else:
maxt = 1000.
maxlog10 = np.log10(maxt)
logarr = np.arange(50)*maxlog10/50.
logarr = [int(10.**(la)) for la in logarr]
logarr = pd.Series(logarr)
logarr.drop_duplicates(keep='first',inplace=True)
logarr = logarr.values
dfnew = pd.DataFrame({
'time':df['TimeStamp (sec)']-df.ix[0,'TimeStamp (sec)'],
'power':df[' Power (watts)']
})
dfnew['workoutid'] = 0
dfgrouped = dfnew.groupby(['workoutid'])
delta,cpvalue,avgpower = getcp(dfgrouped,logarr)
return delta,cpvalue,avgpower
def getcp(dfgrouped,logarr):
delta = []
cpvalue = []
avgpower = {}
#avgpower[0] = 0
for id,group in dfgrouped:
tt = group['time'].copy()
ww = group['power'].copy()
tmax = tt.max()
newlen = int(tmax/5000.)
if newlen < len(tt):
newt = np.arange(newlen)*tmax/float(newlen)
ww = griddata(tt.values,
ww.values,
newt,method='linear',
rescale=True)
tt = pd.Series(newt)
ww = pd.Series(ww)
try:
avgpower[id] = int(ww.mean())
except ValueError:
avgpower[id] = '---'
if not np.isnan(ww.mean()):
length = len(ww)
dt = []
cpw = []
for i in xrange(length-2):
deltat,wmax = getmaxwattinterval(tt,ww,i)
if not np.isnan(deltat) and not np.isnan(wmax):
dt.append(deltat)
cpw.append(wmax)
dt = pd.Series(dt)
cpw = pd.Series(cpw)
if len(dt):
cpvalues = griddata(dt.values,
cpw.values,
logarr,method='linear',
rescale=True)
for cpv in cpvalues:
cpvalue.append(cpv)
for d in logarr:
delta.append(d)
delta = pd.Series(delta,name='Delta')
cpvalue = pd.Series(cpvalue,name='CP')
return delta,cpvalue,avgpower
def getmaxwattinterval(tt,ww,i):
w_roll = ww.rolling(i+2).mean().dropna()
if len(w_roll):
# now goes with # data points - should be fixed seconds
indexmax = w_roll.idxmax(axis=1)
try:
t_0 = tt.ix[indexmax]
t_1 = tt.ix[indexmax-i]
deltat = 1.0e-3*(t_0-t_1)
wmax = w_roll.ix[indexmax]
except KeyError:
pass
else:
wmax = 0
deltat = 0
return deltat,wmax
def df_resample(datadf):
# time stamps must be in seconds
timestamps = datadf['TimeStamp (sec)'].astype('int')
datadf['timestamps'] = timestamps
newdf = datadf.groupby(['timestamps']).mean()
return newdf
def clean_df_stats(datadf,workstrokesonly=True,ignorehr=True,
ignoreadvanced=False):
# clean data remove zeros and negative values
# bring metrics which have negative values to positive domain
try:
datadf['catch'] = -datadf['catch']
except KeyError:
pass
try:
datadf['peakforceangle'] = datadf['peakforceangle']+1000
except KeyError:
pass
try:
datadf['hr'] = datadf['hr']+10
except KeyError:
pass
datadf=datadf.clip(lower=0)
datadf.replace(to_replace=0,value=np.nan,inplace=True)
# return from positive domain to negative
try:
datadf['catch'] = -datadf['catch']
except KeyError:
pass
try:
datadf['peakforceangle'] = datadf['peakforceangle']-1000
except KeyError:
pass
try:
datadf['hr'] = datadf['hr']-10
except KeyError:
pass
# clean data for useful ranges per column
if not ignorehr:
try:
mask = datadf['hr'] < 30
datadf.loc[mask,'hr'] = np.nan
except KeyError:
pass
try:
mask = datadf['spm'] < 10
datadf.loc[mask,'spm'] = np.nan
except KeyError:
pass
try:
mask = datadf['pace']/1000. > 300.
datadf.loc[mask,'pace'] = np.nan
except KeyError:
pass
try:
mask = datadf['pace']/1000. < 60.
datadf.loc[mask,'pace'] = np.nan
except KeyError:
pass
try:
mask = datadf['spm'] > 60
datadf.loc[mask,'spm'] = np.nan
except KeyError:
pass
try:
mask = datadf['wash'] < 1
datadf.loc[mask,'wash'] = np.nan
except KeyError:
pass
if not ignoreadvanced:
try:
mask = datadf['rhythm'] < 5
datadf.loc[mask,'rhythm'] = np.nan
except KeyError:
pass
try:
mask = datadf['rhythm'] > 70
datadf.loc[mask,'rhythm'] = np.nan
except KeyError:
pass
try:
mask = datadf['power'] < 20
datadf.loc[mask,'power'] = np.nan
except KeyError:
pass
try:
mask = datadf['drivelength'] < 0.5
datadf.loc[mask,'drivelength'] = np.nan
except KeyError:
pass
try:
mask = datadf['forceratio'] < 0.2
datadf.loc[mask,'forceratio'] = np.nan
except KeyError:
pass
try:
mask = datadf['forceratio'] > 1.0
datadf.loc[mask,'forceratio'] = np.nan
except KeyError:
pass
try:
mask = datadf['drivespeed'] < 0.5
datadf.loc[mask,'drivespeed'] = np.nan
except KeyError:
pass
try:
mask = datadf['drivespeed'] > 4
datadf.loc[mask,'drivespeed'] = np.nan
except KeyError:
pass
try:
mask = datadf['driveenergy'] > 2000
datadf.loc[mask,'driveenergy'] = np.nan
except KeyError:
pass
try:
mask = datadf['driveenergy'] < 100
datadf.loc[mask,'driveenergy'] = np.nan
except KeyError:
pass
try:
mask = datadf['catch'] > -30.
datadf.loc[mask,'catch'] = np.nan
except KeyError:
pass
workoutstateswork = [1,4,5,8,9,6,7]
workoutstatesrest = [3]
workoutstatetransition = [0,2,10,11,12,13]
if workstrokesonly=='True' or workstrokesonly==True:
try:
datadf = datadf[~datadf['workoutstate'].isin(workoutstatesrest)]
except:
pass
return datadf
def getstatsfields():
# Get field names and remove those that are not useful in stats
fields = StrokeData._meta.get_fields()
fielddict = {field.name:field.verbose_name for field in fields}
#fielddict.pop('workoutid')
fielddict.pop('ergpace')
fielddict.pop('hr_an')
fielddict.pop('hr_tr')
fielddict.pop('hr_at')
fielddict.pop('hr_ut2')
fielddict.pop('hr_ut1')
fielddict.pop('time')
fielddict.pop('distance')
fielddict.pop('nowindpace')
fielddict.pop('fnowindpace')
fielddict.pop('fergpace')
fielddict.pop('equivergpower')
# fielddict.pop('workoutstate')
fielddict.pop('fpace')
fielddict.pop('pace')
fielddict.pop('id')
fielddict.pop('ftime')
fielddict.pop('x_right')
fielddict.pop('hr_max')
fielddict.pop('hr_bottom')
fielddict.pop('cumdist')
fieldlist = [field for field,value in fielddict.iteritems()]
return fieldlist,fielddict
# A string representation for time deltas
def niceformat(values):
out = []
for v in values:
formattedv = strfdelta(v)
out.append(formattedv)
return out
# A nice printable format for time delta values
def strfdelta(tdelta):
try:
minutes,seconds = divmod(tdelta.seconds,60)
tenths = int(tdelta.microseconds/1e5)
except AttributeError:
minutes,seconds = divmod(tdelta.view(np.int64),60e9)
seconds,rest = divmod(seconds,1e9)
tenths = int(rest/1e8)
res = "{minutes:0>2}:{seconds:0>2}.{tenths:0>1}".format(
minutes=minutes,
seconds=seconds,
tenths=tenths,
)
return res
# A nice printable format for pace values
def nicepaceformat(values):
out = []
for v in values:
formattedv = strfdelta(v)
out.append(formattedv)
return out
# Convert seconds to a Time Delta value, replacing NaN with a 5:50 pace
def timedeltaconv(x):
if np.isfinite(x) and x != 0:
dt = datetime.timedelta(seconds=x)
else:
dt = datetime.timedelta(seconds=350.)
return dt
# Processes painsled CSV file to database
def save_workout_database(f2,r,dosmooth=True,workouttype='rower',
dosummary=True,title='Workout',
workoutsource='unknown',
notes='',totaldist=0,totaltime=0,
summary='',
makeprivate=False,
oarlength=2.89,inboard=0.88,
consistencychecks=True):
message = None
powerperc = 100*np.array([r.pw_ut2,
r.pw_ut1,
r.pw_at,
r.pw_tr,r.pw_an])/r.ftp
# make workout and put in database
rr = rrower(hrmax=r.max,hrut2=r.ut2,
hrut1=r.ut1,hrat=r.at,
hrtr=r.tr,hran=r.an,ftp=r.ftp,
powerperc=powerperc,powerzones=r.powerzones)
row = rdata(f2,rower=rr)
isbreakthrough = False
if workouttype == 'water':
delta,cpvalues,avgpower = getsinglecp(row.df)
if utils.isbreakthrough(delta,cpvalues,r.p0,r.p1,r.p2,r.p3):
isbreakthrough = True
dtavg = row.df['TimeStamp (sec)'].diff().mean()
if dtavg < 1:
newdf = df_resample(row.df)
try:
os.remove(f2)
except:
pass
return new_workout_from_df(r,newdf,
title=title)
checks = row.check_consistency()
allchecks = 1
for key,value in checks.iteritems():
if not value:
allchecks = 0
if consistencychecks:
a_messages.error(r.user,'Failed consistency check: '+key+', autocorrected')
else:
a_messages.error(r.user,'Failed consistency check: '+key+', not corrected')
if not allchecks and consistencychecks:
# row.repair()
pass
if row == 0:
return (0,'Error: CSV data file not found')
if dosmooth:
# auto smoothing
pace = row.df[' Stroke500mPace (sec/500m)'].values
velo = 500./pace
f = row.df['TimeStamp (sec)'].diff().mean()
if f !=0 and not np.isnan(f):
windowsize = 2*(int(10./(f)))+1
else:
windowsize = 1
if not 'originalvelo' in row.df:
row.df['originalvelo'] = velo
if windowsize > 3 and windowsize<len(velo):
velo2 = savgol_filter(velo,windowsize,3)
else:
velo2 = velo
velo3 = pd.Series(velo2)
velo3 = velo3.replace([-np.inf,np.inf],np.nan)
velo3 = velo3.fillna(method='ffill')
pace2 = 500./abs(velo3)
row.df[' Stroke500mPace (sec/500m)'] = pace2
row.df = row.df.fillna(0)
row.write_csv(f2,gzip=True)
try:
os.remove(f2)
except:
pass
# recalculate power data
if workouttype == 'rower' or workouttype == 'dynamic' or workouttype == 'slides':
try:
row.erg_recalculatepower()
row.write_csv(f2,gzip=True)
except:
pass
averagehr = row.df[' HRCur (bpm)'].mean()
maxhr = row.df[' HRCur (bpm)'].max()
if totaldist == 0:
totaldist = row.df['cum_dist'].max()
if totaltime == 0:
totaltime = row.df['TimeStamp (sec)'].max()-row.df['TimeStamp (sec)'].min()
try:
totaltime = totaltime+row.df.ix[0,' ElapsedTime (sec)']
except KeyError:
pass
if np.isnan(totaltime):
totaltime = 0
hours = int(totaltime/3600.)
if hours>23:
message = 'Warning: The workout duration was longer than 23 hours. '
hours = 23
minutes = int((totaltime - 3600.*hours)/60.)
if minutes>59:
minutes = 59
if not message:
message = 'Warning: there is something wrong with the workout duration'
seconds = int(totaltime - 3600.*hours - 60.*minutes)
if seconds > 59:
seconds = 59
if not message:
message = 'Warning: there is something wrong with the workout duration'
tenths = int(10*(totaltime - 3600.*hours - 60.*minutes - seconds))
if tenths > 9:
tenths = 9
if not message:
message = 'Warning: there is something wrong with the workout duration'
duration = "%s:%s:%s.%s" % (hours,minutes,seconds,tenths)
if dosummary:
summary = row.allstats()
#summary = row.summary()
#summary += '\n'
#summary += row.intervalstats()
workoutdate = row.rowdatetime.strftime('%Y-%m-%d')
workoutstarttime = row.rowdatetime.strftime('%H:%M:%S')
workoutstartdatetime = thetimezone.localize(row.rowdatetime).astimezone(utc)
if makeprivate:
privacy = 'private'
else:
privacy = 'visible'
# check for duplicate start times
ws = Workout.objects.filter(startdatetime=workoutstartdatetime,
user=r)
if (len(ws) != 0):
message = "Warning: This workout probably already exists in the database"
privacy = 'private'
# checking for inf values
totaldist = np.nan_to_num(totaldist)
maxhr = np.nan_to_num(maxhr)
averagehr = np.nan_to_num(averagehr)
w = Workout(user=r,name=title,date=workoutdate,
workouttype=workouttype,
duration=duration,distance=totaldist,
weightcategory=r.weightcategory,
starttime=workoutstarttime,
workoutsource=workoutsource,
csvfilename=f2,notes=notes,summary=summary,
maxhr=maxhr,averagehr=averagehr,
startdatetime=workoutstartdatetime,
inboard=inboard,oarlength=oarlength,
privacy=privacy)
w.save()
# submit email task to send email about breakthrough workout
if isbreakthrough:
if settings.DEBUG:
res = handle_sendemail_breakthrough(w.id,r.user.email,
r.user.first_name,
r.user.last_name)
else:
res = queuehigh.enqueue(
handle_sendemail_breakthrough(w.id,
r.user.email,
r.user.first_name,
r.user.last_name))
if privacy == 'visible':
ts = Team.objects.filter(rower=r)
for t in ts:
w.team.add(t)
# put stroke data in database
res = dataprep(row.df,id=w.id,bands=True,
barchart=True,otwpower=True,empower=True,inboard=inboard)
return (w.id,message)
def handle_nonpainsled(f2,fileformat,summary=''):
oarlength = 2.89
inboard = 0.88
# handle RowPro:
if (fileformat == 'rp'):
row = RowProParser(f2)
# handle TCX
if (fileformat == 'tcx'):
row = TCXParser(f2)
# handle Mystery
if (fileformat == 'mystery'):
row = MysteryParser(f2)
# handle TCX no HR
if (fileformat == 'tcxnohr'):
row = TCXParserNoHR(f2)
# handle RowPerfect
if (fileformat == 'rowperfect3'):
row = RowPerfectParser(f2)
# handle ErgData
if (fileformat == 'ergdata'):
row = ErgDataParser(f2)
# handle Mike
if (fileformat == 'bcmike'):
row = BoatCoachAdvancedParser(f2)
# handle BoatCoach
if (fileformat == 'boatcoach'):
row = BoatCoachParser(f2)
# handle painsled desktop
if (fileformat == 'painsleddesktop'):
row = painsledDesktopParser(f2)
# handle speed coach GPS
if (fileformat == 'speedcoach'):
row = speedcoachParser(f2)
# handle speed coach GPS 2
if (fileformat == 'speedcoach2'):
row = SpeedCoach2Parser(f2)
try:
oarlength,inboard = get_empower_rigging(f2)
summary = row.allstats()
except:
pass
# handle ErgStick
if (fileformat == 'ergstick'):
row = ErgStickParser(f2)
# handle FIT
if (fileformat == 'fit'):
row = FITParser(f2)
try:
s = fitsummarydata(f2)
s.setsummary()
summary = s.summarytext
except:
pass
f_to_be_deleted = f2
# should delete file
f2 = f2[:-4]+'o.csv'
row.write_csv(f2,gzip=True)
#os.remove(f2)
try:
os.remove(f_to_be_deleted)
except:
os.remove(f_to_be_deleted+'.gz')
return (f2,summary,oarlength,inboard)
# Create new workout from file and store it in the database
# This routine should be used everywhere in views.py and mailprocessing.py
# Currently there is code duplication
def new_workout_from_file(r,f2,
workouttype='rower',
title='Workout',
makeprivate=False,
notes=''):
message = None
fileformat = get_file_type(f2)
summary = ''
oarlength = 2.89
inboard = 0.88
if len(fileformat)==3 and fileformat[0]=='zip':
f_to_be_deleted = f2
title = os.path.basename(f2)
if settings.DEBUG:
res = handle_zip_file.delay(
r.user.email,title,f2
)
else:
res = queuelow.enqueue(
handle_zip_file,
r.user.email,
title,
f2
)
return -1,message,f2
# Some people try to upload Concept2 logbook summaries
if fileformat == 'c2log':
os.remove(f2)
message = "This C2 logbook summary does not contain stroke data. Please download the Export Stroke Data file from the workout details on the C2 logbook."
return (0,message,f2)
if fileformat == 'nostrokes':
os.remove(f2)
message = "It looks like this file doesn't contain stroke data."
return (0,message,f2)
# Some people try to upload RowPro summary logs
if fileformat == 'rowprolog':
os.remove(f2)
message = "This RowPro logbook summary does not contain stroke data. Please use the Stroke Data CSV file for the individual workout in your log."
return (0,message,f2)
# Sometimes people try an unsupported file type.
# Send an email to info@rowsandall.com with the file attached
# for me to check if it is a bug, or a new file type
# worth supporting
if fileformat == 'unknown':
message = "We couldn't recognize the file type"
if settings.DEBUG:
res = handle_sendemail_unrecognized.delay(f2,
r.user.email)
else:
res = queuehigh.enqueue(handle_sendemail_unrecognized,
f2,r.user.email)
return (0,message,f2)
# handle non-Painsled by converting it to painsled compatible CSV
if (fileformat != 'csv'):
try:
f2,summary,oarlength,inboard = handle_nonpainsled(f2,
fileformat,
summary=summary)
except:
errorstring = str(sys.exc_info()[0])
message = 'Something went wrong: '+errorstring
return (0,message,'')
dosummary = (fileformat != 'fit')
id,message = save_workout_database(f2,r,
workouttype=workouttype,
makeprivate=makeprivate,
dosummary=dosummary,
workoutsource=fileformat,
summary=summary,
inboard=inboard,oarlength=oarlength,
title=title)
return (id,message,f2)
# Create new workout from data frame and store it in the database
# This routine should be used everywhere in views.py and mailprocessing.py
# Currently there is code duplication
def new_workout_from_df(r,df,
title='New Workout',
parent=None):
message = None
summary = ''
if parent:
oarlength = parent.oarlength
inboard = parent.inboard
workouttype = parent.workouttype
notes=parent.notes
summary=parent.summary
makeprivate=parent.privacy
startdatetime=parent.startdatetime
else:
oarlength = 2.89
inboard = 0.88
workouttype = 'rower'
notes=''
summary=''
makeprivate=False
startdatetime = timezone.now()
timestr = strftime("%Y%m%d-%H%M%S")
csvfilename ='media/Fusion_'+timestr+'.csv'
df.rename(columns = columndict,inplace=True)
starttimeunix = mktime(startdatetime.utctimetuple())
df[' ElapsedTime (sec)'] = df['TimeStamp (sec)']
df['TimeStamp (sec)'] = df['TimeStamp (sec)']+starttimeunix
row = rrdata(df=df)
row.write_csv(csvfilename,gzip=True)
#res = df.to_csv(csvfilename+'.gz',index_label='index',
# compression='gzip')
id,message = save_workout_database(csvfilename,r,
workouttype=workouttype,
title=title,
notes=notes,
oarlength=oarlength,
inboard=inboard,
makeprivate=makeprivate,
dosmooth=False,
consistencychecks=False)
return (id,message)
# Compare the data from the CSV file and the database
# Currently only calculates number of strokes. To be expanded with
# more elaborate testing if needed
def compare_data(id):
row = Workout.objects.get(id=id)
f1 = row.csvfilename
try:
rowdata = rdata(f1)
l1 = len(rowdata.df)
except AttributeError:
rowdata = 0
l1 = 0
engine = create_engine(database_url, echo=False)
query = sa.text('SELECT COUNT(*) FROM strokedata WHERE workoutid={id};'.format(
id=id,
))
with engine.connect() as conn, conn.begin():
try:
res = conn.execute(query)
l2 = res.fetchall()[0][0]
except:
print "Database Locked"
conn.close()
engine.dispose()
lfile = l1
ldb = l2
return l1==l2 and l1 != 0,ldb,lfile
# Repair data for workouts where the CSV file is lost (or the DB entries
# don't exist)
def repair_data(verbose=False):
ws = Workout.objects.all()
for w in ws:
if verbose:
sys.stdout.write(".")
test,ldb,lfile = compare_data(w.id)
if not test:
if verbose:
print w.id,lfile,ldb
try:
rowdata = rdata(w.csvfilename)
if rowdata and len(rowdata.df):
update_strokedata(w.id,rowdata.df)
except IOError, AttributeError:
pass
if lfile==0:
# if not ldb - delete workout
try:
data = read_df_sql(w.id)
try:
datalength = len(data)
except AttributeError:
datalength = 0
if datalength != 0:
data.rename(columns = columndict,inplace=True)
res = data.to_csv(w.csvfilename+'.gz',
index_label='index',
compression='gzip')
print 'adding csv file'
else:
print w.id,' No stroke records anywhere'
w.delete()
except:
print 'failed'
print str(sys.exc_info()[0])
pass
# A wrapper around the rowingdata class, with some error catching
def rdata(file,rower=rrower()):
try:
res = rrdata(csvfile=file,rower=rower)
except IOError,IndexError:
try:
res = rrdata(csvfile=file+'.gz',rower=rower)
except IOError,IndexError:
res = 0
except:
res = 0
return res
# Remove all stroke data for workout ID from database
def delete_strokedata(id):
engine = create_engine(database_url, echo=False)
query = sa.text('DELETE FROM strokedata WHERE workoutid={id};'.format(
id=id,
))
with engine.connect() as conn, conn.begin():
try:
result = conn.execute(query)
except:
print "Database Locked"
conn.close()
engine.dispose()
# Replace stroke data in DB with data from CSV file
def update_strokedata(id,df):
delete_strokedata(id)
rowdata = dataprep(df,id=id,bands=True,barchart=True,otwpower=True)
# Test that all data are of a numerical time
def testdata(time,distance,pace,spm):
t1 = np.issubdtype(time,np.number)
t2 = np.issubdtype(distance,np.number)
t3 = np.issubdtype(pace,np.number)
t4 = np.issubdtype(spm,np.number)
return t1 and t2 and t3 and t4
# Get data from DB for one workout (fetches all data). If data
# is not in DB, read from CSV file (and create DB entry)
def getrowdata_db(id=0,doclean=False):
data = read_df_sql(id)
data['x_right'] = data['x_right']/1.0e6
if data.empty:
rowdata,row = getrowdata(id=id)
if rowdata:
data = dataprep(rowdata.df,id=id,bands=True,barchart=True,otwpower=True)
else:
data = pd.DataFrame() # returning empty dataframe
else:
row = Workout.objects.get(id=id)
if doclean:
data = clean_df_stats(data,ignorehr=True)
# these two lines seem redundant ??
#data['averageforce'] = data['averageforce']
#data['peakforce'] = data['peakforce']
return data,row
# Fetch a subset of the data from the DB
def getsmallrowdata_db(columns,ids=[],doclean=True,workstrokesonly=True,
convertnewtons=False):
prepmultipledata(ids)
data = read_cols_df_sql(ids,columns)
if convertnewtons:
if 'peakforce' in columns:
data['peakforce'] = data['peakforce']*lbstoN
if 'averageforce' in columns:
data['averageforce'] = data['averageforce']*lbstoN
if doclean:
data = clean_df_stats(data,ignorehr=True,
workstrokesonly=workstrokesonly)
return data
# Fetch both the workout and the workout stroke data (from CSV file)
def getrowdata(id=0):
# check if valid ID exists (workout exists)
row = Workout.objects.get(id=id)
f1 = row.csvfilename
# get user
r = row.user
u = r.user
rr = rrower(hrmax=r.max,hrut2=r.ut2,
hrut1=r.ut1,hrat=r.at,
hrtr=r.tr,hran=r.an,ftp=r.ftp)
rowdata = rdata(f1,rower=rr)
return rowdata,row
# Checks if all rows for a list of workout IDs have entries in the
# stroke_data table. If this is not the case, it creates the stroke
# data
# In theory, this should never yield any work, but it's a good
# safety net for programming errors elsewhere in the app
# Also used heavily when I moved from CSV file only to CSV+Stroke data
def prepmultipledata(ids,verbose=False):
query = sa.text('SELECT DISTINCT workoutid FROM strokedata')
engine = create_engine(database_url, echo=False)
with engine.connect() as conn, conn.begin():
res = conn.execute(query)
res = list(itertools.chain.from_iterable(res.fetchall()))
conn.close()
engine.dispose()
try:
ids2 = [int(id) for id in ids]
except ValueError:
ids2 = ids
res = list(set(ids2)-set(res))
for id in res:
rowdata,row = getrowdata(id=id)
if verbose:
print id
if rowdata and len(rowdata.df):
data = dataprep(rowdata.df,id=id,bands=True,barchart=True,otwpower=True)
return res
# Read a set of columns for a set of workout ids, returns data as a
# pandas dataframe
def read_cols_df_sql(ids,columns):
# drop columns that are not in offical list
# axx = [ax[0] for ax in axes]
axx = [f.name for f in StrokeData._meta.get_fields()]
for c in columns:
if not c in axx:
columns.remove(c)
columns = list(columns)+['distance','spm']
columns = [x for x in columns if x != 'None']
columns = list(set(columns))
cls = ''
engine = create_engine(database_url, echo=False)
for column in columns:
cls += column+', '
cls = cls[:-2]
if len(ids) == 0:
query = sa.text('SELECT {columns} FROM strokedata WHERE workoutid=0'.format(
columns = cls,
))
elif len(ids) == 1:
query = sa.text('SELECT {columns} FROM strokedata WHERE workoutid={id}'.format(
id = ids[0],
columns = cls,
))
else:
query = sa.text('SELECT {columns} FROM strokedata WHERE workoutid IN {ids}'.format(
columns = cls,
ids = tuple(ids),
))
connection = engine.raw_connection()
df = pd.read_sql_query(query,engine)
df = df.fillna(value=0)
try:
df['peakforce'] = df['peakforce']*lbstoN
except KeyError:
pass
try:
df['averageforce'] = df['averageforce']*lbstoN
except KeyError:
pass
engine.dispose()
return df
# Read stroke data from the DB for a Workout ID. Returns a pandas dataframe
def read_df_sql(id):
engine = create_engine(database_url, echo=False)
df = pd.read_sql_query(sa.text('SELECT * FROM strokedata WHERE workoutid={id}'.format(
id=id)), engine)
engine.dispose()
df = df.fillna(value=0)
try:
df['peakforce'] = df['peakforce']*lbstoN
except KeyError:
pass
try:
df['averageforce'] = df['averageforce']*lbstoN
except KeyError:
pass
return df
# Get the necessary data from the strokedata table in the DB.
# For the flex plot
def smalldataprep(therows,xparam,yparam1,yparam2):
df = pd.DataFrame()
if yparam2 == 'None':
yparam2 = 'power'
df[xparam] = []
df[yparam1] = []
df[yparam2] = []
df['distance'] = []
df['spm'] = []
for workout in therows:
f1 = workout.csvfilename
try:
rowdata = dataprep(rrdata(f1).df)
rowdata = pd.DataFrame({xparam: rowdata[xparam],
yparam1: rowdata[yparam1],
yparam2: rowdata[yparam2],
'distance': rowdata['distance'],
'spm': rowdata['spm'],
}
)
df = pd.concat([df,rowdata],ignore_index=True)
except IOError:
try:
rowdata = dataprep(rrdata(f1+'.gz').df)
rowdata = pd.DataFrame({xparam: rowdata[xparam],
yparam1: rowdata[yparam1],
yparam2: rowdata[yparam2],
'distance': rowdata['distance'],
'spm': rowdata['spm'],
}
)
df = pd.concat([df,rowdata],ignore_index=True)
except IOError:
pass
try:
df['peakforce'] = df['peakforce']*lbstoN
except KeyError:
pass
try:
df['averageforce'] = df['averageforce']*lbstoN
except KeyError:
pass
return df
# data fusion
def datafusion(id1,id2,columns,offset):
df1,w1 = getrowdata_db(id=id1)
df1 = df1.drop([#'cumdist',
'hr_ut2',
'hr_ut1',
'hr_at',
'hr_tr',
'hr_an',
'hr_max',
'ftime',
'fpace',
'workoutid',
'id'],
1,errors='ignore')
# Add coordinates to DataFrame
latitude,longitude = get_latlon(id1)
df1[' latitude'] = latitude
df1[' longitude'] = longitude
df2 = getsmallrowdata_db(['time']+columns,ids=[id2],doclean=False)
offsetmillisecs = offset.seconds*1000+offset.microseconds/1000.
offsetmillisecs += offset.days*(3600*24*1000)
df2['time'] = df2['time']+offsetmillisecs
keep1 = {c:c for c in set(df1.columns)}
for c in columns:
keep1.pop(c)
for c in df1.columns:
if not c in keep1:
df1 = df1.drop(c,1,errors='ignore')
df = pd.concat([df1,df2],ignore_index=True)
df = df.sort_values(['time'])
df = df.interpolate(method='linear',axis=0,limit_direction='both',
limit=10)
df.fillna(method='bfill',inplace=True)
# Some new stuff to try out
df = df.groupby('time',axis=0).mean()
df['time'] = df.index
df.reset_index(drop=True,inplace=True)
df['time'] = df['time']/1000.
df['pace'] = df['pace']/1000.
df['cum_dist'] = df['cumdist']
return df
# This is the main routine.
# it reindexes, sorts, filters, and smooths the data, then
# saves it to the stroke_data table in the database
# Takes a rowingdata object's DataFrame as input
def dataprep(rowdatadf,id=0,bands=True,barchart=True,otwpower=True,
empower=True,inboard=0.88):
if rowdatadf.empty:
return 0
rowdatadf.set_index([range(len(rowdatadf))],inplace=True)
t = rowdatadf.ix[:,'TimeStamp (sec)']
t = pd.Series(t-rowdatadf.ix[0,'TimeStamp (sec)'])
row_index = rowdatadf.ix[:,' Stroke500mPace (sec/500m)'] > 3000
rowdatadf.loc[row_index,' Stroke500mPace (sec/500m)'] = 3000.
p = rowdatadf.ix[:,' Stroke500mPace (sec/500m)']
hr = rowdatadf.ix[:,' HRCur (bpm)']
spm = rowdatadf.ix[:,' Cadence (stokes/min)']
cumdist = rowdatadf.ix[:,'cum_dist']
power = rowdatadf.ix[:,' Power (watts)']
averageforce = rowdatadf.ix[:,' AverageDriveForce (lbs)']
drivelength = rowdatadf.ix[:,' DriveLength (meters)']
try:
workoutstate = rowdatadf.ix[:,' WorkoutState']
except KeyError:
workoutstate = 0*hr
peakforce = rowdatadf.ix[:,' PeakDriveForce (lbs)']
forceratio = averageforce/peakforce
forceratio = forceratio.fillna(value=0)
try:
drivetime = rowdatadf.ix[:,' DriveTime (ms)']
recoverytime = rowdatadf.ix[:,' StrokeRecoveryTime (ms)']
rhythm = 100.*drivetime/(recoverytime+drivetime)
rhythm = rhythm.fillna(value=0)
except:
rhythm = 0.0*forceratio
f = rowdatadf['TimeStamp (sec)'].diff().mean()
if f != 0:
windowsize = 2*(int(10./(f)))+1
else:
windowsize = 1
if windowsize <= 3:
windowsize = 5
if windowsize > 3 and windowsize<len(hr):
spm = savgol_filter(spm,windowsize,3)
hr = savgol_filter(hr,windowsize,3)
drivelength = savgol_filter(drivelength,windowsize,3)
forceratio = savgol_filter(forceratio,windowsize,3)
try:
t2 = t.fillna(method='ffill').apply(lambda x: timedeltaconv(x))
except TypeError:
t2 = 0*t
p2 = p.fillna(method='ffill').apply(lambda x: timedeltaconv(x))
try:
drivespeed = drivelength/rowdatadf[' DriveTime (ms)']*1.0e3
except TypeError:
drivespeed = 0.0*rowdatadf['TimeStamp (sec)']
drivespeed = drivespeed.fillna(value=0)
driveenergy = drivelength*averageforce*lbstoN
distance = rowdatadf.ix[:,'cum_dist']
data = DataFrame(
dict(
time = t*1e3,
hr = hr,
pace = p*1e3,
spm = spm,
cumdist = cumdist,
ftime = niceformat(t2),
fpace = nicepaceformat(p2),
driveenergy=driveenergy,
power=power,
workoutstate=workoutstate,
averageforce=averageforce,
drivelength=drivelength,
peakforce=peakforce,
forceratio=forceratio,
distance=distance,
drivespeed=drivespeed,
rhythm=rhythm,
)
)
if bands:
# HR bands
data['hr_ut2'] = rowdatadf.ix[:,'hr_ut2']
data['hr_ut1'] = rowdatadf.ix[:,'hr_ut1']
data['hr_at'] = rowdatadf.ix[:,'hr_at']
data['hr_tr'] = rowdatadf.ix[:,'hr_tr']
data['hr_an'] = rowdatadf.ix[:,'hr_an']
data['hr_max'] = rowdatadf.ix[:,'hr_max']
data['hr_bottom'] = 0.0*data['hr']
if barchart:
# time increments for bar chart
time_increments = rowdatadf.ix[:,' ElapsedTime (sec)'].diff()
time_increments[0] = time_increments[1]
time_increments = 0.5*time_increments+0.5*np.abs(time_increments)
x_right = (t2+time_increments.apply(lambda x:timedeltaconv(x)))
data['x_right'] = x_right
if empower:
try:
wash = rowdatadf.ix[:,'wash']
except KeyError:
wash = 0*power
try:
catch = rowdatadf.ix[:,'catch']
except KeyError:
catch = 0*power
try:
finish = rowdatadf.ix[:,'finish']
except KeyError:
finish = 0*power
try:
peakforceangle = rowdatadf.ix[:,'peakforceangle']
except KeyError:
peakforceangle = 0*power
if data['driveenergy'].mean() == 0:
try:
driveenergy = rowdatadf.ix[:,'driveenergy']
except KeyError:
driveenergy = 0*power
else:
driveenergy = data['driveenergy']
arclength = (inboard-0.05)*(np.radians(finish)-np.radians(catch))
if arclength.mean()>0:
drivelength = arclength
elif drivelength.mean() == 0:
drivelength = driveenergy/(averageforce*4.44822)
try:
slip = rowdatadf.ix[:,'slip']
except KeyError:
slip = 0*power
totalangle = finish-catch
effectiveangle = finish-wash-catch-slip
if windowsize > 3 and windowsize<len(slip):
wash = savgol_filter(wash,windowsize,3)
slip = savgol_filter(slip,windowsize,3)
catch = savgol_filter(catch,windowsize,3)
finish = savgol_filter(finish,windowsize,3)
peakforceangle = savgol_filter(peakforceangle,windowsize,3)
driveenergy = savgol_filter(driveenergy,windowsize,3)
drivelength = savgol_filter(drivelength,windowsize,3)
totalangle = savgol_filter(totalangle,windowsize,3)
effectiveangle = savgol_filter(effectiveangle,windowsize,3)
data['wash'] = wash
data['catch'] = catch
data['slip'] = slip
data['finish'] = finish
data['peakforceangle'] = peakforceangle
data['driveenergy'] = driveenergy
data['drivelength'] = drivelength
data['totalangle'] = totalangle
data['effectiveangle'] = effectiveangle
if otwpower:
try:
nowindpace = rowdatadf.ix[:,'nowindpace']
except KeyError:
nowindpace = p
try:
equivergpower = rowdatadf.ix[:,'equivergpower']
except KeyError:
equivergpower = 0*p+50.
nowindpace2 = nowindpace.apply(lambda x: timedeltaconv(x))
ergvelo = (equivergpower/2.8)**(1./3.)
ergpace = 500./ergvelo
ergpace[ergpace == np.inf] = 240.
ergpace2 = ergpace.apply(lambda x: timedeltaconv(x))
data['ergpace'] = ergpace*1e3
data['nowindpace'] = nowindpace*1e3
data['equivergpower'] = equivergpower
data['fergpace'] = nicepaceformat(ergpace2)
data['fnowindpace'] = nicepaceformat(nowindpace2)
data = data.replace([-np.inf,np.inf],np.nan)
data = data.fillna(method='ffill')
# write data if id given
if id != 0:
data['workoutid'] = id
engine = create_engine(database_url, echo=False)
with engine.connect() as conn, conn.begin():
data.to_sql('strokedata',engine,if_exists='append',index=False)
conn.close()
engine.dispose()
return data