rowsandall/rowers/dataprep.py

# All the data preparation, data cleaning and data mangling should
# be defined here
from rowers.models import Workout, User, Rower,StrokeData
from rowingdata import rowingdata as rrdata

from rowers.tasks import handle_sendemail_unrecognized
from rowers.tasks import handle_zip_file

from rowingdata import rower as rrower
from rowingdata import main as rmain

from rowingdata import get_file_type,get_empower_rigging

from pandas import DataFrame,Series
from pytz import timezone as tz,utc
from django.utils import timezone
from time import strftime,strptime,mktime,time,daylight
from django.utils.timezone import get_current_timezone
# Django's current time zone, looked up once when this module is imported
thetimezone = get_current_timezone()
from rowingdata import (
    TCXParser,RowProParser,ErgDataParser,TCXParserNoHR,
    BoatCoachParser,RowPerfectParser,BoatCoachAdvancedParser,
    MysteryParser,
    painsledDesktopParser,speedcoachParser,ErgStickParser,
    SpeedCoach2Parser,FITParser,fitsummarydata,
    make_cumvalues,
    summarydata,get_file_type,
    )

from rowers.models import Team
from rowers.metrics import axes
from async_messages import messages as a_messages
import os
import zipfile
import pandas as pd
import numpy as np
import itertools
import math
from tasks import handle_sendemail_unrecognized

from django.conf import settings
from sqlalchemy import create_engine
import sqlalchemy as sa
import sys

import utils
from utils import lbstoN
from scipy.interpolate import griddata

import django_rq
# django_rq queues used by this module to hand work to background workers
queue = django_rq.get_queue('default')
queuelow = django_rq.get_queue('low')
# NOTE(review): queuehigh is bound to the 'default' queue, not to a
# separate high priority queue - confirm that this is intended
queuehigh = django_rq.get_queue('default')


# Connection settings of Django's default database. They are combined into
# a SQLAlchemy URL that the functions in this module use to query the
# strokedata table directly
user = settings.DATABASES['default']['USER']
password = settings.DATABASES['default']['PASSWORD']
database_name = settings.DATABASES['default']['NAME']
host = settings.DATABASES['default']['HOST']
port = settings.DATABASES['default']['PORT']

# NOTE(review): user and password are inserted verbatim; characters such
# as '@' or '/' in them would presumably need URL escaping - confirm
database_url = 'mysql://{user}:{password}@{host}:{port}/{database_name}'.format(
    user=user,
    password=password,
    database_name=database_name,
    host=host,
    port=port,
    )

# Use SQLite local database when we're in debug mode
# (or when no database user is configured)
if settings.DEBUG or user=='':
    # database_url = 'sqlite:///db.sqlite3'
    database_url = 'sqlite:///'+database_name


# mapping the DB column names to the CSV file column names
# Used with DataFrame.rename(columns=columndict) to turn a frame with DB
# column names into one with CSV column names.
# The leading spaces and the spelling 'stokes/min' are part of the keys
# looked up at runtime - presumably they match the CSV headers exactly,
# so do not "correct" them without checking the CSV writer.
columndict = {
    'time':'TimeStamp (sec)',
    'hr':' HRCur (bpm)',
    'pace':' Stroke500mPace (sec/500m)',
    'spm':' Cadence (stokes/min)',
    'power':' Power (watts)',
    'averageforce':' AverageDriveForce (lbs)',
    'drivelength':' DriveLength (meters)',
    'peakforce':' PeakDriveForce (lbs)',
    'distance':' Horizontal (meters)',
    'catch':'catch',
    'finish':'finish',
    'peakforceangle':'peakforceangle',
    'wash':'wash',
    # NOTE(review): 'slip' maps to 'wash', the same target as 'wash' above;
    # renaming a frame that has both columns yields two 'wash' columns.
    # Looks like a copy/paste slip ('slip':'slip'?) - confirm
    'slip':'wash',
    'workoutstate':' WorkoutState',
    }

from scipy.signal import savgol_filter

import datetime

def get_latlon(id):
    """Return the latitude and longitude columns of a workout's CSV file.

    The workout is looked up by ``id`` and its CSV file is loaded through
    ``rdata``.

    Returns ``False`` when no workout with this id exists, otherwise the
    list ``[latitude, longitude]``. When the file has no ' latitude' or
    ' longitude' column, both entries are all-zero series derived from the
    'TimeStamp (sec)' column.
    """
    try:
        w = Workout.objects.get(id=id)
    except Workout.DoesNotExist:
        return False

    rowdata = rdata(w.csvfilename)
    try:
        latitude = rowdata.df.loc[:,' latitude']
        longitude = rowdata.df.loc[:,' longitude']
    except KeyError:
        # No position columns in this file: hand back zeros for both
        # coordinates (the original assigned latitude twice, which left
        # longitude undefined and made the return statement fail)
        latitude = 0*rowdata.df.loc[:,'TimeStamp (sec)']
        longitude = 0*rowdata.df.loc[:,'TimeStamp (sec)']

    return [latitude,longitude]

def get_workouts(ids,userid):
    """Return the workouts from ``ids`` that belong to user ``userid``.

    Each id is fetched once with ``Workout.objects.get`` (so an unknown id
    raises ``Workout.DoesNotExist``); workouts whose ``user.user.id`` does
    not equal ``userid`` are left out. The order of ``ids`` is preserved.
    """
    owned = []
    for workout_id in ids:
        w = Workout.objects.get(id=workout_id)
        if int(w.user.user.id) == int(userid):
            # keep the object we already have instead of collecting the id
            # and querying the same row a second time
            owned.append(w)

    return owned

def filter_df(datadf,fieldname,value,largerthan=True):
    """Replace values of one column that fail a threshold test by NaN.

    With ``largerthan`` true, entries smaller than ``value`` are blanked;
    otherwise entries larger than or equal to ``value`` are blanked.
    The frame is modified in place and also returned. A frame without the
    column ``fieldname`` is returned unchanged.
    """
    try:
        column = datadf[fieldname]
    except KeyError:
        return datadf

    rejected = (column < value) if largerthan else (column >= value)
    datadf.loc[rejected,fieldname] = np.nan

    return datadf

def getsinglecp(df):
    """Run ``getcp`` on the stroke data of a single workout.

    Builds a list of up to 50 logarithmically spaced integer durations
    (duplicates removed) reaching up to twice the time span of ``df`` (or
    1000 when the span is zero), packs time and power into a one-group
    frame with workout id 0 and returns ``getcp``'s result
    ``(delta, cpvalue, avgpower)``.
    """
    stamps = df['TimeStamp (sec)']
    span = stamps.max()-stamps.min()
    maxt = 2*span if span != 0 else 1000.

    # log-spaced durations, truncated to integers, repeats dropped
    exponents = np.arange(50)*np.log10(maxt)/50.
    durations = pd.Series([int(10.**(e)) for e in exponents])
    durations.drop_duplicates(keep='first',inplace=True)
    logarr = durations.values

    # time is counted from the row labelled 0
    single = pd.DataFrame({
        'time':stamps-df.ix[0,'TimeStamp (sec)'],
        'power':df[' Power (watts)']
    })
    single['workoutid'] = 0

    delta,cpvalue,avgpower = getcp(single.groupby(['workoutid']),logarr)

    return delta,cpvalue,avgpower

def getcp(dfgrouped,logarr):
    """Collect maximum-average-power values per duration for each group.

    ``dfgrouped`` yields ``(key, frame)`` pairs whose frames have 'time'
    and 'power' columns; ``logarr`` holds the durations at which the
    result is sampled.

    Returns ``(delta, cpvalue, avgpower)``: two series named 'Delta' and
    'CP' with the sampled durations and the interpolated power values of
    all groups concatenated, and a dict mapping each group key to its
    integer mean power (or '---' when that cannot be computed).
    """
    delta = []
    cpvalue = []
    avgpower = {}

    for key,group in dfgrouped:
        times = group['time'].copy()
        watts = group['power'].copy()

        tmax = times.max()
        newlen = int(tmax/5000.)
        if newlen < len(times):
            # more data points than target samples: interpolate the power
            # onto an evenly spaced time grid of newlen points
            newt = np.arange(newlen)*tmax/float(newlen)
            resampled = griddata(times.values,
                                 watts.values,
                                 newt,method='linear',
                                 rescale=True)

            times = pd.Series(newt)
            watts = pd.Series(resampled)

        try:
            avgpower[key] = int(watts.mean())
        except ValueError:
            avgpower[key] = '---'

        if np.isnan(watts.mean()):
            continue

        dt = []
        cpw = []
        for i in range(len(watts)-2):
            deltat,wmax = getmaxwattinterval(times,watts,i)
            if np.isnan(deltat) or np.isnan(wmax):
                continue
            dt.append(deltat)
            cpw.append(wmax)

        dt = pd.Series(dt)
        cpw = pd.Series(cpw)
        if not len(dt):
            continue

        # sample the (duration, max power) pairs at the requested durations
        cpvalues = griddata(dt.values,
                            cpw.values,
                            logarr,method='linear',
                            rescale=True)

        cpvalue.extend(cpvalues)
        delta.extend(logarr)

    delta = pd.Series(delta,name='Delta')
    cpvalue = pd.Series(cpvalue,name='CP')
    return delta,cpvalue,avgpower

def getmaxwattinterval(tt,ww,i):
    """Find the window of ``i+2`` samples with the highest mean power.

    ``tt`` and ``ww`` are time and power series sharing integer index
    labels.

    Returns ``(deltat, wmax)``: ``wmax`` is the highest rolling mean of
    ``ww`` and ``deltat`` is 1e-3 times the time difference between the
    sample where that window ends and the sample ``i`` labels before it.
    Returns ``(0, 0)`` when the series is too short for one full window,
    and ``(nan, nan)`` when a needed label is missing from ``tt``, so that
    callers filtering on NaN skip the window.
    """
    w_roll = ww.rolling(i+2).mean().dropna()
    if not len(w_roll):
        return 0,0

    # now goes with # data points - should be fixed seconds
    indexmax = w_roll.idxmax()
    try:
        t_0 = tt.loc[indexmax]
        t_1 = tt.loc[indexmax-i]
        deltat = 1.0e-3*(t_0-t_1)
        wmax = w_roll.loc[indexmax]
    except KeyError:
        # the original fell through here with both names unbound and
        # crashed on the return statement
        deltat = np.nan
        wmax = np.nan

    return deltat,wmax

def df_resample(datadf):
    """Average all rows that fall into the same whole second.

    The 'TimeStamp (sec)' column is truncated to integers and stored in a
    new 'timestamps' column of ``datadf`` (the input frame is modified);
    the returned frame holds the column means per 'timestamps' value and
    is indexed by it.
    """
    # time stamps must be in seconds
    datadf['timestamps'] = datadf['TimeStamp (sec)'].astype('int')
    return datadf.groupby(['timestamps']).mean()


def clean_df_stats(datadf,workstrokesonly=True,ignorehr=True,
                   ignoreadvanced=False):
    """Return a cleaned copy of stroke data for statistics.

    Negative values are clipped to zero and zeros become NaN; afterwards
    values outside a plausible range are set to NaN column by column.
    Columns that are not present are skipped.

    Parameters:
      datadf          -- frame with DB column names ('hr', 'spm', ...).
                         It is not modified.
      workstrokesonly -- True (or the string 'True') drops the rows whose
                         'workoutstate' marks a rest stroke
      ignorehr        -- when false, heart rates below 30 are blanked
      ignoreadvanced  -- when false, the range checks on rhythm, power,
                         drive length, force ratio, drive speed, drive
                         energy and catch angle are applied as well

    Returns the cleaned frame.
    """
    # Work on a copy: the domain shifts below used to be written into the
    # caller's frame, which was left with negated catch angles and offset
    # heart rates / peak force angles
    datadf = datadf.copy()

    # bring metrics which have zero or negative values to the positive
    # domain so that the zero/negative removal does not wipe them
    if 'catch' in datadf.columns:
        datadf['catch'] = -datadf['catch']
    if 'peakforceangle' in datadf.columns:
        datadf['peakforceangle'] = datadf['peakforceangle']+1000
    if 'hr' in datadf.columns:
        datadf['hr'] = datadf['hr']+10

    # remove zeros and negative values
    datadf = datadf.clip(lower=0)
    datadf = datadf.replace(to_replace=0,value=np.nan)

    # return from positive domain to the original one
    if 'catch' in datadf.columns:
        datadf['catch'] = -datadf['catch']
    if 'peakforceangle' in datadf.columns:
        datadf['peakforceangle'] = datadf['peakforceangle']-1000
    if 'hr' in datadf.columns:
        datadf['hr'] = datadf['hr']-10

    def _blank(column,is_bad):
        # set the entries of column flagged by is_bad(series) to NaN;
        # a missing column is silently skipped
        if column not in datadf.columns:
            return
        datadf.loc[is_bad(datadf[column]),column] = np.nan

    # clean data for useful ranges per column
    if not ignorehr:
        _blank('hr',lambda s: s < 30)

    _blank('spm',lambda s: (s < 10) | (s > 60))
    _blank('pace',lambda s: (s/1000. > 300.) | (s/1000. < 60.))
    _blank('wash',lambda s: s < 1)

    if not ignoreadvanced:
        _blank('rhythm',lambda s: (s < 5) | (s > 70))
        _blank('power',lambda s: s < 20)
        _blank('drivelength',lambda s: s < 0.5)
        _blank('forceratio',lambda s: (s < 0.2) | (s > 1.0))
        _blank('drivespeed',lambda s: (s < 0.5) | (s > 4))
        _blank('driveenergy',lambda s: (s > 2000) | (s < 100))
        _blank('catch',lambda s: s > -30.)

    # workout states: work = 1,4,5,8,9,6,7; rest = 3;
    # transition = 0,2,10,11,12,13. Only rest strokes are filtered out.
    workoutstatesrest = [3]

    if workstrokesonly=='True' or workstrokesonly==True:
        try:
            datadf = datadf[~datadf['workoutstate'].isin(workoutstatesrest)]
        except KeyError:
            # no workout state recorded: keep all strokes
            pass

    return datadf

def getstatsfields():
    """List the StrokeData fields that are useful in statistics.

    Returns ``(fieldlist, fielddict)``: the remaining field names and a
    dict mapping each of them to the field's verbose name. A ``KeyError``
    is raised when one of the excluded names is not a StrokeData field.
    """
    # Get field names and remove those that are not useful in stats
    fielddict = dict(
        (field.name,field.verbose_name)
        for field in StrokeData._meta.get_fields()
    )

    # 'workoutid' and 'workoutstate' are deliberately kept
    excluded = (
        'ergpace',
        'hr_an','hr_tr','hr_at','hr_ut2','hr_ut1',
        'time','distance',
        'nowindpace','fnowindpace','fergpace',
        'equivergpower',
        'fpace','pace',
        'id','ftime','x_right',
        'hr_max','hr_bottom',
        'cumdist',
    )
    for name in excluded:
        fielddict.pop(name)

    fieldlist = list(fielddict)

    return fieldlist,fielddict


# A string representation for time deltas
def niceformat(values):
    """Return the ``strfdelta`` text of every time delta in ``values``."""
    return [strfdelta(v) for v in values]

# A nice printable format for time delta values
def strfdelta(tdelta):
    """Format a time delta as ``MM:SS.t`` (minutes, seconds, tenths).

    ``tdelta`` is either an object with ``seconds`` and ``microseconds``
    attributes (``datetime.timedelta``; its ``days`` part is not taken
    into account) or a value without them that offers ``view``, e.g. a
    numpy ``timedelta64``, whose raw integer is read as nanoseconds.
    Minutes are not wrapped at 60.
    """
    try:
        minutes,seconds = divmod(tdelta.seconds,60)
        tenths = int(tdelta.microseconds/1e5)
    except AttributeError:
        # Integer arithmetic throughout: dividing by float constants made
        # minutes and seconds floats, which were printed as e.g. '2.0:5.0.5'
        nanos = int(tdelta.view(np.int64))
        minutes,rest = divmod(nanos,60*10**9)
        seconds,rest = divmod(rest,10**9)
        tenths = rest//10**8
    res = "{minutes:0>2}:{seconds:0>2}.{tenths:0>1}".format(
        minutes=minutes,
        seconds=seconds,
        tenths=tenths,
        )

    return res

# A nice printable format for pace values
def nicepaceformat(values):
    """Return the ``strfdelta`` text of every pace value in ``values``."""
    return [strfdelta(v) for v in values]

# Convert seconds to a Time Delta value, replacing NaN with a 5:50 pace
def timedeltaconv(x):
    """Turn a number of seconds into a ``datetime.timedelta``.

    Zero, NaN and infinite input are replaced by 350 seconds.
    """
    usable = np.isfinite(x) and x != 0
    seconds = x if usable else 350.
    return datetime.timedelta(seconds=seconds)

# Processes painsled CSV file to database
def save_workout_database(f2,r,dosmooth=True,workouttype='rower',
                          dosummary=True,title='Workout',
                          workoutsource='unknown',
                          notes='',totaldist=0,totaltime=0,
                          summary='',
                          makeprivate=False,
                          oarlength=2.89,inboard=0.88,
                          consistencychecks=True):
    """Create a Workout (and its stroke data) from a painsled style CSV file.

    Parameters:
      f2                -- path of the CSV file
      r                 -- the Rower who owns the workout
      dosmooth          -- smooth the pace column and rewrite the file
      workouttype       -- 'water' triggers the breakthrough check;
                           'rower', 'dynamic' and 'slides' trigger a
                           recalculation of erg power
      dosummary         -- compute the summary text from the data instead
                           of using ``summary``
      totaldist, totaltime -- taken from the data when 0
      makeprivate       -- store the workout as 'private'
      oarlength, inboard -- rigging stored with the workout
      consistencychecks -- only selects the wording of the message that is
                           shown when a consistency check fails

    Returns ``(workout id, message)``. ``message`` is None or a warning
    text; the id is 0 when the file could not be read. Data sampled
    faster than once per second is resampled and stored through
    ``new_workout_from_df`` instead, whose result is returned.
    """
    message = None
    powerperc = 100*np.array([r.pw_ut2,
                              r.pw_ut1,
                              r.pw_at,
                              r.pw_tr,r.pw_an])/r.ftp

    # make workout and put in database
    rr = rrower(hrmax=r.max,hrut2=r.ut2,
                hrut1=r.ut1,hrat=r.at,
                hrtr=r.tr,hran=r.an,ftp=r.ftp,
                powerperc=powerperc,powerzones=r.powerzones)
    row = rdata(f2,rower=rr)

    # rdata returns 0 when the file cannot be read. This has to be tested
    # before row.df is touched (the test used to come too late to help)
    if row == 0:
        return (0,'Error: CSV data file not found')

    isbreakthrough = False
    if workouttype == 'water':
        delta,cpvalues,avgpower = getsinglecp(row.df)
        if utils.isbreakthrough(delta,cpvalues,r.p0,r.p1,r.p2,r.p3):
            isbreakthrough = True

    dtavg = row.df['TimeStamp (sec)'].diff().mean()

    if dtavg < 1:
        # more than one sample per second: average per second and store
        # the result as a new workout
        newdf = df_resample(row.df)
        try:
            os.remove(f2)
        except OSError:
            pass
        return new_workout_from_df(r,newdf,
                                   title=title)

    # The repair step (row.repair()) is currently disabled, so failed
    # checks are only reported to the user
    checks = row.check_consistency()
    for key,value in checks.items():
        if not value:
            if consistencychecks:
                a_messages.error(r.user,'Failed consistency check: '+key+', autocorrected')
            else:
                a_messages.error(r.user,'Failed consistency check: '+key+', not corrected')

    if dosmooth:
        # auto smoothing
        pace = row.df[' Stroke500mPace (sec/500m)'].values
        velo = 500./pace

        f = row.df['TimeStamp (sec)'].diff().mean()
        if f !=0 and not np.isnan(f):
            windowsize = 2*(int(10./(f)))+1
        else:
            windowsize = 1
        if not 'originalvelo' in row.df:
            row.df['originalvelo'] = velo

        if windowsize > 3 and windowsize<len(velo):
            velo2 = savgol_filter(velo,windowsize,3)
        else:
            velo2 = velo

        velo3 = pd.Series(velo2)
        velo3 = velo3.replace([-np.inf,np.inf],np.nan)
        velo3 = velo3.fillna(method='ffill')

        pace2 = 500./abs(velo3)

        row.df[' Stroke500mPace (sec/500m)'] = pace2

        row.df = row.df.fillna(0)

        # write the gzipped file, then drop a plain file of the same name
        row.write_csv(f2,gzip=True)
        try:
            os.remove(f2)
        except OSError:
            pass

    # recalculate power data
    if workouttype == 'rower' or workouttype == 'dynamic' or workouttype == 'slides':
        try:
            row.erg_recalculatepower()
            row.write_csv(f2,gzip=True)
        except Exception:
            # best effort: keep the power values from the file
            pass

    averagehr = row.df[' HRCur (bpm)'].mean()
    maxhr = row.df[' HRCur (bpm)'].max()

    if totaldist == 0:
        totaldist = row.df['cum_dist'].max()
    if totaltime == 0:
        totaltime = row.df['TimeStamp (sec)'].max()-row.df['TimeStamp (sec)'].min()
        try:
            totaltime = totaltime+row.df.ix[0,' ElapsedTime (sec)']
        except KeyError:
            pass

    if np.isnan(totaltime):
        totaltime = 0

    # split the duration into clock fields, capping each at its maximum
    hours = int(totaltime/3600.)
    if hours>23:
        message = 'Warning: The workout duration was longer than 23 hours. '
        hours = 23

    minutes = int((totaltime - 3600.*hours)/60.)
    if minutes>59:
        minutes = 59
        if not message:
            message = 'Warning: there is something wrong with the workout duration'

    seconds = int(totaltime - 3600.*hours - 60.*minutes)
    if seconds > 59:
        seconds = 59
        if not message:
            message = 'Warning: there is something wrong with the workout duration'

    tenths = int(10*(totaltime - 3600.*hours - 60.*minutes - seconds))
    if tenths > 9:
        tenths = 9
        if not message:
            message = 'Warning: there is something wrong with the workout duration'

    duration = "%s:%s:%s.%s" % (hours,minutes,seconds,tenths)

    if dosummary:
        summary = row.allstats()

    workoutdate = row.rowdatetime.strftime('%Y-%m-%d')
    workoutstarttime = row.rowdatetime.strftime('%H:%M:%S')
    workoutstartdatetime = thetimezone.localize(row.rowdatetime).astimezone(utc)

    if makeprivate:
        privacy = 'private'
    else:
        privacy = 'visible'

    # check for duplicate start times
    ws = Workout.objects.filter(startdatetime=workoutstartdatetime,
                                user=r)
    if (len(ws) != 0):
        message =  "Warning: This workout probably already exists in the database"
        privacy = 'private'

    # checking for inf values
    totaldist = np.nan_to_num(totaldist)
    maxhr = np.nan_to_num(maxhr)
    averagehr = np.nan_to_num(averagehr)

    w = Workout(user=r,name=title,date=workoutdate,
                workouttype=workouttype,
                duration=duration,distance=totaldist,
                weightcategory=r.weightcategory,
                starttime=workoutstarttime,
                workoutsource=workoutsource,
                csvfilename=f2,notes=notes,summary=summary,
                maxhr=maxhr,averagehr=averagehr,
                startdatetime=workoutstartdatetime,
                inboard=inboard,oarlength=oarlength,
                privacy=privacy)

    w.save()

    # submit email task to send email about breakthrough workout
    if isbreakthrough:
        # This module never imported the task, so the name was undefined.
        # NOTE(review): assumes the task lives in rowers.tasks next to the
        # other handle_* tasks imported above - confirm
        from rowers.tasks import handle_sendemail_breakthrough
        if settings.DEBUG:
            res = handle_sendemail_breakthrough(w.id,r.user.email,
                                                r.user.first_name,
                                                r.user.last_name)
        else:
            # pass the function and its arguments; calling it here would
            # send the mail in this process and enqueue its return value
            res = queuehigh.enqueue(handle_sendemail_breakthrough,
                                    w.id,
                                    r.user.email,
                                    r.user.first_name,
                                    r.user.last_name)

    if privacy == 'visible':
        ts = Team.objects.filter(rower=r)
        for t in ts:
            w.team.add(t)

    # put stroke data in database
    res = dataprep(row.df,id=w.id,bands=True,
                   barchart=True,otwpower=True,empower=True,inboard=inboard)

    return (w.id,message)

def handle_nonpainsled(f2,fileformat,summary=''):
    """Convert a non-painsled file to a painsled compatible CSV file.

    The file ``f2`` is read with the parser that belongs to
    ``fileformat``, written (gzipped) under the original name with its
    last four characters replaced by 'o.csv', and the original file is
    removed.

    Returns ``(new file name, summary, oarlength, inboard)``. The summary
    and the rigging are replaced by values read from the file for the
    'speedcoach2' format (summary only for 'fit') when that succeeds;
    otherwise ``summary`` and the defaults 2.89 / 0.88 are returned.

    Raises ``ValueError`` for a format without a parser.
    """
    oarlength = 2.89
    inboard = 0.88

    parsers = {
        'rp':RowProParser,                   # RowPro
        'tcx':TCXParser,
        'mystery':MysteryParser,
        'tcxnohr':TCXParserNoHR,             # TCX no HR
        'rowperfect3':RowPerfectParser,
        'ergdata':ErgDataParser,
        'bcmike':BoatCoachAdvancedParser,    # Mike
        'boatcoach':BoatCoachParser,
        'painsleddesktop':painsledDesktopParser,
        'speedcoach':speedcoachParser,       # speed coach GPS
        'speedcoach2':SpeedCoach2Parser,     # speed coach GPS 2
        'ergstick':ErgStickParser,
        'fit':FITParser,
    }

    try:
        parser = parsers[fileformat]
    except KeyError:
        # used to surface further down as an unbound local variable
        raise ValueError('Unsupported file format: {0}'.format(fileformat))

    row = parser(f2)

    if fileformat == 'speedcoach2':
        # best effort: rigging and summary are optional extras
        try:
            oarlength,inboard = get_empower_rigging(f2)
            summary = row.allstats()
        except Exception:
            pass
    elif fileformat == 'fit':
        # best effort: use the summary stored in the FIT file if readable
        try:
            s = fitsummarydata(f2)
            s.setsummary()
            summary = s.summarytext
        except Exception:
            pass

    f_to_be_deleted = f2
    f2 = f2[:-4]+'o.csv'
    row.write_csv(f2,gzip=True)

    # remove the uploaded file, which may have been stored gzipped
    try:
        os.remove(f_to_be_deleted)
    except OSError:
        os.remove(f_to_be_deleted+'.gz')

    return (f2,summary,oarlength,inboard)

# Create new workout from file and store it in the database
# This routine should be used everywhere in views.py and mailprocessing.py
# Currently there is code duplication
def new_workout_from_file(r,f2,
                          workouttype='rower',
                          title='Workout',
                          makeprivate=False,
                          notes=''):
    """Create a workout from an uploaded file and store it in the database.

    The file type is detected with ``get_file_type``. Zip archives are
    handed to a background job, files without stroke data are deleted and
    rejected, unrecognized files are mailed out for inspection, and
    everything that is not painsled CSV is converted first.

    Returns ``(id, message, filename)``. ``id`` is the new workout id, 0
    when no workout was created (``message`` says why) or -1 when a zip
    archive was queued for background processing.
    """
    message = None
    fileformat = get_file_type(f2)
    summary = ''
    oarlength = 2.89
    inboard = 0.88
    if len(fileformat)==3 and fileformat[0]=='zip':
        title = os.path.basename(f2)
        if settings.DEBUG:
            handle_zip_file.delay(
                r.user.email,title,f2
            )
        else:
            queuelow.enqueue(
                handle_zip_file,
                r.user.email,
                title,
                f2
            )

        return -1,message,f2

    # Some people try to upload Concept2 logbook summaries, RowPro summary
    # logs or other files without stroke data
    rejections = (
        ('c2log',
         "This C2 logbook summary does not contain stroke data. Please download the Export Stroke Data file from the workout details on the C2 logbook."),
        ('nostrokes',
         "It looks like this file doesn't contain stroke data."),
        ('rowprolog',
         "This RowPro logbook summary does not contain stroke data. Please use the Stroke Data CSV file for the individual workout in your log."),
    )
    for rejected,explanation in rejections:
        if fileformat == rejected:
            os.remove(f2)
            return (0,explanation,f2)

    # Sometimes people try an unsupported file type.
    # Send an email to info@rowsandall.com with the file attached
    # for me to check if it is a bug, or a new file type
    # worth supporting
    if fileformat == 'unknown':
        message = "We couldn't recognize the file type"
        if settings.DEBUG:
            handle_sendemail_unrecognized.delay(f2,
                                                r.user.email)
        else:
            queuehigh.enqueue(handle_sendemail_unrecognized,
                              f2,r.user.email)
        return (0,message,f2)

    # handle non-Painsled by converting it to painsled compatible CSV
    if (fileformat != 'csv'):
        try:
            f2,summary,oarlength,inboard = handle_nonpainsled(f2,
                                                              fileformat,
                                                              summary=summary)
        except Exception:
            # report parser failures to the user; unlike a bare except
            # this lets KeyboardInterrupt and SystemExit pass through
            errorstring = str(sys.exc_info()[0])
            message = 'Something went wrong: '+errorstring
            return (0,message,'')

    # FIT files bring their own summary
    dosummary = (fileformat != 'fit')
    id,message = save_workout_database(f2,r,
                                       workouttype=workouttype,
                                       makeprivate=makeprivate,
                                       dosummary=dosummary,
                                       workoutsource=fileformat,
                                       summary=summary,
                                       inboard=inboard,oarlength=oarlength,
                                       title=title)

    return (id,message,f2)

# Create new workout from data frame and store it in the database
# This routine should be used everywhere in views.py and mailprocessing.py
# Currently there is code duplication
def new_workout_from_df(r,df,
                        title='New Workout',
                        parent=None):
    """Create a workout from a data frame and store it in the database.

    Parameters:
      r      -- the Rower who owns the new workout
      df     -- stroke data with DB column names and a 'TimeStamp (sec)'
                column counted from the start of the workout; it is
                renamed and extended in place
      title  -- name of the new workout
      parent -- optional workout from which rigging, workout type, notes,
                privacy and start time are copied; without it defaults and
                the current time are used

    The data is written to a new gzipped CSV file under 'media/' and
    stored through ``save_workout_database``, which also recomputes the
    summary. Returns ``(workout id, message)``.
    """
    if parent:
        oarlength = parent.oarlength
        inboard = parent.inboard
        workouttype = parent.workouttype
        notes = parent.notes
        # privacy is stored as text ('visible' or 'private'). Passing the
        # text itself on as a flag made every child workout private,
        # because a non-empty string is always true
        makeprivate = parent.privacy != 'visible'
        startdatetime = parent.startdatetime
    else:
        oarlength = 2.89
        inboard = 0.88
        workouttype = 'rower'
        notes = ''
        makeprivate = False
        startdatetime = timezone.now()

    timestr = strftime("%Y%m%d-%H%M%S")

    csvfilename ='media/Fusion_'+timestr+'.csv'

    df.rename(columns = columndict,inplace=True)
    # NOTE(review): mktime reads its argument as local time while it is
    # given a UTC tuple here - confirm the start time is right when the
    # server does not run in UTC
    starttimeunix = mktime(startdatetime.utctimetuple())
    df[' ElapsedTime (sec)'] = df['TimeStamp (sec)']
    df['TimeStamp (sec)'] = df['TimeStamp (sec)']+starttimeunix

    row = rrdata(df=df)
    row.write_csv(csvfilename,gzip=True)

    id,message = save_workout_database(csvfilename,r,
                                       workouttype=workouttype,
                                       title=title,
                                       notes=notes,
                                       oarlength=oarlength,
                                       inboard=inboard,
                                       makeprivate=makeprivate,
                                       dosmooth=False,
                                       consistencychecks=False)

    return (id,message)


# Compare the data from the CSV file and the database
# Currently only calculates number of strokes. To be expanded with
# more elaborate testing if needed
def compare_data(id):
    """Compare the stroke count of a workout's CSV file with the database.

    Returns ``(test, ldb, lfile)``: the number of rows in the strokedata
    table for this workout, the number of rows in the CSV file, and
    whether both are equal and non-zero. A file that cannot be read counts
    as 0 rows, and so does a database query that fails.
    """
    row = Workout.objects.get(id=id)
    f1 = row.csvfilename
    try:
        rowdata = rdata(f1)
        lfile = len(rowdata.df)
    except AttributeError:
        # rdata returns 0 for an unreadable file, and 0 has no df
        lfile = 0

    # default for a failing query; the count used to stay undefined then,
    # which broke the return statement
    ldb = 0
    engine = create_engine(database_url, echo=False)
    query = sa.text('SELECT COUNT(*) FROM strokedata WHERE workoutid={id};'.format(
        id=id,
    ))
    try:
        with engine.connect() as conn, conn.begin():
            try:
                res = conn.execute(query)
                ldb = res.fetchall()[0][0]
            except Exception:
                print("Database Locked")
    finally:
        engine.dispose()

    return lfile==ldb and lfile != 0,ldb,lfile

# Repair data for workouts where the CSV file is lost (or the DB entries
# don't exist)
def repair_data(verbose=False):
    """Bring CSV files and strokedata table in line for all workouts.

    For every workout whose CSV row count and database row count differ
    (or are both zero), the stroke data in the database is rebuilt from
    the CSV file when that file can be read. When the file has no rows,
    the CSV file is recreated from the database instead, and a workout
    with stroke data in neither place is deleted.

    With ``verbose`` a progress dot is written per workout and the ids
    and counts of mismatching workouts are printed.
    """
    for w in Workout.objects.all():
        if verbose:
            sys.stdout.write(".")
        test,ldb,lfile = compare_data(w.id)
        if test:
            continue

        if verbose:
            print('{0} {1} {2}'.format(w.id,lfile,ldb))
        try:
            rowdata = rdata(w.csvfilename)
            if rowdata and len(rowdata.df):
                update_strokedata(w.id,rowdata.df)
        except (IOError,AttributeError):
            # The tuple is required: "except IOError, AttributeError"
            # catches IOError only and binds it to the name AttributeError
            pass

        if lfile==0:
            # if not ldb - delete workout
            try:
                data = read_df_sql(w.id)
                try:
                    datalength = len(data)
                except AttributeError:
                    datalength = 0

                if datalength != 0:
                    data.rename(columns = columndict,inplace=True)
                    data.to_csv(w.csvfilename+'.gz',
                                index_label='index',
                                compression='gzip')
                    print('adding csv file')
                else:
                    print('{0} {1}'.format(w.id,' No stroke records anywhere'))
                    w.delete()
            except Exception:
                # keep going with the next workout
                print('failed')
                print(str(sys.exc_info()[0]))

# A wrapper around the rowingdata class, with some error catching
def rdata(file,rower=rrower()):
    """Load a CSV file as a rowingdata object, with some error catching.

    The file is read as given first; when that fails with an IOError or
    IndexError, the same name with '.gz' appended is tried.

    Returns the rowingdata object, or 0 when the fallback fails as well.
    Note that the default ``rower`` is created once, when this function
    is defined, and is shared by all calls that rely on it.
    """
    try:
        res = rrdata(csvfile=file,rower=rower)
    except (IOError,IndexError):
        # The tuple is required: "except IOError, IndexError" catches
        # IOError only, so an IndexError never reached the fallback
        try:
            res = rrdata(csvfile=file+'.gz',rower=rower)
        except Exception:
            res = 0

    return res

# Remove all stroke data for workout ID from database
def delete_strokedata(id):
    """Remove all stroke data rows of workout ``id`` from the database.

    This is best effort: when the statement fails, "Database Locked" is
    printed and the function returns normally.
    """
    engine = create_engine(database_url, echo=False)
    query = sa.text('DELETE FROM strokedata WHERE workoutid={id};'.format(
        id=id,
    ))
    try:
        # the with statement closes the connection, no extra close needed
        with engine.connect() as conn, conn.begin():
            try:
                conn.execute(query)
            except Exception:
                print("Database Locked")
    finally:
        engine.dispose()

# Replace stroke data in DB with data from CSV file
def update_strokedata(id,df):
    """Replace the stored stroke data of workout ``id`` by ``df``.

    The existing rows are deleted, then ``dataprep`` is run on ``df``
    for this workout id. Nothing is returned.
    """
    delete_strokedata(id)
    dataprep(df,id=id,bands=True,barchart=True,otwpower=True)

# Test that all data are of a numerical type
def testdata(time,distance,pace,spm):
    t1 = np.issubdtype(time,np.number)
    t2 = np.issubdtype(distance,np.number)
    t3 = np.issubdtype(pace,np.number)
    t4 = np.issubdtype(spm,np.number)

    return t1 and t2 and t3 and t4

# Get data from DB for one workout (fetches all data). If data
# is not in DB, read from CSV file (and create DB entry)
def getrowdata_db(id=0,doclean=False):
    """Fetch all stroke data of one workout from the database.

    When the database holds no rows for the workout, the CSV file is read
    and run through ``dataprep``; an empty frame is used if the file
    cannot be read either. With ``doclean`` the data is passed through
    ``clean_df_stats``.

    Returns ``(data, row)``: the stroke data frame and the Workout.
    """
    data = read_df_sql(id)
    data['x_right'] = data['x_right']/1.0e6

    if not data.empty:
        row = Workout.objects.get(id=id)
    else:
        rowdata,row = getrowdata(id=id)
        if not rowdata:
            # returning empty dataframe
            data = pd.DataFrame()
        else:
            data = dataprep(rowdata.df,id=id,bands=True,barchart=True,otwpower=True)

    if doclean:
        data = clean_df_stats(data,ignorehr=True)

    return data,row

# Fetch a subset of the data from the DB
def getsmallrowdata_db(columns,ids=[],doclean=True,workstrokesonly=True,
                       convertnewtons=False):
    """Fetch a subset of the stroke data columns for a list of workouts.

    Missing stroke data is created first (``prepmultipledata``), then the
    requested columns are read with ``read_cols_df_sql``. With
    ``convertnewtons`` the requested force columns are multiplied by
    ``lbstoN``; with ``doclean`` the result is passed through
    ``clean_df_stats`` (heart rate checks off).

    Returns the resulting data frame.
    """
    prepmultipledata(ids)
    data = read_cols_df_sql(ids,columns)

    if convertnewtons:
        # NOTE(review): read_cols_df_sql multiplies both force columns by
        # lbstoN as well, so the factor is applied twice here - confirm
        # which of the two conversions is intended
        for force in ('peakforce','averageforce'):
            if force in columns:
                data[force] = data[force]*lbstoN

    if not doclean:
        return data

    return clean_df_stats(data,ignorehr=True,
                          workstrokesonly=workstrokesonly)

# Fetch both the workout and the workout stroke data (from CSV file)
def getrowdata(id=0):
    """Fetch a workout and its stroke data from the CSV file.

    Returns ``(rowdata, row)``: the result of ``rdata`` for the workout's
    CSV file (0 when the file cannot be read), loaded with the heart rate
    and power settings of the workout's rower, and the Workout itself.
    ``Workout.objects.get`` raises when the id does not exist.
    """
    # check if valid ID exists (workout exists)
    workout = Workout.objects.get(id=id)

    # the rower who owns the workout, and the user account behind it
    # (the account is looked up as before but not used any further)
    owner = workout.user
    account = owner.user

    settings_rower = rrower(hrmax=owner.max,hrut2=owner.ut2,
                            hrut1=owner.ut1,hrat=owner.at,
                            hrtr=owner.tr,hran=owner.an,ftp=owner.ftp)

    return rdata(workout.csvfilename,rower=settings_rower),workout

# Checks if all rows for a list of workout IDs have entries in the
# stroke_data table. If this is not the case, it creates the stroke
# data
# In theory, this should never yield any work, but it's a good
# safety net for programming errors elsewhere in the app
# Also used heavily when I moved from CSV file only to CSV+Stroke data
def prepmultipledata(ids,verbose=False):
    """Create stroke data for those workouts in ``ids`` that have none.

    The workout ids present in the strokedata table are read from the
    database; for every id in ``ids`` that is not among them, the CSV
    file is loaded and run through ``dataprep``. With ``verbose`` each
    such id is printed.

    Returns the list of ids that had no stroke data.
    """
    engine = create_engine(database_url, echo=False)
    query = sa.text('SELECT DISTINCT workoutid FROM strokedata')

    with engine.connect() as conn, conn.begin():
        rows = conn.execute(query).fetchall()
        known = list(itertools.chain.from_iterable(rows))
    conn.close()
    engine.dispose()

    # compare as integers whenever the ids allow it
    try:
        wanted = [int(id) for id in ids]
    except ValueError:
        wanted = ids

    missing = list(set(wanted)-set(known))
    for id in missing:
        rowdata,row = getrowdata(id=id)
        if verbose:
            print(id)
        if rowdata and len(rowdata.df):
            dataprep(rowdata.df,id=id,bands=True,barchart=True,otwpower=True)

    return missing

# Read a set of columns for a set of workout ids, returns data as a
# pandas dataframe
def read_cols_df_sql(ids,columns):
    """Read selected strokedata columns for a list of workout ids.

    ids     -- sequence of workout ids (len() and indexing are used)
    columns -- names of the columns to fetch; names that are not fields
               of the StrokeData model and the string 'None' are
               ignored, 'distance' and 'spm' are always added.  The
               caller's sequence is left untouched.

    Returns a DataFrame in which missing values are replaced by 0 and
    the 'peakforce' / 'averageforce' columns, when selected, are
    multiplied by lbstoN.  The column order is not defined.
    """
    # drop columns that are not in the official list
    axx = set(f.name for f in StrokeData._meta.get_fields())
    # filter into a new list: removing from a list while looping over it
    # skips the element that follows every removed one
    selected = [c for c in columns if c in axx]

    selected = selected+['distance','spm']
    selected = [x for x in selected if x != 'None']
    selected = list(set(selected))
    cls = ', '.join(selected)

    # NOTE(review): ids are formatted straight into the SQL text; make
    # sure callers only pass trusted, numeric ids
    if len(ids) == 0:
        # no workouts requested: select on id 0, presumably matching
        # no rows, so the frame still has the right columns
        query = sa.text('SELECT {columns} FROM strokedata WHERE workoutid=0'.format(
            columns = cls,
            ))
    elif len(ids) == 1:
        query = sa.text('SELECT {columns} FROM strokedata WHERE workoutid={id}'.format(
            id = ids[0],
            columns = cls,
            ))
    else:
        query = sa.text('SELECT {columns} FROM strokedata WHERE workoutid IN {ids}'.format(
            columns = cls,
            ids = tuple(ids),
        ))

    engine = create_engine(database_url, echo=False)
    # release the engine even when the query fails
    try:
        df = pd.read_sql_query(query,engine)
    finally:
        engine.dispose()

    df = df.fillna(value=0)

    for force in ('peakforce','averageforce'):
        if force in df.columns:
            df[force] = df[force]*lbstoN

    return df

# Read stroke data from the DB for a Workout ID. Returns a pandas dataframe
def read_df_sql(id):
    """Read all strokedata columns of one workout.

    id -- workout id; it is formatted into the WHERE clause as is

    Returns a DataFrame in which missing values are replaced by 0 and
    the 'peakforce' / 'averageforce' columns, when present, are
    multiplied by lbstoN.
    """
    query = sa.text('SELECT * FROM strokedata WHERE workoutid={id}'.format(
        id=id))

    engine = create_engine(database_url, echo=False)
    # release the engine even when the query fails
    try:
        df = pd.read_sql_query(query, engine)
    finally:
        engine.dispose()

    df = df.fillna(value=0)

    for force in ('peakforce','averageforce'):
        if force in df.columns:
            df[force] = df[force]*lbstoN

    return df

# Get the necessary data from the CSV files of a set of workouts.
# For the flex plot
def smalldataprep(therows,xparam,yparam1,yparam2):
    """Collect a few stroke data columns for a set of workouts.

    therows -- iterable of objects with a ``csvfilename`` attribute
    xparam, yparam1, yparam2 -- names of the columns to collect;
               a yparam2 of 'None' is replaced by 'power'

    For every workout the CSV file (or, when reading it raises IOError,
    the same name with '.gz' appended) is run through dataprep() and
    the requested columns plus 'distance' and 'spm' are appended to the
    result.  Workouts whose file cannot be read under either name, or
    for which dataprep() does not return a DataFrame (it returns 0 for
    empty data), are skipped.

    Returns one DataFrame with the rows of all workouts; 'peakforce'
    and 'averageforce', when among the collected columns, are
    multiplied by lbstoN.
    """
    if yparam2 == 'None':
        yparam2 = 'power'

    wanted = [xparam,yparam1,yparam2,'distance','spm']

    df = pd.DataFrame()
    for name in wanted:
        df[name] = []

    for workout in therows:
        f1 = workout.csvfilename

        for candidate in (f1,f1+'.gz'):
            try:
                prepared = dataprep(rrdata(candidate).df)
            except IOError:
                # not readable under this name, try the next one
                continue

            if isinstance(prepared,pd.DataFrame):
                rowdata = pd.DataFrame(
                    {name: prepared[name] for name in wanted}
                )
                df = pd.concat([df,rowdata],ignore_index=True)
            break

    for force in ('peakforce','averageforce'):
        if force in df.columns:
            df[force] = df[force]*lbstoN

    return df

# data fusion
def datafusion(id1,id2,columns,offset):
    """Merge selected stroke data columns of workout id2 into workout id1.

    id1     -- id of the workout that supplies all other columns
    id2     -- id of the workout that supplies ``columns``
    columns -- list of column names to take from workout id2
    offset  -- timedelta-like object (days, seconds and microseconds are
               read) that is added, in milliseconds, to the 'time'
               column of workout id2

    The rows of both workouts are put in one frame, sorted on 'time',
    gaps are filled by linear interpolation (at most 10 consecutive
    values) followed by a backward fill, and rows sharing the same time
    are averaged.

    Returns the fused DataFrame with 'time' and 'pace' divided by 1000
    and a 'cum_dist' column holding a copy of 'cumdist'.
    """
    df1,w1 = getrowdata_db(id=id1)
    df1 = df1.drop([#'cumdist',
                    'hr_ut2',
                    'hr_ut1',
                    'hr_at',
                    'hr_tr',
                    'hr_an',
                    'hr_max',
                    'ftime',
                    'fpace',
                    'workoutid',
                    'id'],
                   axis=1,errors='ignore')

    # Add coordinates to DataFrame
    latitude,longitude = get_latlon(id1)

    df1[' latitude'] = latitude
    df1[' longitude'] = longitude

    df2 = getsmallrowdata_db(['time']+columns,ids=[id2],doclean=False)
    offsetmillisecs = offset.seconds*1000+offset.microseconds/1000.
    offsetmillisecs += offset.days*(3600*24*1000)
    df2['time'] = df2['time']+offsetmillisecs

    # The fused columns must come from workout 2 only, so remove them
    # from workout 1.  errors='ignore' tolerates names that workout 1
    # does not have or that are listed more than once.
    df1 = df1.drop(list(columns),axis=1,errors='ignore')

    df = pd.concat([df1,df2],ignore_index=True)
    df = df.sort_values(['time'])
    df = df.interpolate(method='linear',axis=0,limit_direction='both',
                        limit=10)
    df.fillna(method='bfill',inplace=True)

    # Some new stuff to try out
    # collapse rows with identical time stamps into their mean
    df = df.groupby('time',axis=0).mean()
    df['time'] = df.index
    df.reset_index(drop=True,inplace=True)

    df['time'] = df['time']/1000.
    df['pace'] = df['pace']/1000.
    df['cum_dist'] = df['cumdist']

    return df

# This is the main routine.
# it reindexes, sorts, filters, and smooths the data, then
# saves it to the stroke_data table in the database
# Takes a rowingdata object's DataFrame as input
def dataprep(rowdatadf,id=0,bands=True,barchart=True,otwpower=True,
             empower=True,inboard=0.88):
    """Turn a rowingdata DataFrame into the strokedata layout.

    rowdatadf -- DataFrame with the rowingdata column names
                 ('TimeStamp (sec)', ' HRCur (bpm)', ...).  It is
                 modified in place: the index is replaced by 0..n-1 and
                 pace values above 3000 are set to 3000.
    id        -- workout id; when not 0 a 'workoutid' column is added
                 and the result is appended to the strokedata table
    bands     -- copy the hr_ut2 ... hr_max columns and add 'hr_bottom'
    barchart  -- add the 'x_right' column
    otwpower  -- add 'ergpace', 'nowindpace', 'equivergpower',
                 'fergpace' and 'fnowindpace'
    empower   -- add 'wash', 'catch', 'slip', 'finish',
                 'peakforceangle', 'totalangle' and 'effectiveangle'
                 (zero when rowdatadf lacks the source column) and
                 recompute 'driveenergy' / 'drivelength'
    inboard   -- length used to turn the catch and finish angles into
                 an arc length (presumably the oar inboard in meters -
                 TODO confirm)

    Returns 0 when rowdatadf is empty, otherwise the prepared DataFrame.
    """
    if rowdatadf.empty:
        return 0

    rowdatadf.set_index([list(range(len(rowdatadf)))],inplace=True)
    t = rowdatadf.loc[:,'TimeStamp (sec)']
    t = pd.Series(t-rowdatadf.loc[0,'TimeStamp (sec)'])

    # cap the pace at 3000
    row_index = rowdatadf.loc[:,' Stroke500mPace (sec/500m)'] > 3000
    rowdatadf.loc[row_index,' Stroke500mPace (sec/500m)'] = 3000.

    p = rowdatadf.loc[:,' Stroke500mPace (sec/500m)']
    hr = rowdatadf.loc[:,' HRCur (bpm)']
    spm = rowdatadf.loc[:,' Cadence (stokes/min)']
    cumdist = rowdatadf.loc[:,'cum_dist']

    power = rowdatadf.loc[:,' Power (watts)']
    averageforce = rowdatadf.loc[:,' AverageDriveForce (lbs)']
    drivelength = rowdatadf.loc[:,' DriveLength (meters)']
    try:
        workoutstate = rowdatadf.loc[:,' WorkoutState']
    except KeyError:
        workoutstate = 0*hr

    peakforce = rowdatadf.loc[:,' PeakDriveForce (lbs)']

    forceratio = averageforce/peakforce
    forceratio = forceratio.fillna(value=0)

    # share of the drive in the total stroke time, in percent
    try:
        drivetime = rowdatadf.loc[:,' DriveTime (ms)']
        recoverytime = rowdatadf.loc[:,' StrokeRecoveryTime (ms)']
        rhythm = 100.*drivetime/(recoverytime+drivetime)
        rhythm = rhythm.fillna(value=0)
    except Exception:
        # deliberately best effort: any failure gives an all-zero column
        rhythm = 0.0*forceratio

    # smoothing window: odd number of samples derived from the mean
    # time step, never smaller than 5
    f = rowdatadf['TimeStamp (sec)'].diff().mean()
    if f != 0:
        windowsize = 2*(int(10./(f)))+1
    else:
        windowsize = 1
    if windowsize <= 3:
        windowsize = 5

    # only smooth when the series is longer than the window
    if windowsize > 3 and windowsize<len(hr):
        spm = savgol_filter(spm,windowsize,3)
        hr = savgol_filter(hr,windowsize,3)
        drivelength = savgol_filter(drivelength,windowsize,3)
        forceratio = savgol_filter(forceratio,windowsize,3)

    try:
        t2 = t.fillna(method='ffill').apply(timedeltaconv)
    except TypeError:
        t2 = 0*t

    p2 = p.fillna(method='ffill').apply(timedeltaconv)

    try:
        drivespeed = drivelength/rowdatadf[' DriveTime (ms)']*1.0e3
    except TypeError:
        drivespeed = 0.0*rowdatadf['TimeStamp (sec)']

    drivespeed = drivespeed.fillna(value=0)
    driveenergy = drivelength*averageforce*lbstoN
    distance = rowdatadf.loc[:,'cum_dist']

    data = DataFrame(
        dict(
            time = t*1e3,
            hr = hr,
            pace = p*1e3,
            spm = spm,
            cumdist = cumdist,
            ftime = niceformat(t2),
            fpace = nicepaceformat(p2),
            driveenergy=driveenergy,
            power=power,
            workoutstate=workoutstate,
            averageforce=averageforce,
            drivelength=drivelength,
            peakforce=peakforce,
            forceratio=forceratio,
            distance=distance,
            drivespeed=drivespeed,
            rhythm=rhythm,
            )
        )

    if bands:
        # HR bands
        data['hr_ut2'] = rowdatadf.loc[:,'hr_ut2']
        data['hr_ut1'] = rowdatadf.loc[:,'hr_ut1']
        data['hr_at'] = rowdatadf.loc[:,'hr_at']
        data['hr_tr'] = rowdatadf.loc[:,'hr_tr']
        data['hr_an'] = rowdatadf.loc[:,'hr_an']
        data['hr_max'] = rowdatadf.loc[:,'hr_max']
        data['hr_bottom'] = 0.0*data['hr']

    if barchart:
        # time increments for bar chart
        time_increments = rowdatadf.loc[:,' ElapsedTime (sec)'].diff()
        # diff() leaves the first value undefined, reuse the second one
        time_increments[0] = time_increments[1]
        # 0.5*x+0.5*|x| turns negative increments into zero
        time_increments = 0.5*time_increments+0.5*np.abs(time_increments)
        x_right =  (t2+time_increments.apply(timedeltaconv))

        data['x_right'] = x_right

    if empower:
        def _optional_column(name):
            # column from the input, or zeros when the input lacks it
            try:
                return rowdatadf.loc[:,name]
            except KeyError:
                return 0*power

        wash = _optional_column('wash')
        catch = _optional_column('catch')
        finish = _optional_column('finish')
        peakforceangle = _optional_column('peakforceangle')

        # prefer the drive energy computed above; when that is all
        # zero fall back on a 'driveenergy' column of the input
        if data['driveenergy'].mean() == 0:
            driveenergy = _optional_column('driveenergy')
        else:
            driveenergy = data['driveenergy']

        arclength = (inboard-0.05)*(np.radians(finish)-np.radians(catch))
        if arclength.mean()>0:
            drivelength = arclength
        elif drivelength.mean() == 0:
            drivelength = driveenergy/(averageforce*4.44822)

        slip = _optional_column('slip')

        totalangle = finish-catch
        effectiveangle = finish-wash-catch-slip
        if windowsize > 3 and windowsize<len(slip):
            wash = savgol_filter(wash,windowsize,3)
            slip = savgol_filter(slip,windowsize,3)
            catch = savgol_filter(catch,windowsize,3)
            finish = savgol_filter(finish,windowsize,3)
            peakforceangle = savgol_filter(peakforceangle,windowsize,3)
            driveenergy = savgol_filter(driveenergy,windowsize,3)
            drivelength = savgol_filter(drivelength,windowsize,3)
            totalangle = savgol_filter(totalangle,windowsize,3)
            effectiveangle = savgol_filter(effectiveangle,windowsize,3)

        data['wash'] = wash
        data['catch'] = catch
        data['slip'] = slip
        data['finish'] = finish
        data['peakforceangle'] = peakforceangle
        data['driveenergy'] = driveenergy
        data['drivelength'] = drivelength
        data['totalangle'] = totalangle
        data['effectiveangle'] = effectiveangle

    if otwpower:
        # without wind corrected data use the measured pace and a
        # constant equivalent erg power of 50
        try:
            nowindpace = rowdatadf.loc[:,'nowindpace']
        except KeyError:
            nowindpace = p
        try:
            equivergpower = rowdatadf.loc[:,'equivergpower']
        except KeyError:
            equivergpower = 0*p+50.

        nowindpace2 = nowindpace.apply(timedeltaconv)
        ergvelo = (equivergpower/2.8)**(1./3.)

        ergpace = 500./ergvelo
        # zero velocity gives an infinite pace, replace it by 240
        ergpace[ergpace == np.inf] = 240.
        ergpace2 = ergpace.apply(timedeltaconv)

        data['ergpace'] = ergpace*1e3
        data['nowindpace'] = nowindpace*1e3
        data['equivergpower'] = equivergpower
        data['fergpace'] = nicepaceformat(ergpace2)
        data['fnowindpace'] = nicepaceformat(nowindpace2)

    data = data.replace([-np.inf,np.inf],np.nan)
    data = data.fillna(method='ffill')

    # write data if id given
    if id != 0:
        data['workoutid'] = id
        engine = create_engine(database_url, echo=False)
        # release the engine even when the write fails
        try:
            # NOTE(review): to_sql is handed the engine rather than
            # conn, so the insert presumably does not run inside this
            # transaction - confirm before relying on it
            with engine.connect() as conn, conn.begin():
                data.to_sql('strokedata',engine,if_exists='append',index=False)
        finally:
            engine.dispose()
    return data