rowsandall/rowers/dataprep.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

# All the data preparation, data cleaning and data mangling should
# be defined here
from __future__ import unicode_literals, absolute_import
from rowers.models import Workout, StrokeData,Team

import pytz

from rowingdata import rowingdata as rrdata

from rowingdata import rower as rrower

from shutil import copyfile

from rowingdata import (
    get_file_type, get_empower_rigging,get_empower_firmware
    )

from rowers.tasks import handle_sendemail_unrecognized
from rowers.tasks import handle_zip_file

from pandas import DataFrame, Series

from django.utils import timezone
from django.utils.timezone import get_current_timezone
from django_mailbox.models import Message,Mailbox,MessageAttachment

from time import strftime
import arrow

# Django's currently active time zone, looked up once when this module is
# imported (later changes to the active time zone are not reflected here).
thetimezone = get_current_timezone()
from rowingdata import (
    TCXParser, RowProParser, ErgDataParser,
    CoxMateParser,
    BoatCoachParser, RowPerfectParser, BoatCoachAdvancedParser,
    MysteryParser, BoatCoachOTWParser,QuiskeParser,
    painsledDesktopParser, speedcoachParser, ErgStickParser,
    SpeedCoach2Parser, FITParser, fitsummarydata,
    RitmoTimeParser,KinoMapParser,
    make_cumvalues,cumcpdata,ExcelTemplate,
    summarydata, get_file_type,
)

from rowingdata.csvparsers import HumonParser


from rowers.metrics import axes,calc_trimp,rowingmetrics
from rowers.models import strokedatafields

# Names of the stroke data columns known to this module: the keys of
# rowers.models.strokedatafields.
# Earlier variant, kept for reference:
#allowedcolumns = [item[0] for item in rowingmetrics]
allowedcolumns = [key for key,value in strokedatafields.items()]

from async_messages import messages as a_messages
import os
import zipfile
import pandas as pd
import numpy as np
import itertools
import math
from rowers.tasks import (
    handle_sendemail_unrecognized, handle_sendemail_breakthrough,
    handle_sendemail_hard, handle_updatecp,handle_updateergcp,
    handle_calctrimp,
)

from django.conf import settings
from sqlalchemy import create_engine
import sqlalchemy as sa
import sys
import rowers.utils as utils
import rowers.datautils as datautils
from rowers.utils import lbstoN,myqueue,is_ranking_piece,wavg

from timezonefinder import TimezoneFinder

import django_rq
# django_rq queues used to hand background jobs to the workers.
queue = django_rq.get_queue('default')
queuelow = django_rq.get_queue('low')
# NOTE(review): queuehigh is bound to the 'default' queue, not to a separate
# high-priority queue -- confirm that this is intentional.
queuehigh = django_rq.get_queue('default')

from rowsandall_app.settings import SITE_URL
from rowers.mytypes import otwtypes,otetypes

from rowers.database import *
from rowers.opaque import encoder


# Maps the short (database) column names used in this module to the column
# names found in the CSV stroke files. Many CSV names start with a space;
# that space (and the spelling "stokes/min") is part of the key and must be
# kept as is.
columndict = {
    'time': 'TimeStamp (sec)',
    'hr': ' HRCur (bpm)',
    'velo': ' AverageBoatSpeed (m/s)',
    'pace': ' Stroke500mPace (sec/500m)',
    'spm': ' Cadence (stokes/min)',
    'power': ' Power (watts)',
    'averageforce': ' AverageDriveForce (lbs)',
    'drivelength': ' DriveLength (meters)',
    'peakforce': ' PeakDriveForce (lbs)',
    'distance': ' Horizontal (meters)',
    'catch': 'catch',
    'finish': 'finish',
    'peakforceangle': 'peakforceangle',
    'wash': 'wash',
    'slip': 'slip',
    'workoutstate': ' WorkoutState',
    'cumdist': 'cum_dist',
}

from scipy.signal import savgol_filter

import datetime


def get_latlon(id):
    """Return the latitude and longitude columns of a workout.

    Parameters:
        id: primary key of the Workout whose stroke file is read.

    Returns:
        ``False`` when no Workout with that id exists. Otherwise a list
        ``[latitude, longitude]`` of pandas Series. Two empty Series are
        returned when the stroke data is empty or unavailable; two all-zero
        Series (one value per row) are returned when the stroke data has no
        ' latitude' / ' longitude' columns.
    """
    try:
        w = Workout.objects.get(id=id)
    except Workout.DoesNotExist:
        return False

    rowdata = rdata(w.csvfilename)

    try:
        # Other code in this module compares the result of rdata() with 0,
        # so the result may not carry a ``df`` attribute at all. Reading
        # ``df`` inside the guard turns that case into an empty result
        # instead of an AttributeError.
        df = rowdata.df
        if df.empty:
            return [pd.Series([]), pd.Series([])]

        try:
            latitude = df.loc[:, ' latitude']
            longitude = df.loc[:, ' longitude']
        except KeyError:
            # no position data: zeros with the same index as the time column
            latitude = 0 * df.loc[:, 'TimeStamp (sec)']
            longitude = 0 * df.loc[:, 'TimeStamp (sec)']
        return [latitude, longitude]
    except AttributeError:
        return [pd.Series([]), pd.Series([])]

def workout_summary_to_df(
        rower,
        startdate=datetime.datetime(1970,1,1),
        enddate=timezone.now()+timezone.timedelta(days=1)):
    """Build a summary table with one row per workout of a rower.

    All workouts of ``rower`` are included, ordered by start date/time.

    NOTE(review): ``startdate`` and ``enddate`` are accepted but not used to
    restrict the selection -- confirm whether filtering was intended.

    Returns:
        A pandas DataFrame holding name, date, time zone, type, distance,
        duration, weight/adaptive classification, notes, links to the TCX
        and CSV stroke data, and the TRIMP and TSS training loads.
    """
    table = {
        'name': [],
        'date': [],
        'timezone': [],
        'type': [],
        'distance (m)': [],
        'duration ': [],
        'weight category': [],
        'adaptive classification': [],
        'weight (kg)': [],
        'notes': [],
        'Stroke Data TCX': [],
        'Stroke Data CSV': [],
        'TRIMP Training Load': [],
        'TSS Training Load': [],
    }

    def _download_link(workout, pattern):
        # absolute URL of an export link, built from the encoded workout id
        return SITE_URL + pattern.format(id=encoder.encode_hex(workout.id))

    workouts = Workout.objects.filter(user=rower).order_by("startdatetime")

    for workout in workouts:
        table['type'].append(workout.workouttype)
        table['name'].append(workout.name)
        table['date'].append(workout.startdatetime)
        table['timezone'].append(workout.timezone)
        table['distance (m)'].append(workout.distance)
        table['duration '].append(workout.duration)
        table['weight category'].append(workout.weightcategory)
        table['adaptive classification'].append(workout.adaptiveclass)
        table['weight (kg)'].append(workout.weightvalue)
        table['notes'].append(workout.notes)
        table['Stroke Data TCX'].append(
            _download_link(workout, '/rowers/workout/{id}/emailtcx'))
        table['Stroke Data CSV'].append(
            _download_link(workout, '/rowers/workout/{id}/emailcsv'))
        table['TRIMP Training Load'].append(workout_trimp(workout)[0])
        score = workout_rscore(workout)
        table['TSS Training Load'].append(int(score[0]))

    return pd.DataFrame(table)

def get_workouts(ids, userid):
    """Return the workouts from ``ids`` that belong to user ``userid``.

    Parameters:
        ids: iterable of Workout primary keys.
        userid: id of the Django user who must own the workouts.

    Returns:
        List of Workout objects, in the order of ``ids``; workouts owned by
        somebody else are left out.

    Raises:
        Workout.DoesNotExist: when an id does not match any workout.
    """
    owned = []
    for workout_id in ids:
        w = Workout.objects.get(id=workout_id)
        if int(w.user.user.id) == int(userid):
            # keep the object that was just loaded instead of querying the
            # database a second time for it
            owned.append(w)

    return owned


def filter_df(datadf, fieldname, value, largerthan=True):
    """Blank out the values of one column that fall on the wrong side of a limit.

    With ``largerthan`` true, entries smaller than ``value`` are replaced by
    NaN; otherwise entries greater than or equal to ``value`` are replaced.
    The data frame is modified in place and also returned. A data frame
    without column ``fieldname`` is returned untouched.
    """
    try:
        column = datadf[fieldname]
    except KeyError:
        return datadf

    rejected = column < value if largerthan else column >= value
    datadf.loc[rejected, fieldname] = np.nan

    return datadf

# joins workouts
def join_workouts(r,ids,title='Joined Workout',
                  parent=None,
                  setprivate=False,
                  forceunit='lbs'):
    """Join several workouts into one new workout.

    The stroke files of the selected workouts are concatenated in
    chronological order, written to a new CSV file and stored through
    save_workout_database().

    Parameters:
        r: rower who will own the joined workout.
        ids: primary keys of the workouts to join.
        title: name of the new workout.
        parent: workout supplying oar length, inboard, workout type and
            notes; defaults to the earliest of the selected workouts.
        setprivate: when exactly True / False it decides whether the new
            workout is hidden, overriding the privacy of ``parent``.
        forceunit: force unit passed on to save_workout_database().

    Returns:
        Tuple ``(id, message)``; ``id`` is 0 when nothing could be joined.
    """
    if parent:
        makeprivate = parent.privacy == 'hidden'
    else:
        makeprivate = False

    # net effect: an explicit True/False in setprivate wins over the parent
    if setprivate == True and makeprivate == False:
        makeprivate = True
    elif setprivate == False and makeprivate == True:
        makeprivate = False

    # reorder in chronological order
    ws = Workout.objects.filter(id__in=ids).order_by("startdatetime")

    if not parent:
        if not ws:
            return (0, 'Error: No workouts found to join')
        parent = ws[0]

    oarlength = parent.oarlength
    inboard = parent.inboard
    workouttype = parent.workouttype
    notes = parent.notes

    row = None
    for w in ws:
        piece = rdata(w.csvfilename)
        # rdata() results are compared with 0 elsewhere in this module to
        # detect unreadable files; such files are skipped here
        if piece == 0:
            continue
        row = piece if row is None else row + piece

    if row is None:
        return (0, 'Error: No stroke data found to join')

    timestr = strftime("%Y%m%d-%H%M%S")
    csvfilename = 'media/df_' + timestr + '.csv'

    row.write_csv(csvfilename,gzip=True)
    id, message = save_workout_database(csvfilename, r,
                                        workouttype=workouttype,
                                        title=title,
                                        notes=notes,
                                        oarlength=oarlength,
                                        inboard=inboard,
                                        makeprivate=makeprivate,
                                        forceunit=forceunit,
                                        dosmooth=False,
                                        consistencychecks=False)

    return (id, message)


def df_resample(datadf):
    """Average the stroke data per whole second.

    The 'TimeStamp (sec)' column (time stamps in seconds) is truncated to
    integers, stored on ``datadf`` as a new 'timestamps' column, and used to
    group the rows. The result holds the mean of every column per group and
    is indexed by 'timestamps'.
    """
    datadf['timestamps'] = datadf['TimeStamp (sec)'].astype('int')
    per_second = datadf.groupby(['timestamps'])
    return per_second.mean()


def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
                   ignoreadvanced=False):
    """Replace zero, negative and implausible values by NaN.

    Parameters:
        datadf: stroke data frame using the short column names ('spm',
            'hr', 'pace', ...). Columns that are absent are skipped.
        workstrokesonly: True (or the string 'True') drops the rows whose
            'workoutstate' marks a rest stroke.
        ignorehr: when false, heart rates below 30 are blanked as well.
        ignoreadvanced: when true, the range checks on rhythm, power, drive
            length, force ratio, drive speed, drive energy and catch are
            skipped.

    Returns:
        The cleaned data frame. An empty input is returned unchanged.
    """
    if datadf.empty:
        return datadf

    # Step 1: move columns with legitimate zero or negative values into the
    # positive domain so the clip/replace below leaves them alone.
    # Each entry: (column, transformation, exceptions that are tolerated).
    shifts_in = (
        ('catch', lambda s: -s, (KeyError,)),
        ('peakforceangle', lambda s: s + 1000, (KeyError,)),
        ('hr', lambda s: s + 10, (KeyError,)),
        # protect 0 spm values from being nulled
        ('spm', lambda s: s + 1.0, (KeyError, TypeError)),
    )
    for column, transform, tolerated in shifts_in:
        try:
            datadf[column] = transform(datadf[column])
        except tolerated:
            pass

    # Step 2: negative values become 0, and every 0 becomes NaN
    try:
        datadf = datadf.clip(lower=0)
    except TypeError:
        pass

    datadf.replace(to_replace=0, value=np.nan, inplace=True)

    # Step 3: undo the shifts of step 1
    shifts_out = (
        ('spm', lambda s: s - 1, (TypeError, KeyError)),
        ('catch', lambda s: -s, (KeyError,)),
        ('peakforceangle', lambda s: s - 1000, (KeyError,)),
        ('hr', lambda s: s - 10, (KeyError,)),
    )
    for column, transform, tolerated in shifts_out:
        try:
            datadf[column] = transform(datadf[column])
        except tolerated:
            pass

    # Step 4: per-column plausibility limits. Each rule returns a boolean
    # mask of the values to blank out.
    rules = []
    if not ignorehr:
        rules.append(('hr', lambda s: s < 30))

    rules.extend([
        ('spm', lambda s: s < 0),
        ('efficiency', lambda s: s > 200.),
        ('spm', lambda s: s < 10),
        ('pace', lambda s: s / 1000. > 300.),
        ('efficiency', lambda s: s < 0.),
        ('pace', lambda s: s / 1000. < 60.),
        ('spm', lambda s: s > 60),
        ('wash', lambda s: s < 1),
    ])

    if not ignoreadvanced:
        rules.extend([
            ('rhythm', lambda s: s < 5),
            ('rhythm', lambda s: s > 70),
            ('power', lambda s: s < 20),
            ('drivelength', lambda s: s < 0.5),
            ('forceratio', lambda s: s < 0.2),
            ('forceratio', lambda s: s > 1.0),
            ('drivespeed', lambda s: s < 0.5),
            ('drivespeed', lambda s: s > 4),
            ('driveenergy', lambda s: s > 2000),
            ('driveenergy', lambda s: s < 100),
            ('catch', lambda s: s > -30.),
        ])

    for column, is_outlier in rules:
        try:
            outliers = is_outlier(datadf[column])
            datadf.loc[outliers, column] = np.nan
        except KeyError:
            pass

    # Step 5: optionally drop rest strokes (workout state 3)
    reststates = [3]

    if workstrokesonly == 'True' or workstrokesonly == True:
        try:
            isrest = datadf['workoutstate'].isin(reststates)
            datadf = datadf[~isrest]
        except:
            pass

    return datadf


def getstatsfields():
    """List the StrokeData fields that are meaningful for statistics.

    Returns:
        Tuple ``(fieldlist, fielddict)``: ``fielddict`` maps field name to
        verbose name for every StrokeData field except the excluded ones
        below, and ``fieldlist`` holds the names in ``fielddict``.

    Raises:
        KeyError: when one of the excluded names is not a StrokeData field.
    """
    fielddict = {}
    for field in StrokeData._meta.get_fields():
        fielddict[field.name] = field.verbose_name

    # fields that are not useful in stats
    # ('workoutid' and 'workoutstate' are deliberately kept)
    excluded = (
        'ergpace',
        'hr_an',
        'hr_tr',
        'hr_at',
        'hr_ut2',
        'hr_ut1',
        'time',
        'distance',
        'nowindpace',
        'fnowindpace',
        'fergpace',
        'equivergpower',
        'fpace',
        'pace',
        'id',
        'ftime',
        'x_right',
        'hr_max',
        'hr_bottom',
        'cumdist',
    )
    for name in excluded:
        fielddict.pop(name)

    fieldlist = list(fielddict)

    return fieldlist, fielddict


# A string representation for time deltas
def niceformat(values):
    """Format every time delta in ``values`` with strfdelta(); return a list."""
    return [strfdelta(v) for v in values]

# A nice printable format for time delta values


def strfdelta(tdelta):
    """Format a time delta as ``MM:SS.t`` (minutes, seconds, tenths).

    Parameters:
        tdelta: an object with ``seconds`` and ``microseconds`` attributes
            (such as ``datetime.timedelta``), or a numpy value whose int64
            view is a count of nanoseconds (assumed to be a
            ``timedelta64[ns]`` -- other units are not converted).

    Returns:
        The formatted string, e.g. ``"01:30.5"``. Minutes are not folded
        into hours, and only the ``seconds`` part of a ``datetime.timedelta``
        is used (whole days are ignored).
    """
    try:
        minutes, seconds = divmod(tdelta.seconds, 60)
        tenths = int(tdelta.microseconds / 1e5)
    except AttributeError:
        # Integer arithmetic throughout: dividing by float constants would
        # turn minutes and seconds into floats and print them as "1.0".
        total_ns = int(tdelta.view(np.int64))
        minutes, rest = divmod(total_ns, 60 * 10**9)
        seconds, rest = divmod(rest, 10**9)
        tenths = rest // 10**8
    res = "{minutes:0>2}:{seconds:0>2}.{tenths:0>1}".format(
        minutes=minutes,
        seconds=seconds,
        tenths=tenths,
    )

    return res

def timedelta_to_seconds(tdelta):
    """Convert a clock-style duration to seconds.

    Despite the name, the argument is read through ``minute`` and ``second``
    attributes (as found on ``datetime.time``), not as a
    ``datetime.timedelta``. An ``hour`` attribute is included when present,
    so durations of one hour or more are no longer truncated.

    Returns:
        The duration in seconds as a float; microseconds are not included.
    """
    hours = getattr(tdelta, 'hour', 0)
    return 3600.*hours + 60.*tdelta.minute + tdelta.second


# A nice printable format for pace values


def nicepaceformat(values):
    """Format every pace (a time delta) in ``values`` with strfdelta()."""
    return [strfdelta(pace) for pace in values]

# Convert seconds to a Time Delta value, replacing NaN with a 5:50 pace


def timedeltaconv(x):
    """Convert a number of seconds to a ``datetime.timedelta``.

    Values that are not finite, not positive, or not below 175000 are
    replaced by a fallback of 350 seconds (a 5:50 pace).
    """
    if not (np.isfinite(x) and 0 < x < 175000):
        return datetime.timedelta(seconds=350.)

    return datetime.timedelta(seconds=x)


def paceformatsecs(values):
    """Format paces given in seconds as strings.

    Every value goes through timedeltaconv() (which substitutes a fallback
    for unusable values) and is then formatted with strfdelta().
    """
    return [strfdelta(timedeltaconv(secs)) for secs in values]

def fitnessmetric_to_sql(m,table='powertimefitnessmetric',debug=False):
    """Insert one record into a database table.

    Parameters:
        m: mapping of column name to value. The column names become part of
            the SQL text (and the bind parameter names), so they must be
            trusted identifiers.
        table: name of the target table; also part of the SQL text.
        debug: unused.

    Returns:
        1 after the insert was executed.
    """
    record = dict(m)
    names = list(record)

    # Named bind parameters through sqlalchemy.text() work with every
    # database driver, unlike a literal "?" placeholder in a raw string.
    query = sa.text(
        "INSERT into %s ( %s ) Values (%s)" % (
            table,
            ', '.join(names),
            ', '.join(':' + name for name in names),
        )
    )

    engine = create_engine(database_url, echo=False)
    try:
        with engine.connect() as conn, conn.begin():
            conn.execute(query, record)
    finally:
        # release the connection pool even when the insert fails
        engine.dispose()

    return 1


def getcpdata_sql(rower_id,table='cpdata'):
    """Read all rows of one rower from a critical power table.

    Parameters:
        rower_id: value matched against the ``user`` column.
        table: name of the table to read; it becomes part of the SQL text
            and must be a trusted identifier.

    Returns:
        A pandas DataFrame with the matching rows (empty when there are
        none).
    """
    engine = create_engine(database_url, echo=False)
    # rower_id is passed as a bind parameter; only the table name is
    # formatted into the statement (identifiers cannot be bound)
    query = sa.text('SELECT * from {table} WHERE user=:rower_id;'.format(
        table=table,
        ))
    try:
        df = pd.read_sql_query(query, engine, params={'rower_id': rower_id})
    finally:
        # release the connection pool; the previous code also opened a raw
        # connection here that was never used or closed
        engine.dispose()

    return df

def deletecpdata_sql(rower_id,table='cpdata'):
    """Delete all rows of one rower from a critical power table.

    Parameters:
        rower_id: value matched against the ``user`` column.
        table: name of the table; it becomes part of the SQL text and must
            be a trusted identifier.

    Database errors during the delete are reported on standard output and
    otherwise ignored (best effort).
    """
    engine = create_engine(database_url, echo=False)
    query = sa.text('DELETE from {table} WHERE user=:rower_id;'.format(
        table=table,
        ))
    try:
        with engine.connect() as conn, conn.begin():
            try:
                conn.execute(query, {'rower_id': rower_id})
            except sa.exc.SQLAlchemyError:
                print("Database locked")
    finally:
        engine.dispose()


def updatecpdata_sql(rower_id,delta,cp,table='cpdata',distance=None):
    """Replace the critical power data of one rower in ``table``.

    The rower's existing rows are deleted from ``table`` and the new values
    are appended.

    Parameters:
        rower_id: value stored in the ``user`` column.
        delta: sequence stored in the ``delta`` column.
        cp: sequence stored in the ``cp`` column.
        table: name of the table to update.
        distance: optional sequence stored in a ``distance`` column; omitted
            when None or empty.
    """
    # delete from the table that is written below (not always 'cpdata')
    deletecpdata_sql(rower_id, table=table)
    df = pd.DataFrame(
        {
            'delta':delta,
            'cp':cp,
            'user':rower_id
        }
    )

    # len() works for lists as well as pandas Series; the previous
    # ``distance.empty`` failed on the list default
    if distance is not None and len(distance) > 0:
        df['distance'] = distance

    engine = create_engine(database_url, echo=False)
    try:
        df.to_sql(table, engine, if_exists='append', index=False)
    finally:
        engine.dispose()


def runcpupdate(
        rower,type='water',
        startdate=None,
        enddate=None
):
    """Queue a background job that recomputes a rower's critical power data.

    Parameters:
        rower: rower whose ranking pieces are used.
        type: 'water' selects on-the-water workouts and the 'cpdata' table;
            any other value selects workout types 'rower', 'dynamic' and
            'slides' and the 'cpergdata' table.
        startdate: start of the period; defaults to 365 days before now.
        enddate: end of the period; defaults to 5 days after now.

    Returns:
        The result of myqueue() for the submitted job.
    """
    # Defaults are computed per call. As default argument values they would
    # be evaluated only once, when the module is imported, and go stale in
    # a long-running process.
    now = timezone.now()
    if startdate is None:
        startdate = now - datetime.timedelta(days=365)
    if enddate is None:
        enddate = now + datetime.timedelta(days=5)

    if type == 'water':
        theworkouts = Workout.objects.filter(
            user=rower,rankingpiece=True,
            workouttype='water',
            startdatetime__gte=startdate,
            startdatetime__lte=enddate
        )
        table = 'cpdata'
    else:
        theworkouts = Workout.objects.filter(
            user=rower,rankingpiece=True,
            workouttype__in=[
                'rower',
                'dynamic',
                'slides'
            ],
            startdatetime__gte=startdate,
            startdatetime__lte=enddate
        )
        # NOTE(review): fetchcperg() reads from a table called 'ergcpdata'
        # -- confirm which name is correct.
        table = 'cpergdata'

    theids = [w.id for w in theworkouts]

    job = myqueue(
        queue,
        handle_updatecp,
        rower.id,
        theids,
        table=table)

    return job

def fetchcperg(rower,theworkouts):
    """Return the stored erg critical power data and queue a refresh.

    The rower's rows of the 'ergcpdata' table are read, then a
    handle_updateergcp job is queued with the stroke file names of
    ``theworkouts``.

    Returns:
        The data frame read from the database (from before the refresh).
    """
    # the ids are collected but not used further
    workoutids = [int(workout.id) for workout in theworkouts]
    csvfiles = [workout.csvfilename for workout in theworkouts]

    stored = getcpdata_sql(rower.id,table='ergcpdata')

    myqueue(queue, handle_updateergcp, rower.id, csvfiles)

    return stored


def fetchcp(rower,theworkouts,table='cpdata'):
    """Return the stored critical power curve and the average power per workout.

    Parameters:
        rower: rower whose critical power data is looked up.
        theworkouts: workouts whose power data is averaged.
        table: critical power table to read.

    Returns:
        Tuple ``(delta, cp, avgpower)``. ``delta`` and ``cp`` are the
        columns of the stored critical power data, or two empty Series when
        nothing usable is available (in which case a handle_updatecp job may
        have been queued). ``avgpower`` maps workout id to integer average
        power; it maps every id to 0 when there is no power data and is
        empty when the data cannot be grouped by workout.
    """
    # get all power data from database (plus workoutid)
    theids = [int(w.id) for w in theworkouts]
    columns = ['power','workoutid','time']
    df = getsmallrowdata_db(columns,ids=theids)
    df.dropna(inplace=True,axis=0)

    def _nocurve(avgpower):
        # result used whenever no critical power curve can be returned
        return pd.Series([]),pd.Series([]),avgpower

    if df.empty:
        return _nocurve(dict.fromkeys(theids, 0))

    try:
        dfgrouped = df.groupby(['workoutid'])
    except KeyError:
        return _nocurve({})

    try:
        avgpower2 = dict(dfgrouped.mean()['power'].astype(int))
    except KeyError:
        return _nocurve(dict.fromkeys(theids, 0))

    cpdf = getcpdata_sql(rower.id,table=table)

    if not cpdf.empty:
        return cpdf['delta'],cpdf['cp'],avgpower2

    # nothing stored yet: have the curve computed in the background
    myqueue(queue,
            handle_updatecp,
            rower.id,
            theids,
            table=table)

    return _nocurve(avgpower2)


# create a new workout from manually entered data
def create_row_df(r,distance,duration,startdatetime,workouttype='rower',
                  avghr=None,avgpwr=None,avgspm=None,
                  rankingpiece = False,
                  duplicate=False,
                  title='Manual entry',notes='',weightcategory='hwt',
                  adaptiveclass='None'):
    """Create a workout with synthetic, evenly spaced stroke data.

    Time and distance increase linearly over the workout; stroke rate, pace,
    power and heart rate are constant.

    Parameters:
        r: rower who owns the workout.
        distance: total distance in meters (None counts as 0).
        duration: object with hour/minute/second/microsecond attributes;
            None counts as 60 seconds.
        startdatetime: start of the workout, converted with arrow.
        workouttype: for 'rower', 'slides' and 'dynamic' the power is
            derived from the average speed; otherwise ``avgpwr`` is used.
        avghr, avgpwr, avgspm: optional averages used for the constant
            heart rate, power and stroke rate columns.
        rankingpiece, duplicate, title, notes, weightcategory,
        adaptiveclass: passed on to save_workout_database().

    Returns:
        Tuple ``(id, message)`` from save_workout_database().
    """
    if duration is not None:
        totalseconds = duration.hour*3600.
        totalseconds += duration.minute*60.
        totalseconds += duration.second
        totalseconds += duration.microsecond/1.e6
    else:
        totalseconds = 60.

    if distance is None:
        distance = 0

    # one stroke per 10 meters
    try:
        nr_strokes = int(distance/10.)
    except TypeError:
        nr_strokes = int(20.*totalseconds)

    # At least two strokes are needed to spread time and distance between
    # the first and the last row; with a single stroke the spacing below
    # divides by zero and produces NaN values.
    if nr_strokes < 2:
        nr_strokes = 100

    unixstarttime = arrow.get(startdatetime).timestamp

    if not avgspm:
        try:
            spm = 60.*nr_strokes/totalseconds
        except ZeroDivisionError:
            spm = 20.
    else:
        spm = avgspm

    # fraction of the workout completed at each stroke: 0 ... 1
    fraction = np.arange(nr_strokes)/float(nr_strokes-1)

    elapsed = fraction*totalseconds

    d = fraction*distance

    unixtime = unixstarttime + elapsed

    try:
        pace = 500.*totalseconds/distance
    except ZeroDivisionError:
        pace = 240.

    if workouttype in ['rower','slides','dynamic']:
        try:
            velo = distance/totalseconds
        except ZeroDivisionError:
            velo = 2.4
        power = 2.8*velo**3
    elif avgpwr is not None:
        power = avgpwr
    else:
        power = 0

    if avghr is not None:
        hr = avghr
    else:
        hr = 0

    df = pd.DataFrame({
        'TimeStamp (sec)': unixtime,
        ' Horizontal (meters)': d,
        ' Cadence (stokes/min)': spm,
        ' Stroke500mPace (sec/500m)':pace,
        ' ElapsedTime (sec)':elapsed,
        ' Power (watts)':power,
        ' HRCur (bpm)':hr,
    })

    timestr = strftime("%Y%m%d-%H%M%S")

    csvfilename = 'media/df_' + timestr + '.csv'
    # NOTE(review): this overwrites the elapsed time with the absolute time
    # stamps -- kept as is, confirm that it is intended.
    df[' ElapsedTime (sec)'] = df['TimeStamp (sec)']

    row = rrdata(df=df)

    row.write_csv(csvfilename, gzip = True)

    id, message = save_workout_database(csvfilename, r,
                                        title=title,
                                        notes=notes,
                                        rankingpiece=rankingpiece,
                                        duplicate=duplicate,
                                        dosmooth=False,
                                        workouttype=workouttype,
                                        consistencychecks=False,
                                        weightcategory=weightcategory,
                                        adaptiveclass=adaptiveclass,
                                        totaltime=totalseconds)

    return (id, message)

from rowers.utils import totaltime_sec_to_string

# Processes painsled CSV file to database
def save_workout_database(f2, r, dosmooth=True, workouttype='rower',
                          boattype='1x',
                          adaptiveclass='None',
                          weightcategory='hwt',
                          dosummary=True, title='Workout',
                          workoutsource='unknown',
                          notes='', totaldist=0, totaltime=0,
                          rankingpiece=False,
                          duplicate=False,
                          summary='',
                          makeprivate=False,
                          oarlength=2.89, inboard=0.88,
                          forceunit='lbs',
                          consistencychecks=False):
    """Create a Workout from a CSV stroke file and store its stroke data.

    Steps: read the file, resample it when the average sample interval is
    below one second (handing over to new_workout_from_df), optionally
    smooth the pace, recalculate erg power, derive totals, start time and
    time zone, flag overlapping workouts as duplicates, save the Workout,
    store the stroke data, and for on-the-water workouts check for a
    breakthrough or hard workout (with notification).

    Parameters:
        f2: path of the CSV stroke file.
        r: rower who owns the workout; also supplies heart rate and power
            zones.
        dosmooth: smooth the pace column before saving.
        dosummary: compute the summary text from the data instead of using
            ``summary``.
        totaldist, totaltime: totals; a value of 0 means "derive from the
            data".
        makeprivate: store the workout as 'hidden' instead of 'visible'.
        consistencychecks: report failed consistency checks to the user.
        Remaining parameters are stored on the Workout as given.

    Returns:
        Tuple ``(id, message)``. ``id`` is 0 when the file is missing or
        empty; ``message`` is None, an error text, or a duplicate warning.
    """
    message = None
    powerperc = 100 * np.array([r.pw_ut2,
                                r.pw_ut1,
                                r.pw_at,
                                r.pw_tr, r.pw_an]) / r.ftp

    # make workout and put in database
    rr = rrower(hrmax=r.max, hrut2=r.ut2,
                hrut1=r.ut1, hrat=r.at,
                hrtr=r.tr, hran=r.an, ftp=r.ftp,
                powerperc=powerperc, powerzones=r.powerzones)
    row = rdata(f2, rower=rr)

    # a result equal to 0 means the file could not be read; this must be
    # checked before row.df is touched
    if row == 0:
        return (0, 'Error: CSV data file not found')

    if row.df.empty:
        return (0, 'Error: CSV data file was empty')

    dtavg = row.df['TimeStamp (sec)'].diff().mean()

    if dtavg < 1:
        # more than one sample per second: average per second and store the
        # resampled data as a new workout
        newdf = df_resample(row.df)
        try:
            os.remove(f2)
        except OSError:
            pass
        return new_workout_from_df(r, newdf,
                                   title=title,boattype=boattype,
                                   workouttype=workouttype,
                                   workoutsource=workoutsource)

    # set before the checks so the name exists even when they raise
    allchecks = 1
    try:
        checks = row.check_consistency()
        for key, value in checks.items():
            if not value:
                allchecks = 0
                if consistencychecks:
                    a_messages.error(
                        r.user, 'Failed consistency check: ' + key + ', autocorrected')
                else:
                    pass
                    # a_messages.error(r.user,'Failed consistency check: '+key+', not corrected')
    except ZeroDivisionError:
        pass

    if not allchecks and consistencychecks:
        # row.repair()
        pass

    if dosmooth:
        # auto smoothing
        pace = row.df[' Stroke500mPace (sec/500m)'].values
        velo = 500. / pace

        f = row.df['TimeStamp (sec)'].diff().mean()
        if f != 0 and not np.isnan(f):
            windowsize = 2 * (int(10. / (f))) + 1
        else:
            windowsize = 1
        if not 'originalvelo' in row.df:
            row.df['originalvelo'] = velo

        if windowsize > 3 and windowsize < len(velo):
            velo2 = savgol_filter(velo, windowsize, 3)
        else:
            velo2 = velo

        velo3 = pd.Series(velo2)
        velo3 = velo3.replace([-np.inf, np.inf], np.nan)
        velo3 = velo3.fillna(method='ffill')

        pace2 = 500. / abs(velo3)

        row.df[' Stroke500mPace (sec/500m)'] = pace2

        row.df = row.df.fillna(0)

        row.write_csv(f2, gzip=True)
        try:
            os.remove(f2)
        except OSError:
            pass

    # recalculate power data
    if workouttype == 'rower' or workouttype == 'dynamic' or workouttype == 'slides':
        try:
            row.erg_recalculatepower()
            row.write_csv(f2, gzip=True)
        except:
            pass

    averagehr = row.df[' HRCur (bpm)'].mean()
    maxhr = row.df[' HRCur (bpm)'].max()

    if totaldist == 0:
        totaldist = row.df['cum_dist'].max()
    if totaltime == 0:
        totaltime = row.df['TimeStamp (sec)'].max(
        ) - row.df['TimeStamp (sec)'].min()
        try:
            totaltime = totaltime + row.df.loc[:, ' ElapsedTime (sec)'].iloc[0]
        except KeyError:
            pass

    if np.isnan(totaltime):
        totaltime = 0


    if dosummary:
        summary = row.allstats()


    timezone_str = 'UTC'
    try:
        workoutstartdatetime = timezone.make_aware(row.rowdatetime)
    except ValueError:
        workoutstartdatetime = row.rowdatetime

    # derive the time zone from the average position when the data has
    # coordinates; otherwise fall back to the rower's default time zone
    try:
        latavg = row.df[' latitude'].mean()
        lonavg = row.df[' longitude'].mean()

        tf = TimezoneFinder()
        try:
            timezone_str = tf.timezone_at(lng=lonavg, lat=latavg)
        except ValueError:
            timezone_str = 'UTC'
        if timezone_str == None:
            timezone_str = tf.closest_timezone_at(lng=lonavg,
                                                  lat=latavg)
            if timezone_str == None:
                timezone_str = r.defaulttimezone
        try:
            workoutstartdatetime = pytz.timezone(timezone_str).localize(
                row.rowdatetime
            )
        except ValueError:
            workoutstartdatetime = workoutstartdatetime.astimezone(
                pytz.timezone(timezone_str)
            )
    except KeyError:
        timezone_str = r.defaulttimezone

    duration = totaltime_sec_to_string(totaltime)

    workoutdate = workoutstartdatetime.astimezone(
        pytz.timezone(timezone_str)
    ).strftime('%Y-%m-%d')
    workoutstarttime = workoutstartdatetime.astimezone(
        pytz.timezone(timezone_str)
    ).strftime('%H:%M:%S')


    if makeprivate:
        privacy = 'hidden'
    else:
        privacy = 'visible'

    # checking for inf values

    totaldist = np.nan_to_num(totaldist)
    maxhr = np.nan_to_num(maxhr)
    averagehr = np.nan_to_num(averagehr)

    dragfactor = 0
    if workouttype in otetypes:
        dragfactor = row.dragfactor

    t = datetime.datetime.strptime(duration,"%H:%M:%S.%f")
    delta = datetime.timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)

    workoutenddatetime = workoutstartdatetime+delta

    # check for duplicate start times and duration
    ws = Workout.objects.filter(user=r,date=workoutdate,duplicate=False).exclude(
        startdatetime__gt=workoutenddatetime
    )

    # keep the workouts that end after this one starts, i.e. overlap with it
    ws2 = []

    for ww in ws:
        t = ww.duration
        delta = datetime.timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)
        enddatetime = ww.startdatetime+delta
        if enddatetime > workoutstartdatetime:
            ws2.append(ww)


    if (len(ws2) != 0):
        message = "Warning: This workout overlaps with an existing one and was marked as a duplicate"
        duplicate = True


    w = Workout(user=r, name=title, date=workoutdate,
                workouttype=workouttype,
                boattype=boattype,
                dragfactor=dragfactor,
                duration=duration, distance=totaldist,
                weightcategory=weightcategory,
                adaptiveclass=adaptiveclass,
                starttime=workoutstarttime,
                duplicate=duplicate,
                workoutsource=workoutsource,
                rankingpiece=rankingpiece,
                forceunit=forceunit,
                csvfilename=f2, notes=notes, summary=summary,
                maxhr=maxhr, averagehr=averagehr,
                startdatetime=workoutstartdatetime,
                inboard=inboard, oarlength=oarlength,
                timezone=timezone_str,
                privacy=privacy)

    try:
        w.save()
    except ValidationError:
        w.startdatetime = timezone.now()
        w.save()

    if privacy == 'visible':
        ts = Team.objects.filter(rower=r)
        for t in ts:
            w.team.add(t)

    # put stroke data in database
    res = dataprep(row.df, id=w.id, bands=True,
                   barchart=True, otwpower=True, empower=True, inboard=inboard)

    rscore,normp = workout_rscore(w)
    trimp,hrtss = workout_trimp(w)

    isbreakthrough = False
    ishard = False
    if workouttype == 'water':
        df = getsmallrowdata_db(['power', 'workoutid', 'time'], ids=[w.id])
        try:
            powermean = df['power'].mean()
        except KeyError:
            powermean = 0

        if powermean != 0:
            thesecs = totaltime
            maxt = 1.05 * thesecs
            if maxt > 0:
                logarr = datautils.getlogarr(maxt)
                dfgrouped = df.groupby(['workoutid'])
                delta, cpvalues, avgpower = datautils.getcp(dfgrouped, logarr)

                res, btvalues, res2 = utils.isbreakthrough(
                    delta, cpvalues, r.p0, r.p1, r.p2, r.p3, r.cpratio)
            else:
                res = 0
                res2 = 0
            if res:
                isbreakthrough = True
                res = datautils.updatecp(delta, cpvalues, r)
            if res2 and not isbreakthrough:
                ishard = True

    # submit email task to send email about breakthrough workout
    if isbreakthrough:
        a_messages.info(
            r.user, 'It looks like you have a new breakthrough workout'
        )
        if r.getemailnotifications and not r.emailbounced:
            job = myqueue(queuehigh,handle_sendemail_breakthrough,
                          w.id,
                          r.user.email,
                          r.user.first_name,
                          r.user.last_name,
                          btvalues=btvalues.to_json())

    # submit email task to send email about hard workout
    if ishard:
        a_messages.info(r.user, 'That was a pretty hard workout')
        if r.getemailnotifications and not r.emailbounced:
            job = myqueue(queuehigh,handle_sendemail_hard,
                          w.id,
                          r.user.email,
                          r.user.first_name,
                          r.user.last_name,
                          btvalues=btvalues.to_json())


    return (w.id, message)

# Maps a file format name (as used for the ``fileformat`` argument of
# parsenonpainsled) to the rowingdata parser class that reads files of that
# format.
parsers = {
    'kinomap': KinoMapParser,
    'xls': ExcelTemplate,
    'rp': RowProParser,
    'tcx':TCXParser,
    'mystery':MysteryParser,
    'ritmotime':RitmoTimeParser,
    'quiske': QuiskeParser,
    'rowperfect3': RowPerfectParser,
    'coxmate': CoxMateParser,
    'bcmike': BoatCoachAdvancedParser,
    'boatcoach': BoatCoachParser,
    'boatcoachotw': BoatCoachOTWParser,
    'painsleddesktop': painsledDesktopParser,
    'speedcoach': speedcoachParser,
    'speedcoach2': SpeedCoach2Parser,
    'ergstick': ErgStickParser,
    'fit': FITParser,
    'ergdata': ErgDataParser,
    'humon': HumonParser,
    }

def parsenonpainsled(fileformat,f2,summary):
    """Parse file ``f2`` with the parser registered for ``fileformat``.

    Returns a tuple ``(row, hasrecognized, summary, fileformat)``:

    * ``row`` is the parser instance, or ``None`` when no parser could be
      created (previously the name was left unbound in that case and the
      ``return`` statement raised ``UnboundLocalError``).
    * ``hasrecognized`` tells whether a parser was found and created.
    * ``summary`` is the incoming summary, replaced for SpeedCoach 2 and
      FIT files when one can be extracted.
    * ``fileformat`` is the incoming format; for SpeedCoach 2 files it is
      extended with the firmware version (``speedcoach2v<firmware>``, or
      ``speedcoach2v0`` when no firmware version is reported).
    """
    try:
        row = parsers[fileformat](f2)
    except KeyError:
        # Unknown format (or a KeyError raised while parsing): there is
        # nothing to post-process, so report "not recognized" right away.
        return None, False, summary, fileformat

    hasrecognized = True

    # handle speed coach GPS 2
    if (fileformat == 'speedcoach2'):
        # NOTE(review): the rigging values are read here but are not part
        # of the return value, so callers never see them. The call is kept
        # because a failure in it makes the caller treat the file as
        # unrecognized.
        oarlength, inboard = get_empower_rigging(f2)
        empowerfirmware = get_empower_firmware(f2)
        if empowerfirmware != '':
            fileformat = fileformat+'v'+str(empowerfirmware)
        else:
            fileformat = 'speedcoach2v0'
        summary = row.allstats()

    # handle FIT
    if (fileformat == 'fit'):
        try:
            s = fitsummarydata(f2)
            s.setsummary()
            summary = s.summarytext
        except Exception:
            # The summary is optional: keep the incoming one on failure.
            pass

    return row,hasrecognized,summary,fileformat

def handle_nonpainsled(f2, fileformat, summary=''):
    """Convert a non-Painsled file into a gzipped Painsled-compatible CSV.

    The file is parsed with parsenonpainsled(), written out again through
    a rowingdata object under the name ``f2[:-4] + 'o.csv'`` (gzipped),
    and the original upload is removed.

    Returns ``(newfilename, summary, oarlength, inboard, fileformat)`` on
    success and ``(0,0,0,0,0)`` when the file is a logbook summary, is not
    recognized, or cannot be parsed or written.
    """
    oarlength = 2.89
    inboard = 0.88
    failure = (0,0,0,0,0)

    try:
        row,hasrecognized,summary,fileformat = parsenonpainsled(fileformat,f2,summary)
    except Exception:
        # Any parsing problem is reported to the caller as "not recognized".
        # Only Exception is caught so that KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        hasrecognized = False

    # Handle c2log
    if (fileformat == 'c2log' or fileformat == 'rowprolog'):
        return failure

    if not hasrecognized:
        return failure

    f_to_be_deleted = f2
    f2 = f2[:-4] + 'o.csv'
    try:
        row2 = rrdata(df = row.df)
        row2.write_csv(f2, gzip=True)
    except Exception:
        return failure

    # Remove the original upload; it may exist either plain or gzipped.
    # Removal is best effort: a leftover file is not an error.
    try:
        os.remove(f_to_be_deleted)
    except OSError:
        try:
            os.remove(f_to_be_deleted + '.gz')
        except OSError:
            pass

    return (f2, summary, oarlength, inboard, fileformat)

# Create new workout from file and store it in the database
# This routine should be used everywhere in views.py and mailprocessing.py
# Currently there is code duplication


def new_workout_from_file(r, f2,
                          workouttype='rower',
                          workoutsource=None,
                          title='Workout',
                          boattype='1x',
                          makeprivate=False,
                          notes=''):
    """Create a workout from an uploaded file and store it in the database.

    The file type is detected with get_file_type(). Files that cannot be
    used (logbook summaries, files without strokes, KML, corrupt archives)
    are deleted and rejected with a message. Unrecognized files are copied
    and mailed to the site owner. Zip archives are handed to the
    'workouts' mailbox as an attachment. Everything except Painsled CSV is
    first converted with handle_nonpainsled().

    Returns ``(id, message, filename)``. ``id`` is 0 when the file was
    rejected and -1 when a zip archive was queued through the mailbox.
    """
    corrupt_message = "Rowsandall could not process this file. The extension is supported but the file seems corrupt. Contact info@rowsandall.com if you think this is incorrect."

    message = None
    try:
        fileformat = get_file_type(f2)
    except IOError:
        os.remove(f2)
        return (0, corrupt_message, f2)

    summary = ''
    oarlength = 2.89
    inboard = 0.88
    if len(fileformat) == 3 and fileformat[0] == 'zip':
        workoutsbox = Mailbox.objects.filter(name='workouts')[0]
        msg = Message(mailbox=workoutsbox,
                      from_header=r.user.email,
                      subject = title)
        msg.save()
        f3 = 'media/mailbox_attachments/'+f2[6:]
        copyfile(f2,f3)
        f3 = f3[6:]
        a = MessageAttachment(message=msg,document=f3)
        a.save()

        return -1, message, f2

    # File types that are deleted and rejected with an explanation.
    rejections = {
        # Some people try to upload Concept2 logbook summaries
        'c2log': "This  summary does not contain stroke data. Use the files containing stroke by stroke data.",
        'nostrokes': "It looks like this file doesn't contain stroke data.",
        'kml': "KML files are not supported",
        # Some people upload corrupted zip files
        'notgzip': corrupt_message,
        # Some people try to upload RowPro summary logs
        'rowprolog': "This RowPro logbook summary does not contain stroke data. Please use the Stroke Data CSV file for the individual workout in your log.",
    }
    if fileformat in rejections:
        os.remove(f2)
        return (0, rejections[fileformat], f2)

    # Sometimes people try an unsupported file type.
    # Send an email to info@rowsandall.com with the file attached
    # for me to check if it is a bug, or a new file type
    # worth supporting
    if fileformat == 'unknown':
        message = "We couldn't recognize the file type"
        f4 = f2[:-5]+'a'+f2[-5:]
        copyfile(f2,f4)
        job = myqueue(queuehigh,
                      handle_sendemail_unrecognized,
                      f4,
                      r.user.email)

        return (0, message, f2)

    # handle non-Painsled by converting it to painsled compatible CSV
    if (fileformat != 'csv'):
        try:
            f2, summary, oarlength, inboard, fileformat = handle_nonpainsled(
                f2,
                fileformat,
                summary=summary
            )
            if not f2:
                message = 'Something went wrong'
                return (0, message, '')
        except Exception as e:
            # str(e) instead of e.message: the latter does not exist on
            # Python 3 and would raise AttributeError here.
            message = 'Something went wrong: ' + str(e)
            return (0, message, '')

    dosummary = (fileformat != 'fit' and 'speedcoach2' not in fileformat)
    dosummary = dosummary or summary == ''

    if workoutsource is None:
        workoutsource = fileformat

    # notes is forwarded as well; it used to be accepted but ignored.
    workout_id, message = save_workout_database(
        f2, r,
        workouttype=workouttype,
        weightcategory=r.weightcategory,
        adaptiveclass=r.adaptiveclass,
        boattype=boattype,
        makeprivate=makeprivate,
        dosummary=dosummary,
        workoutsource=workoutsource,
        summary=summary,
        notes=notes,
        inboard=inboard, oarlength=oarlength,
        title=title
    )

    return (workout_id, message, f2)


def split_workout(r, parent, splitsecond, splitmode):
    """Split workout ``parent`` in two at ``splitsecond`` seconds.

    ``splitmode`` is tested with ``in`` for the markers 'keep first',
    'keep second', 'keep original', 'firstprivate', 'secondprivate' and
    'originalprivate'. Kept halves are stored as new workouts through
    new_workout_from_df(); the parent is deleted unless 'keep original'
    is present (and at least one half is kept).

    Returns ``(ids, messages)``: the hex-encoded ids of the resulting
    workouts and the messages collected along the way.
    """

    def _regularize(part):
        # Sort by time, fill gaps, and collapse duplicate time stamps.
        part = part.sort_values(['time'])
        part = part.interpolate(method='linear', axis=0,
                                limit_direction='both', limit=10)
        part.fillna(method='bfill', inplace=True)

        # Some new stuff to try out
        part = part.groupby('time', axis=0).mean()
        part['time'] = part.index
        part.reset_index(drop=True, inplace=True)
        return part

    data, row = getrowdata_db(id=parent.id)
    latitude, longitude = get_latlon(parent.id)
    if not latitude.empty and not longitude.empty:
        data[' latitude'] = latitude
        data[' longitude'] = longitude

    data['time'] = data['time'] / 1000.

    first = _regularize(data[data['time'] <= splitsecond].copy())
    second = _regularize(data[data['time'] > splitsecond].copy())

    first['pace'] = first['pace'] / 1000.
    second['pace'] = second['pace'] / 1000.

    first.drop_duplicates(subset='time', inplace=True)
    second.drop_duplicates(subset='time', inplace=True)

    messages = []
    ids = []

    if 'keep first' in splitmode:
        new_id, message = new_workout_from_df(
            r, first,
            title=parent.name + ' (1)',
            parent=parent,
            setprivate='firstprivate' in splitmode,
            forceunit='N')
        messages.append(message)
        ids.append(encoder.encode_hex(new_id))

    if 'keep second' in splitmode:
        # Let the second half start at zero distance and zero time.
        for col in ('cumdist', 'distance', 'time'):
            start = second.iloc[0, second.columns.get_loc(col)]
            second[col] = second[col] - start

        setprivate = 'secondprivate' in splitmode
        dt = datetime.timedelta(seconds=splitsecond)

        new_id, message = new_workout_from_df(
            r, second,
            title=parent.name + ' (2)',
            parent=parent,
            setprivate=setprivate,
            dt=dt, forceunit='N')
        messages.append(message)
        ids.append(encoder.encode_hex(new_id))

    if 'keep original' in splitmode:
        if 'originalprivate' in splitmode:
            parent.privacy = 'hidden'
            parent.save()
    elif 'keep second' in splitmode or 'keep first' in splitmode:
        parent.delete()
        messages.append('Deleted Workout: ' + parent.name)
    else:
        # Nothing would be left, so the parent is kept instead.
        messages.append('That would delete your workout')
        ids.append(encoder.encode_hex(parent.id))

    return ids, messages

# Create new workout from data frame and store it in the database
# This routine should be used everywhere in views.py and mailprocessing.py
# Currently there is code duplication


def new_workout_from_df(r, df,
                        title='New Workout',
                        workoutsource='unknown',
                        boattype='1x',
                        workouttype='rower',
                        parent=None,
                        setprivate=False,
                        forceunit='lbs',
                        dt=datetime.timedelta()):
    """Store the stroke data in ``df`` as a new workout for rower ``r``.

    When ``parent`` is given, rigging, source, type, boat type, notes and
    privacy are copied from it and the start time is the parent's start
    time plus ``dt``; otherwise defaults are used and the workout starts
    now. ``setprivate`` forces the new workout to be private.

    ``forceunit='N'`` means the force columns are in Newton; they are
    converted to lbs before the data is written.

    NOTE: ``df`` is modified in place (force conversion, column renaming
    and added time columns).

    Returns ``(id, message)`` as delivered by save_workout_database().
    """
    import uuid

    message = None

    if parent:
        oarlength = parent.oarlength
        inboard = parent.inboard

        workoutsource = parent.workoutsource
        workouttype = parent.workouttype
        boattype = parent.boattype
        notes = parent.notes
        makeprivate = parent.privacy == 'hidden'

        startdatetime = parent.startdatetime + dt
    else:
        oarlength = 2.89
        inboard = 0.88
        notes = ''
        makeprivate = False
        startdatetime = timezone.now()

    if setprivate:
        makeprivate = True

    timestr = strftime("%Y%m%d-%H%M%S")

    # The time stamp only has a resolution of one second, and
    # split_workout() calls this function twice in a row. A random suffix
    # keeps the second file from overwriting the first.
    csvfilename = 'media/df_' + timestr + '_' + uuid.uuid4().hex[:8] + '.csv'
    if forceunit == 'N':
        # change to lbs for now
        df['peakforce'] /= lbstoN
        df['averageforce'] /= lbstoN

    df.rename(columns=columndict, inplace=True)

    #starttimeunix = mktime(startdatetime.utctimetuple())
    starttimeunix = arrow.get(startdatetime).timestamp
    # The incoming time stamps are relative to the workout start: keep
    # them as elapsed time and shift the time stamp column by the start.
    df[' ElapsedTime (sec)'] = df['TimeStamp (sec)']

    df['TimeStamp (sec)'] = df['TimeStamp (sec)'] + starttimeunix

    row = rrdata(df=df)
    row.write_csv(csvfilename, gzip=True)

    workout_id, message = save_workout_database(csvfilename, r,
                                                workouttype=workouttype,
                                                boattype=boattype,
                                                title=title,
                                                workoutsource=workoutsource,
                                                notes=notes,
                                                oarlength=oarlength,
                                                inboard=inboard,
                                                makeprivate=makeprivate,
                                                dosmooth=False,
                                                consistencychecks=False)

    return (workout_id, message)


# Compare the data from the CSV file and the database
# Currently only calculates number of strokes. To be expanded with
# more elaborate testing if needed
def compare_data(id):
    """Compare the number of strokes in the CSV file and in the database.

    Returns ``(ok, ldb, lfile)``: ``ok`` is True when both counts are
    equal and non-zero, ``ldb`` is the number of strokedata rows for the
    workout and ``lfile`` the number of rows read from its CSV file.
    A count that cannot be determined is reported as 0.
    """
    row = Workout.objects.get(id=id)
    f1 = row.csvfilename
    try:
        rowdata = rdata(f1)
        l1 = len(rowdata.df)
    except AttributeError:
        l1 = 0

    # Default for a failing query; the name used to stay unbound in that
    # case, which made the return statement raise NameError.
    l2 = 0
    engine = create_engine(database_url, echo=False)
    query = sa.text('SELECT COUNT(*) FROM strokedata WHERE workoutid={id};'.format(
        id=id,
    ))
    try:
        with engine.connect() as conn, conn.begin():
            try:
                res = conn.execute(query)
                l2 = res.fetchall()[0][0]
            except Exception:
                print("Database Locked")
    finally:
        # Dispose of the engine even when connecting fails.
        engine.dispose()

    lfile = l1
    ldb = l2
    return l1 == l2 and l1 != 0, ldb, lfile

# Repair data for workouts where the CSV file is lost (or the DB entries
# don't exist)


def repair_data(verbose=False):
    """Repair workouts whose CSV file and database rows disagree.

    For every workout failing compare_data():

    * the stroke data in the database is rebuilt from the CSV file when
      the file can be read and has rows;
    * when the file holds no rows, the CSV file is regenerated from the
      database, or the workout is deleted when the database has no rows
      for it either.

    With ``verbose`` a dot is written per workout and the id and both
    counts are printed for every workout that needs repair.
    """
    for w in Workout.objects.all():
        if verbose:
            sys.stdout.write(".")
        test, ldb, lfile = compare_data(w.id)
        if test:
            continue

        if verbose:
            print(w.id, lfile, ldb)
        try:
            rowdata = rdata(w.csvfilename)
            if rowdata and len(rowdata.df):
                update_strokedata(w.id, rowdata.df)
        except (IOError, AttributeError):
            pass

        if lfile != 0:
            continue

        # No usable CSV file: rebuild it from the database, or delete the
        # workout when there is nothing in the database either.
        try:
            data = read_df_sql(w.id)
            try:
                datalength = len(data)
            except AttributeError:
                datalength = 0

            if datalength != 0:
                data.rename(columns=columndict, inplace=True)
                data.to_csv(w.csvfilename + '.gz',
                            index_label='index',
                            compression='gzip')
            else:
                w.delete()
        except Exception:
            # Best effort: a workout that cannot be repaired is skipped.
            # Exception (not a bare except) so that Ctrl-C still stops
            # the loop.
            pass

# A wrapper around the rowingdata class, with some error catching


def rdata(file, rower=None):
    """Read ``file`` into a rowingdata object, with some error catching.

    When reading ``file`` fails with IOError or IndexError, the same name
    with '.gz' appended is tried. When that fails too, or when reading
    ``file`` raises EOFError, an empty rowingdata object is returned.

    ``rower`` is the rower object handed to rowingdata; when omitted a
    new default rower is created for each call (the default used to be a
    single instance created at import time and shared by all calls).
    """
    if rower is None:
        rower = rrower()

    try:
        return rrdata(csvfile=file, rower=rower)
    except (IOError, IndexError):
        try:
            return rrdata(csvfile=file + '.gz', rower=rower)
        except Exception:
            # Whatever goes wrong with the gzipped file: return empty data.
            return rrdata()
    except EOFError:
        return rrdata()

# Remove all stroke data for workout ID from database


def delete_strokedata(id):
    """Remove all strokedata rows for workout ``id`` from the database.

    A failing DELETE is reported with a printed message and otherwise
    ignored.
    """
    engine = create_engine(database_url, echo=False)
    query = sa.text('DELETE FROM strokedata WHERE workoutid={id};'.format(
        id=id,
    ))
    try:
        # The with statement closes the connection on exit.
        with engine.connect() as conn, conn.begin():
            try:
                conn.execute(query)
            except Exception:
                print("Database Locked")
    finally:
        # Dispose of the engine even when connecting fails.
        engine.dispose()

# Replace stroke data in DB with data from CSV file


def update_strokedata(id, df):
    """Replace the stored stroke data of workout ``id`` with ``df``.

    The existing rows are deleted first; dataprep() is then run on ``df``
    for this workout id. Its return value is not used.
    """
    delete_strokedata(id)
    dataprep(df, id=id, bands=True, barchart=True, otwpower=True)

# Test that all data are of a numerical type


def testdata(time, distance, pace, spm):
    t1 = np.issubdtype(time, np.number)
    t2 = np.issubdtype(distance, np.number)
    t3 = np.issubdtype(pace, np.number)
    t4 = np.issubdtype(spm, np.number)

    return t1 and t2 and t3 and t4

# Get data from DB for one workout (fetches all data). If data
# is not in DB, read from CSV file (and create DB entry)


def getrowdata_db(id=0, doclean=False, convertnewtons=True,
                  checkefficiency=True):
    """Fetch all stroke data of workout ``id`` from the database.

    When the database has no rows for the workout, the data is read from
    the CSV file and run through dataprep(). When ``checkefficiency`` is
    True and the efficiency column averages zero while power does not,
    the data is replaced by the result of add_efficiency(). With
    ``doclean`` the data is passed through clean_df_stats().

    ``convertnewtons`` is accepted but not used in this function.

    Returns ``(data, row)``: the data and the Workout object.
    """
    data = read_df_sql(id)
    data['x_right'] = data['x_right'] / 1.0e6
    data['deltat'] = data['time'].diff()

    if not data.empty:
        row = Workout.objects.get(id=id)
    else:
        rowdata, row = getrowdata(id=id)
        if rowdata.empty:
            # returning empty dataframe
            data = pd.DataFrame()
        else:
            data = dataprep(rowdata.df, id=id, bands=True,
                            barchart=True, otwpower=True)

    # The conditions are evaluated in this order and short-circuit, so
    # the columns are only touched when there is data.
    efficiency_missing = (
        not data.empty
        and data['efficiency'].mean() == 0
        and data['power'].mean() != 0
        and checkefficiency == True
    )
    if efficiency_missing:
        data = add_efficiency(id=id)

    if doclean:
        data = clean_df_stats(data, ignorehr=True)

    return data, row

# Fetch a subset of the data from the DB


def getsmallrowdata_db(columns, ids=None, doclean=True, workstrokesonly=True):
    """Fetch a subset of the stroke data columns for a list of workouts.

    Columns stored in the database are read with read_cols_df_sql().
    Requested columns that are not in the database are, when exactly one
    workout is requested, taken from the workout's CSV file, back-filled
    and smoothed with a Savitzky-Golay filter where possible; a column
    that cannot be found in the file is set to 0.

    ``ids`` defaults to no workouts (``None`` is treated as an empty
    list). With ``doclean`` the data is passed through clean_df_stats()
    (``workstrokesonly`` is forwarded to it), after which columns without
    any value and rows with a missing value are dropped.

    Returns a pandas DataFrame.
    """
    if ids is None:
        ids = []

    prepmultipledata(ids)
    data,extracols = read_cols_df_sql(ids, columns)
    if extracols and len(ids)==1:
        w = Workout.objects.get(id=ids[0])
        row = rdata(w.csvfilename)
        try:
            row.set_instroke_metrics()
        except AttributeError:
            pass

        # Mean time between strokes; determines the smoothing window.
        try:
            f = row.df['TimeStamp (sec)'].diff().mean()
        except (AttributeError,KeyError):
            f = 0

        if f != 0 and not np.isnan(f):
            windowsize = 2 * (int(10. / (f))) + 1
        else:
            windowsize = 1
        for c in extracols:
            try:
                cdata = row.df[c]
                cdata.fillna(inplace=True,method='bfill')
                # This doesn't work because sometimes data are duplicated at save
                try:
                    cdata2 = savgol_filter(cdata.values,windowsize,3)
                    data[c] = cdata2
                except ValueError:
                    # Fall back to the unsmoothed column.
                    data[c] = cdata

            except (KeyError, AttributeError):
                data[c] = 0

    if doclean:
        data = clean_df_stats(data, ignorehr=True,
                              workstrokesonly=workstrokesonly)
        data.dropna(axis=1,how='all',inplace=True)
        data.dropna(axis=0,how='any',inplace=True)

    return data

# Fetch both the workout and the workout stroke data (from CSV file)


def getrowdata(id=0):
    """Fetch the Workout with ``id`` and its stroke data from the CSV file.

    The file is read with a rower object built from the heart rate
    settings and ftp of the workout's owner.

    Returns ``(rowdata, row)``: the rowingdata object and the Workout.
    """
    # check if valid ID exists (workout exists)
    row = Workout.objects.get(id=id)

    owner = row.user
    # The owner's user is looked up although its value is not used below;
    # kept so the attribute access still happens as before.
    owner.user

    heartrate_settings = rrower(
        hrmax=owner.max, hrut2=owner.ut2,
        hrut1=owner.ut1, hrat=owner.at,
        hrtr=owner.tr, hran=owner.an, ftp=owner.ftp,
    )

    return rdata(row.csvfilename, rower=heartrate_settings), row

# Checks if all rows for a list of workout IDs have entries in the
# stroke_data table. If this is not the case, it creates the stroke
# data
# In theory, this should never yield any work, but it's a good
# safety net for programming errors elsewhere in the app
# Also used heavily when I moved from CSV file only to CSV+Stroke data


def prepmultipledata(ids, verbose=False):
    """Create stroke data for those workouts in ``ids`` that have none.

    The workout ids present in the strokedata table are read, and for
    every requested id that is missing there the CSV file is read and
    passed to dataprep(). With ``verbose`` each such id is printed.

    Returns the list of ids that were missing from the table.
    """
    engine = create_engine(database_url, echo=False)
    query = sa.text('SELECT DISTINCT workoutid FROM strokedata')

    with engine.connect() as conn, conn.begin():
        fetched = conn.execute(query).fetchall()
        present = list(itertools.chain.from_iterable(fetched))
    conn.close()
    engine.dispose()

    try:
        requested = [int(workoutid) for workoutid in ids]
    except ValueError:
        requested = ids

    missing = list(set(requested) - set(present))
    for workoutid in missing:
        rowdata, row = getrowdata(id=workoutid)
        if verbose:
            print(workoutid)
        if rowdata and len(rowdata.df):
            dataprep(rowdata.df, id=workoutid, bands=True,
                     barchart=True, otwpower=True)

    return missing

# Read a set of columns for a set of workout ids, returns data as a
# pandas dataframe


def read_cols_df_sql(ids, columns, convertnewtons=True):
    """Read a set of columns for a set of workout ids from the database.

    Requested columns that are not fields of StrokeData are not queried;
    they are returned separately. 'distance', 'spm' and 'workoutid' are
    always read. Missing values are replaced by 0, and force columns of
    workouts stored in lbs are multiplied by ``lbstoN``.

    ``convertnewtons`` is accepted but not used in this function.

    Returns ``(df, extracols)``: the pandas DataFrame and the list of
    requested columns that are not in the database.
    """
    prepmultipledata(ids)
    # drop columns that are not in the official list
    axx = [f.name for f in StrokeData._meta.get_fields()]

    dbcolumns = []
    extracols = []
    for c in columns:
        if c in axx:
            dbcolumns.append(c)
        else:
            extracols.append(c)

    columns = dbcolumns + ['distance', 'spm', 'workoutid']
    columns = [x for x in columns if x != 'None']
    columns = list(set(columns))
    ids = [int(id) for id in ids]
    cls = ', '.join(columns)

    if len(ids) == 0:
        query = sa.text('SELECT {columns} FROM strokedata WHERE workoutid=0'.format(
            columns=cls,
        ))
    elif len(ids) == 1:
        query = sa.text('SELECT {columns} FROM strokedata WHERE workoutid={id}'.format(
            id=ids[0],
            columns=cls,
        ))
    else:
        query = sa.text('SELECT {columns} FROM strokedata WHERE workoutid IN {ids}'.format(
            columns=cls,
            ids=tuple(ids),
        ))

    # The raw connection that used to be checked out here was never used
    # or closed; pandas gets its own connection from the engine.
    engine = create_engine(database_url, echo=False)
    try:
        df = pd.read_sql_query(query, engine)
    finally:
        engine.dispose()

    df = df.fillna(value=0)

    # Convert forces stored in lbs; the workouts are only queried once
    # for both force columns.
    forcecolumns = [c for c in ('peakforce', 'averageforce') if c in columns]
    if forcecolumns:
        lbs_ids = [w.id for w in Workout.objects.filter(id__in=ids)
                   if w.forceunit == 'lbs']
        for workoutid in lbs_ids:
            mask = df['workoutid'] == workoutid
            for c in forcecolumns:
                df.loc[mask, c] = df.loc[mask, c] * lbstoN

    return df,extracols

# Read stroke data from the DB for a Workout ID. Returns a pandas dataframe


def read_df_sql(id):
    """Read all stroke data of workout ``id`` from the database.

    Missing values are replaced by 0. When the workout's force unit is
    'lbs', the force columns that are present are multiplied by
    ``lbstoN``.

    Returns a pandas DataFrame.
    """
    engine = create_engine(database_url, echo=False)
    statement = sa.text(
        'SELECT * FROM strokedata WHERE workoutid={id}'.format(id=id)
    )
    df = pd.read_sql_query(statement, engine)
    engine.dispose()

    df = df.fillna(value=0)

    if Workout.objects.get(id=id).forceunit == 'lbs':
        for forcecolumn in ('peakforce', 'averageforce'):
            try:
                df[forcecolumn] = df[forcecolumn] * lbstoN
            except KeyError:
                # column not present: nothing to convert
                pass

    return df

# Get the necessary data from the strokedata table in the DB.
# For the flex plot


def smalldataprep(therows, xparam, yparam1, yparam2):
    """Collect the columns needed for the flex plot from the CSV files.

    For each workout in ``therows`` the CSV file (or, when that raises
    IOError, its '.gz' variant) is read and run through dataprep(); the
    columns ``xparam``, ``yparam1``, ``yparam2``, 'distance' and 'spm'
    are appended to the result. A ``yparam2`` of 'None' is replaced by
    'power'. Workouts whose files cannot be read are skipped.

    Returns a pandas DataFrame.
    """
    if yparam2 == 'None':
        yparam2 = 'power'

    def _appended(collected, workout, filename):
        # Read one file and return `collected` extended with its columns.
        prepared = dataprep(rrdata(csvfile=filename).df)
        selection = pd.DataFrame({xparam: prepared[xparam],
                                  yparam1: prepared[yparam1],
                                  yparam2: prepared[yparam2],
                                  'distance': prepared['distance'],
                                  'spm': prepared['spm'],
                                  })
        if workout.forceunit == 'lbs':
            for forcecolumn in ('peakforce', 'averageforce'):
                try:
                    selection[forcecolumn] *= lbstoN
                except KeyError:
                    pass
        return pd.concat([collected, selection], ignore_index=True)

    df = pd.DataFrame()
    for column in (xparam, yparam1, yparam2, 'distance', 'spm'):
        df[column] = []

    for workout in therows:
        f1 = workout.csvfilename

        try:
            df = _appended(df, workout, f1)
        except IOError:
            try:
                df = _appended(df, workout, f1 + '.gz')
            except IOError:
                pass

    return df

# data fusion


def datafusion(id1, id2, columns, offset):
    """Fuse ``columns`` of workout ``id2`` into the data of workout ``id1``.

    The data of workout 1 (with coordinates added) loses the columns
    listed in ``columns``; those are taken from workout 2 instead, with
    its time axis shifted by the timedelta ``offset``. Both sets are
    concatenated, sorted on time, interpolated and averaged per time
    stamp.

    Returns ``(df, forceunit)``: the fused DataFrame, with time and pace
    divided by 1000, and the force unit 'N'.
    """
    # Both lookups raise when a workout does not exist; the objects
    # themselves are not needed.
    Workout.objects.get(id=id1)
    Workout.objects.get(id=id2)

    df1, w1 = getrowdata_db(id=id1)
    df1 = df1.drop([  # 'cumdist',
        'hr_ut2',
        'hr_ut1',
        'hr_at',
        'hr_tr',
        'hr_an',
        'hr_max',
        'ftime',
        'fpace',
        'workoutid',
        'id'],
        axis=1, errors='ignore')

    # Add coordinates to DataFrame
    latitude, longitude = get_latlon(id1)

    df1[' latitude'] = latitude
    df1[' longitude'] = longitude

    df2 = getsmallrowdata_db(['time'] + columns, ids=[id2], doclean=False)

    forceunit = 'N'

    offsetmillisecs = offset.seconds * 1000 + offset.microseconds / 1000.
    offsetmillisecs += offset.days * (3600 * 24 * 1000)
    df2['time'] = df2['time'] + offsetmillisecs

    # Drop from workout 1 the columns that come from workout 2. A fused
    # column that workout 1 does not have is simply ignored (it used to
    # raise KeyError).
    df1 = df1.drop(list(columns), axis=1, errors='ignore')

    df = pd.concat([df1, df2], ignore_index=True)
    df = df.sort_values(['time'])
    df = df.interpolate(method='linear', axis=0, limit_direction='both',
                        limit=10)
    df.fillna(method='bfill', inplace=True)

    # Some new stuff to try out
    df = df.groupby('time', axis=0).mean()
    df['time'] = df.index
    df.reset_index(drop=True, inplace=True)

    df['time'] = df['time'] / 1000.
    df['pace'] = df['pace'] / 1000.
    df['cum_dist'] = df['cumdist']

    return df, forceunit


def fix_newtons(id=0, limit=3000):
    """Rebuild the stroke data of workout ``id`` when its forces look wrong.

    When the mean of the stored peak force exceeds ``limit``, the stroke
    data in the database is replaced by data read from the CSV file.
    Nothing happens when there is no peak force column.
    """
    strokes = getsmallrowdata_db(['peakforce'], ids=[id], doclean=False)
    try:
        if not strokes['peakforce'].mean() > limit:
            return
        w = Workout.objects.get(id=id)
        print("fixing ", id)
        fromfile = rdata(w.csvfilename)
        if fromfile and len(fromfile.df):
            update_strokedata(w.id, fromfile.df)
    except KeyError:
        pass

def remove_invalid_columns(df):
    """Drop, in place, every column of ``df`` not in ``allowedcolumns``.

    All invalid columns are dropped in one call. Dropping them one by one
    while iterating over the columns raised KeyError when an invalid
    column name occurred more than once.

    Returns the same (modified) DataFrame.
    """
    invalid = [c for c in df.columns if c not in allowedcolumns]
    if invalid:
        df.drop(labels=invalid, axis=1, inplace=True)

    return df

def add_efficiency(id=0):
    """Compute the efficiency column for workout ``id`` and store it.

    Efficiency is ``100 * ergpw / power`` with ``ergpw = 2.8 * velo**3``
    and ``velo = 500 / pace`` (pace taken from the stored pace divided by
    1000). Infinite values are treated as missing and forward-filled.
    The stroke data of the workout is deleted and, unless ``id`` is 0,
    written back to the strokedata table with the new column.

    Returns the updated DataFrame.
    """
    rowdata, row = getrowdata_db(id=id,
                                 doclean=False,
                                 convertnewtons=False,
                                 checkefficiency=False)
    power = rowdata['power']
    pace = rowdata['pace'] / 1.0e3
    velo = 500. / pace
    ergpw = 2.8 * velo**3
    efficiency = 100. * ergpw / power

    efficiency = efficiency.replace([-np.inf, np.inf], np.nan)
    # fillna returns a new Series; the result used to be thrown away.
    efficiency = efficiency.fillna(method='ffill')
    rowdata['efficiency'] = efficiency

    rowdata = remove_invalid_columns(rowdata)
    rowdata = rowdata.replace([-np.inf, np.inf], np.nan)
    rowdata = rowdata.fillna(method='ffill')

    delete_strokedata(id)
    if id != 0:
        rowdata['workoutid'] = id
        engine = create_engine(database_url, echo=False)
        try:
            # The with statement closes the connection on exit.
            with engine.connect() as conn, conn.begin():
                rowdata.to_sql('strokedata', engine,
                               if_exists='append', index=False)
        finally:
            # Dispose of the engine even when writing fails.
            engine.dispose()
    return rowdata

# This is the main routine.
# it reindexes, sorts, filters, and smooths the data, then
# saves it to the stroke_data table in the database
# Takes a rowingdata object's DataFrame as input


def dataprep(rowdatadf, id=0, bands=True, barchart=True, otwpower=True,
             empower=True, inboard=0.88, forceunit='lbs'):
    """Build the per-stroke DataFrame for a workout and optionally store it.

    Args:
        rowdatadf: DataFrame of a rowingdata object. The columns
            'TimeStamp (sec)', ' Stroke500mPace (sec/500m)', ' HRCur (bpm)',
            ' Cadence (stokes/min)', 'cum_dist', ' Power (watts)',
            ' AverageDriveForce (lbs)', ' DriveLength (meters)' and
            ' PeakDriveForce (lbs)' are required; other columns are optional
            and replaced by defaults when missing. The frame is modified in
            place: paces above 3000 are clamped to 3000 and
            ' ElapsedTime (sec)' is added when it is missing.
        id: workout id. When non-zero a ``workoutid`` column is added and the
            result is appended to the ``strokedata`` table.
        bands: copy the ``hr_ut2`` ... ``hr_max`` columns (which must then
            exist in ``rowdatadf``) and add a zero ``hr_bottom`` column.
        barchart: add the ``x_right`` column (formatted time plus the
            non-negative time increment of each stroke).
        otwpower: add ``ergpace``, ``nowindpace``, ``equivergpower``,
            ``fergpace`` and ``fnowindpace``.
        empower: add ``wash``, ``catch``, ``slip``, ``finish``,
            ``peakforceangle``, ``totalangle``, ``effectiveangle`` and
            ``efficiency``, and refine ``driveenergy`` and ``drivelength``.
        inboard: used with the catch and finish angles to derive the drive
            length as ``(inboard - 0.05) * angle_in_radians``.
        forceunit: when ``'lbs'``, the average force is multiplied by
            ``lbstoN`` while deriving the drive energy.

    Returns:
        ``0`` when ``rowdatadf`` is empty, otherwise the prepared DataFrame.
    """
    if rowdatadf.empty:
        return 0

    # Time relative to the first sample.
    t = rowdatadf.loc[:, 'TimeStamp (sec)']
    t = pd.Series(t - rowdatadf.loc[:, 'TimeStamp (sec)'].iloc[0])

    # Clamp paces above 3000 (this writes back into the caller's frame).
    row_index = rowdatadf.loc[:, ' Stroke500mPace (sec/500m)'] > 3000
    rowdatadf.loc[row_index, ' Stroke500mPace (sec/500m)'] = 3000.

    p = rowdatadf.loc[:, ' Stroke500mPace (sec/500m)']
    # Velocity is always derived from pace; the ' AverageBoatSpeed (m/s)'
    # lookup that used to sit here was overwritten before any use.
    velo = 500. / p

    hr = rowdatadf.loc[:, ' HRCur (bpm)']
    spm = rowdatadf.loc[:, ' Cadence (stokes/min)']
    cumdist = rowdatadf.loc[:, 'cum_dist']
    power = rowdatadf.loc[:, ' Power (watts)']
    averageforce = rowdatadf.loc[:, ' AverageDriveForce (lbs)']
    drivelength = rowdatadf.loc[:, ' DriveLength (meters)']
    workoutstate = _column_or_default(rowdatadf, ' WorkoutState', 0 * hr)
    peakforce = rowdatadf.loc[:, ' PeakDriveForce (lbs)']

    forceratio = averageforce / peakforce
    forceratio = forceratio.fillna(value=0)

    try:
        drivetime = rowdatadf.loc[:, ' DriveTime (ms)']
        recoverytime = rowdatadf.loc[:, ' StrokeRecoveryTime (ms)']
        rhythm = 100. * drivetime / (recoverytime + drivetime)
        rhythm = rhythm.fillna(value=0)
    except Exception:
        # Best effort: any problem with the timing columns gives zero rhythm.
        rhythm = 0.0 * forceratio

    # Smoothing window derived from the mean sample interval; always odd.
    f = rowdatadf['TimeStamp (sec)'].diff().mean()
    if f != 0 and not np.isinf(f):
        try:
            windowsize = 2 * (int(10. / (f))) + 1
        except ValueError:
            # f is NaN (e.g. a single sample).
            windowsize = 1
    else:
        windowsize = 1
    if windowsize <= 3:
        windowsize = 5

    if 3 < windowsize < len(hr):
        # savgol_filter returns plain numpy arrays from here on.
        spm = savgol_filter(spm, windowsize, 3)
        hr = savgol_filter(hr, windowsize, 3)
        drivelength = savgol_filter(drivelength, windowsize, 3)
        forceratio = savgol_filter(forceratio, windowsize, 3)

    try:
        t2 = t.ffill().apply(timedeltaconv)
    except TypeError:
        t2 = 0 * t

    p2 = p.ffill().apply(timedeltaconv)

    try:
        drivespeed = drivelength / rowdatadf[' DriveTime (ms)'] * 1.0e3
    except (KeyError, TypeError):
        # The drive time column is optional (see the rhythm block above).
        drivespeed = 0.0 * rowdatadf['TimeStamp (sec)']

    drivespeed = drivespeed.fillna(value=0)

    try:
        driveenergy = rowdatadf['driveenergy']
    except KeyError:
        if forceunit == 'lbs':
            driveenergy = drivelength * averageforce * lbstoN
        else:
            driveenergy = drivelength * averageforce

    distanceperstroke = 60. * velo / spm

    data = DataFrame(
        dict(
            time=t * 1e3,
            hr=hr,
            pace=p * 1e3,
            spm=spm,
            velo=velo,
            cumdist=cumdist,
            ftime=niceformat(t2),
            fpace=nicepaceformat(p2),
            driveenergy=driveenergy,
            power=power,
            workoutstate=workoutstate,
            averageforce=averageforce,
            drivelength=drivelength,
            peakforce=peakforce,
            forceratio=forceratio,
            distance=cumdist,
            drivespeed=drivespeed,
            rhythm=rhythm,
            distanceperstroke=distanceperstroke,
        )
    )

    if bands:
        # HR bands
        for band in ('hr_ut2', 'hr_ut1', 'hr_at', 'hr_tr', 'hr_an', 'hr_max'):
            data[band] = rowdatadf.loc[:, band]
        data['hr_bottom'] = 0.0 * data['hr']

    if ' ElapsedTime (sec)' not in rowdatadf.columns:
        rowdatadf[' ElapsedTime (sec)'] = rowdatadf['TimeStamp (sec)']

    if barchart:
        # time increments for bar chart
        time_increments = rowdatadf.loc[:, ' ElapsedTime (sec)'].diff()
        try:
            time_increments.iloc[0] = time_increments.iloc[1]
        except (IndexError, KeyError):
            # iloc raises IndexError when there is only one sample.
            time_increments.iloc[0] = 1.

        # Negative increments become zero.
        time_increments = 0.5 * time_increments + 0.5 * np.abs(time_increments)
        x_right = (t2 + time_increments.apply(timedeltaconv))

        data['x_right'] = x_right

    if empower:
        wash = _column_or_default(rowdatadf, 'wash', 0 * power)
        catch = _column_or_default(rowdatadf, 'catch', 0 * power)
        finish = _column_or_default(rowdatadf, 'finish', 0 * power)
        peakforceangle = _column_or_default(
            rowdatadf, 'peakforceangle', 0 * power)

        if data['driveenergy'].mean() == 0:
            try:
                driveenergy = rowdatadf.loc[:, 'driveenergy']
            except KeyError:
                driveenergy = power * 60 / spm
        else:
            driveenergy = data['driveenergy']

        arclength = (inboard - 0.05) * (np.radians(finish) - np.radians(catch))
        if arclength.mean() > 0:
            drivelength = arclength
        elif drivelength.mean() == 0:
            # NOTE(review): 4.44822 looks like the lbs-to-newton factor and is
            # applied regardless of forceunit -- confirm this is intended.
            drivelength = driveenergy / (averageforce * 4.44822)

        slip = _column_or_default(rowdatadf, 'slip', 0 * power)

        try:
            totalangle = finish - catch
            effectiveangle = finish - wash - catch - slip
        except ValueError:
            totalangle = 0 * power
            effectiveangle = 0 * power

        if 3 < windowsize < len(slip):
            wash = _smooth_or_keep(wash, windowsize)
            slip = _smooth_or_keep(slip, windowsize)
            catch = _smooth_or_keep(catch, windowsize)
            finish = _smooth_or_keep(finish, windowsize)
            peakforceangle = _smooth_or_keep(peakforceangle, windowsize)
            driveenergy = _smooth_or_keep(driveenergy, windowsize)
            drivelength = _smooth_or_keep(drivelength, windowsize)
            totalangle = _smooth_or_keep(totalangle, windowsize)
            effectiveangle = _smooth_or_keep(effectiveangle, windowsize)

        ergpw = 2.8 * velo**3
        efficiency = 100. * ergpw / power

        efficiency = efficiency.replace([-np.inf, np.inf], np.nan)
        # The forward fill must be assigned: it returns a new Series.
        efficiency = efficiency.ffill()

        try:
            data['wash'] = wash
            data['catch'] = catch
            data['slip'] = slip
            data['finish'] = finish
            data['peakforceangle'] = peakforceangle
            data['driveenergy'] = driveenergy
            data['drivelength'] = drivelength
            data['totalangle'] = totalangle
            data['effectiveangle'] = effectiveangle
            data['efficiency'] = efficiency
        except ValueError:
            # Kept from the original: columns assigned before the failure stay.
            pass

    if otwpower:
        nowindpace = _column_or_default(rowdatadf, 'nowindpace', p)
        equivergpower = _column_or_default(
            rowdatadf, 'equivergpower', 0 * p + 50.)

        nowindpace2 = nowindpace.apply(timedeltaconv)
        ergvelo = (equivergpower / 2.8)**(1. / 3.)

        ergpace = 500. / ergvelo
        ergpace[ergpace == np.inf] = 240.
        ergpace2 = ergpace.apply(timedeltaconv)

        data['ergpace'] = ergpace * 1e3
        data['nowindpace'] = nowindpace * 1e3
        data['equivergpower'] = equivergpower
        data['fergpace'] = nicepaceformat(ergpace2)
        data['fnowindpace'] = nicepaceformat(nowindpace2)

    data = data.replace([-np.inf, np.inf], np.nan)
    data = data.ffill()

    # write data if id given
    if id != 0:
        data['workoutid'] = id

        engine = create_engine(database_url, echo=False)
        try:
            # Given an Engine, to_sql opens and commits its own connection,
            # so no separate connection/transaction is opened here.
            data.to_sql('strokedata', engine, if_exists='append', index=False)
        finally:
            # Release pooled connections even when the write fails.
            engine.dispose()
    return data


def _column_or_default(df, name, default):
    """Return column ``name`` of ``df``, or ``default`` when it is missing."""
    try:
        return df.loc[:, name]
    except KeyError:
        return default


def _smooth_or_keep(values, windowsize):
    """Smooth ``values`` with a third-order Savitzky-Golay filter.

    ``values`` is returned unchanged when the filter raises ``TypeError``.
    """
    try:
        return savgol_filter(values, windowsize, 3)
    except TypeError:
        return values


def workout_trimp(w):
    """Return a workout's stored TRIMP and hrTSS, or schedule their calculation.

    When ``w.trimp`` is positive, ``(w.trimp, w.hrtss)`` is returned at once.
    Otherwise ``handle_calctrimp`` is queued on ``queuehigh`` and ``(0, 0)``
    is returned.

    Side effects on the slow path:
      * a rower whose ``hrftp`` is 0 gets it set to the integer mean of
        ``an`` and ``tr`` and is saved;
      * a workout without ``averagehr`` gets ``averagehr`` and ``maxhr``
        filled from the ' HRCur (bpm)' column of its CSV file (``None`` when
        the file cannot be read or has no such column) and is saved.

    Args:
        w: the Workout to score.

    Returns:
        A ``(trimp, hrtss)`` tuple; ``(0, 0)`` when the calculation was queued.
    """
    r = w.user

    if w.trimp > 0:
        return w.trimp, w.hrtss

    ftp = float(r.ftp)
    if w.workouttype in otwtypes:
        ftp = ftp * (100. - r.otwslack) / 100.

    if r.hrftp == 0:
        r.hrftp = int((r.an + r.tr) / 2.)
        r.save()

    if w.averagehr is None:
        avghr = None
        maxhr = None
        rowdata = rdata(w.csvfilename)
        # rdata may hand back a falsy value (fix_newtons guards the same
        # way); without this check ``rowdata.df`` raised AttributeError.
        if rowdata:
            try:
                heartrate = rowdata.df[' HRCur (bpm)']
            except KeyError:
                pass
            else:
                avghr = heartrate.mean()
                maxhr = heartrate.max()

        w.averagehr = avghr
        w.maxhr = maxhr
        w.save()

    myqueue(
        queuehigh,
        handle_calctrimp,
        w.id,
        w.csvfilename,
        ftp,
        r.sex,
        r.hrftp,
        r.max,
        r.rest)

    return 0, 0

def workout_rscore(w):
    """Return a workout's stored rScore and normalized power, or queue them.

    A positive ``w.rscore`` short-circuits to ``(w.rscore, w.normp)``.
    Otherwise ``handle_calctrimp`` is queued on ``queuehigh`` and ``(0, 0)``
    is returned. A rower whose ``hrftp`` is 0 first gets it set to the
    integer mean of ``an`` and ``tr`` and is saved.
    """
    if w.rscore > 0:
        return w.rscore, w.normp

    rower = w.user
    threshold = float(rower.ftp)
    if w.workouttype in otwtypes:
        # Scale the threshold power by the rower's ``otwslack`` percentage.
        threshold = threshold * (100. - rower.otwslack) / 100.

    if rower.hrftp == 0:
        rower.hrftp = int((rower.an + rower.tr) / 2.)
        rower.save()

    job_args = (
        w.id,
        w.csvfilename,
        threshold,
        rower.sex,
        rower.hrftp,
        rower.max,
        rower.rest,
    )
    myqueue(queuehigh, handle_calctrimp, *job_args)

    return 0, 0

def workout_normv(w, pp=4.0):
    """Return a workout's stored normalized velocity and work, or queue them.

    A positive ``w.normv`` short-circuits to ``(w.normv, w.normw)``.
    Otherwise ``handle_calctrimp`` is queued on ``queuehigh`` and ``(0, 0)``
    is returned. A rower whose ``hrftp`` is 0 first gets it set to the
    integer mean of ``an`` and ``tr`` and is saved.

    ``pp`` is accepted but not used by this function.
    """
    if w.normv > 0:
        return w.normv, w.normw

    rower = w.user
    threshold = float(rower.ftp)
    if w.workouttype in otwtypes:
        # Scale the threshold power by the rower's ``otwslack`` percentage.
        threshold = threshold * (100. - rower.otwslack) / 100.

    if rower.hrftp == 0:
        rower.hrftp = int((rower.an + rower.tr) / 2.)
        rower.save()

    job_args = (
        w.id,
        w.csvfilename,
        threshold,
        rower.sex,
        rower.hrftp,
        rower.max,
        rower.rest,
    )
    myqueue(queuehigh, handle_calctrimp, *job_args)

    return 0, 0