Private
Public Access
1
0

updated dataprepnodjango

This commit is contained in:
Sander Roosendaal
2019-10-23 20:04:28 +02:00
parent fa373d781f
commit c0e8e448e3
3 changed files with 172 additions and 146 deletions

View File

@@ -15,6 +15,7 @@ from rowingdata import rowingdata as rrdata
from rowingdata import rower as rrower
import shutil
from shutil import copyfile
from rowingdata import (
@@ -1651,75 +1652,7 @@ def new_workout_from_df(r, df,
return (id, message)
# Compare the data from the CSV file and the database
# Currently only calculates number of strokes. To be expanded with
# more elaborate testing if needed
def compare_data(id):
    """Check that stroke counts in the CSV file and the DB agree.

    Returns a tuple ``(ok, ldb, lfile)`` where ``ok`` is True when the
    CSV and database stroke counts match and are nonzero, ``ldb`` is the
    row count found in the ``strokedata`` table and ``lfile`` the row
    count found in the workout's CSV file.
    """
    row = Workout.objects.get(id=id)
    f1 = row.csvfilename
    try:
        rowdata = rdata(f1)
        l1 = len(rowdata.df)
    except AttributeError:
        # rdata() yielded something without a .df (unreadable/missing file)
        rowdata = 0
        l1 = 0
    # Default the DB count so a failed query no longer raises NameError
    # at "ldb = l2" below (previous behavior when the except fired).
    l2 = 0
    engine = create_engine(database_url, echo=False)
    # Bind parameter instead of str.format interpolation (SQL injection safe)
    query = sa.text('SELECT COUNT(*) FROM strokedata WHERE workoutid=:id')
    with engine.connect() as conn, conn.begin():
        try:
            res = conn.execute(query, {'id': id})
            l2 = res.fetchall()[0][0]
        except Exception:
            # Was a bare except; keep the best-effort behavior but let
            # SystemExit/KeyboardInterrupt propagate.
            print("Database Locked")
    engine.dispose()
    lfile = l1
    ldb = l2
    return l1 == l2 and l1 != 0, ldb, lfile
# Repair data for workouts where the CSV file is lost (or the DB entries
# don't exist)
def repair_data(verbose=False):
    """Resync stroke data between CSV files and the database.

    For every workout whose CSV and DB stroke counts disagree, first try
    to rebuild the DB rows from the CSV. If the CSV itself is empty or
    missing, try to regenerate it from the DB; when the DB has no
    strokes either, the workout record is deleted.

    Args:
        verbose: when True, print a progress dot per workout and the
            id/counts of each workout that needs repair.
    """
    for w in Workout.objects.all():
        if verbose:
            sys.stdout.write(".")
        test, ldb, lfile = compare_data(w.id)
        if test:
            continue
        if verbose:
            print(w.id, lfile, ldb)
        # First attempt: rebuild the DB rows from the CSV file.
        try:
            rowdata = rdata(w.csvfilename)
            if rowdata and len(rowdata.df):
                update_strokedata(w.id, rowdata.df)
        except (IOError, AttributeError):
            pass
        if lfile == 0:
            # CSV is empty/missing: regenerate it from the DB, or delete
            # the workout when the DB holds no strokes either.
            try:
                data = read_df_sql(w.id)
                try:
                    datalength = len(data)
                except AttributeError:
                    datalength = 0
                if datalength != 0:
                    data.rename(columns=columndict, inplace=True)
                    data.to_csv(w.csvfilename + '.gz',
                                index_label='index',
                                compression='gzip')
                else:
                    w.delete()
            except Exception:
                # Was a bare except; best-effort repair, keep iterating
                # but no longer swallow SystemExit/KeyboardInterrupt.
                pass
# A wrapper around the rowingdata class, with some error catching
@@ -1745,17 +1678,11 @@ def rdata(file, rower=rrower()):
def delete_strokedata(id):
    """Delete all stroke data for workout *id* from DB and disk.

    Removes the matching ``strokedata`` rows and the on-disk parquet
    cache (written by dask as a directory, hence ``shutil.rmtree``).
    Missing caches are ignored.
    """
    engine = create_engine(database_url, echo=False)
    # Bind parameter instead of str.format interpolation (SQL injection safe)
    query = sa.text('DELETE FROM strokedata WHERE workoutid=:id')
    with engine.connect() as conn, conn.begin():
        try:
            conn.execute(query, {'id': id})
        except Exception:
            # Was a bare except; keep best-effort semantics without
            # swallowing SystemExit/KeyboardInterrupt.
            print("Database Locked")
    engine.dispose()
    dirname = 'media/strokedata_{id}.parquet.gz'.format(id=id)
    try:
        shutil.rmtree(dirname)
    except FileNotFoundError:
        pass
# Replace stroke data in DB with data from CSV file
@@ -1782,7 +1709,6 @@ def testdata(time, distance, pace, spm):
def getrowdata_db(id=0, doclean=False, convertnewtons=True,
checkefficiency=True):
data = read_df_sql(id)
data['x_right'] = data['x_right'] / 1.0e6
data['deltat'] = data['time'].diff()
if data.empty:
@@ -2010,6 +1936,66 @@ def prepmultipledata(ids, verbose=False):
def read_cols_df_sql(ids, columns, convertnewtons=True):
    """Read selected stroke-data columns for a set of workout ids.

    Data is read from the per-workout parquet caches; a missing cache is
    regenerated from the raw workout data via ``dataprep`` (which writes
    the cache as a side effect). Force columns are converted from lbs to
    Newton for workouts whose ``forceunit`` is ``'lbs'``.

    Returns:
        (df, extracols) — ``extracols`` is currently always empty.
    """
    # drop columns that are not in offical list
    # axx = [ax[0] for ax in axes]
    extracols = []
    # These columns are always needed downstream (grouping/conversion).
    columns = list(columns) + ['distance', 'spm', 'workoutid']
    columns = [x for x in columns if x != 'None']
    columns = list(set(columns))
    ids = [int(id) for id in ids]

    def _load(workoutid):
        # Read the cached parquet for one workout; rebuild on a miss.
        filename = 'media/strokedata_{id}.parquet.gz'.format(id=workoutid)
        try:
            return pd.read_parquet(filename, columns=columns)
        except OSError:
            rowdata, row = getrowdata(id=workoutid)
            if rowdata and len(rowdata.df):
                # dataprep writes the parquet cache as a side effect
                dataprep(rowdata.df, id=workoutid, bands=True,
                         otwpower=True, barchart=True)
                return pd.read_parquet(filename, columns=columns)
        # No cache and no source data: return an empty frame instead of
        # leaving df unbound (previously raised NameError here).
        return pd.DataFrame(columns=columns)

    if len(ids) == 0:
        return pd.DataFrame(), extracols
    elif len(ids) == 1:
        df = _load(ids[0])
    else:
        # pd.concat always gets at least one frame (empty ones on misses),
        # so the previous "No objects to concatenate" failure cannot occur.
        df = pd.concat([_load(id) for id in ids], axis=0)
    df = df.fillna(value=0)

    # Convert lbs force readings to Newton, per workout (deduplicated:
    # the original repeated this loop for each force column).
    for col in ('peakforce', 'averageforce'):
        if col in columns:
            funits = ((w.id, w.forceunit)
                      for w in Workout.objects.filter(id__in=ids))
            for id, u in funits:
                if u == 'lbs':
                    mask = df['workoutid'] == id
                    df.loc[mask, col] = df.loc[mask, col] * lbstoN
    return df, extracols
def read_cols_df_sql_old(ids, columns, convertnewtons=True):
# drop columns that are not in offical list
# axx = [ax[0] for ax in axes]
prepmultipledata(ids)
axx = [f.name for f in StrokeData._meta.get_fields()]
@@ -2076,8 +2062,34 @@ def read_cols_df_sql(ids, columns, convertnewtons=True):
# Read stroke data from the DB for a Workout ID. Returns a pandas dataframe
def read_df_sql(id):
    """Return the full stroke dataframe for workout *id*.

    Reads the per-workout parquet cache, regenerating it from the raw
    workout data when missing. Force columns are converted from lbs to
    Newton when the workout's ``forceunit`` is ``'lbs'``.
    """
    f = 'media/strokedata_{id}.parquet.gz'.format(id=id)
    try:
        df = pd.read_parquet(f)
    except OSError:
        # Cache miss: rebuild from the raw workout data.
        # BUG FIX: this branch referenced the undefined name `ids`
        # (getrowdata(id=ids[0]) — copy/paste from read_cols_df_sql);
        # it must use this function's `id` parameter.
        rowdata, row = getrowdata(id=id)
        if rowdata and len(rowdata.df):
            # dataprep writes the parquet cache as a side effect
            dataprep(rowdata.df, id=id, bands=True,
                     otwpower=True, barchart=True)
        # Re-read outside the `if` so a still-missing cache raises a
        # meaningful OSError rather than NameError on an unbound df.
        df = pd.read_parquet(f)
    df = df.fillna(value=0)
    funit = Workout.objects.get(id=id).forceunit
    if funit == 'lbs':
        try:
            df['peakforce'] = df['peakforce'] * lbstoN
        except KeyError:
            pass
        try:
            df['averageforce'] = df['averageforce'] * lbstoN
        except KeyError:
            pass
    return df
def read_df_sql_old(id):
engine = create_engine(database_url, echo=False)
df = pd.read_sql_query(sa.text('SELECT * FROM strokedata WHERE workoutid={id} ORDER BY time ASC'.format(
@@ -2269,14 +2281,13 @@ def add_efficiency(id=0):
rowdata = rowdata.fillna(method='ffill')
delete_strokedata(id)
if id != 0:
rowdata['workoutid'] = id
engine = create_engine(database_url, echo=False)
with engine.connect() as conn, conn.begin():
rowdata.to_sql('strokedata', engine,
if_exists='append', index=False)
conn.close()
engine.dispose()
filename = 'media/strokedata_{id}.parquet.gz'.format(id=id)
df = dd.from_pandas(rowdata,npartitions=1)
df.to_parquet(filename,engine='fastparquet',compression='GZIP')
return rowdata
# This is the main routine.
@@ -2563,14 +2574,8 @@ def dataprep(rowdatadf, id=0, bands=True, barchart=True, otwpower=True,
filename = 'media/strokedata_{id}.parquet.gz'.format(id=id)
df = dd.from_pandas(data,npartitions=1)
#df = df.loc[:,~df.columns.duplicated()]
# data.to_csv(filename,compression='gzip')
df.to_parquet(filename,engine='fastparquet',compression='GZIP')
# data.to_parquet(filename,engine='fastparquet',compression='gzip')
# table = pa.Table.from_pandas(data)
#pq.write_table(table,filename)
return data