diff --git a/rowers/dataprep.py b/rowers/dataprep.py index f6de443b..5983bb91 100644 --- a/rowers/dataprep.py +++ b/rowers/dataprep.py @@ -529,7 +529,7 @@ def setcp(workout, background=False, recurrance=True): ['power', 'workoutid', 'time'], ids=[workout.id]) if strokesdf.is_empty(): - return pl.DataFrame({'delta': [], 'cp': []}), pd.Series(dtype='float'), pd.Series(dtype='float') + return pl.DataFrame({'delta': [], 'cp': []}), pl.Series(dtype=pl.Float64), pl.Series(dtype=pl.Float64) totaltime = strokesdf['time'].max() maxt = totaltime/1000. @@ -544,7 +544,7 @@ def setcp(workout, background=False, recurrance=True): elif os.path.exists(csvfilename+'.gz'): # pragma: no cover csvfile = csvfilename+'.gz' else: # pragma: no cover - return pd.DataFrame({'delta': [], 'cp': []}), pd.Series(dtype='float'), pd.Series(dtype='float') + return pl.DataFrame({'delta': [], 'cp': []}), pl.Series(dtype=pl.Float64), pl.Series(dtype=pl.Float64) csvfile = os.path.abspath(csvfile) @@ -558,7 +558,7 @@ def setcp(workout, background=False, recurrance=True): grpc.channel_ready_future(channel).result(timeout=10) except grpc.FutureTimeoutError: # pragma: no cover dologging('metrics.log','grpc channel time out in setcp') - return pd.DataFrame({'delta': [], 'cp': []}), pd.Series(dtype='float'), pd.Series(dtype='float') + return pl.DataFrame({'delta': [], 'cp': []}), pl.Series(dtype=pl.Float64), pl.Series(dtype=pl.Float64) stub = metrics_pb2_grpc.MetricsStub(channel) req = metrics_pb2.CPRequest(filename = csvfile, filetype = "CSV", tarr = logarr) @@ -567,7 +567,7 @@ def setcp(workout, background=False, recurrance=True): response = stub.GetCP(req, timeout=60) except Exception as e: dologging('metrics.log', traceback.format_exc()) - return pd.DataFrame({'delta': [], 'cp': []}), pd.Series(dtype='float'), pd.Series(dtype='float') + return pl.DataFrame({'delta': [], 'cp': []}), pl.Series(dtype=pl.Float64), pl.Series(dtype=pl.Float64) delta = pl.Series(np.array(response.delta)) cpvalues = pl.Series(np.array(response.power)) @@ -735,32 +735,22 @@ def fetchcp_new(rower, workouts): data = [] for workout in workouts: - cpfile = 'media/cpdata_{id}.parquet.gz'.format(id=workout.id) - try: - df, delta, cpvalues = setcp(workout) - df = pd.read_parquet(cpfile) - df['workout'] = str(workout) - df['url'] = workout.url() - data.append(df) - except: - # CP data file doesn't exist yet. has to be created - df, delta, cpvalues = setcp(workout) - df = df.with_columns((pl.lit(str(workout))).alias("workout")) - df = df.with_columns((pl.lit(workout.url())).alias("url")) + df, delta, cpvalues = setcp(workout) + df = df.drop('id') + df = df.with_columns((pl.lit(str(workout))).alias("workout")) + df = df.with_columns((pl.lit(workout.url())).alias("url")) + if not df.is_empty(): data.append(df) if len(data) == 0: - return pl.Series(dtype='float'), pl.Series(dtype='float'), 0, pl.Series(dtype='float'), pl.Series(dtype='float') + return pl.Series(dtype=pl.Float64), pl.Series(dtype=pl.Float64), 0, pl.Series(dtype=pl.Float64), pl.Series(dtype=pl.Float64) if len(data) > 1: df = pl.concat(data) - #df = df.to_pandas() - - try: df = df.group_by(pl.col("delta")).agg(pl.max("cp"), pl.max("workout"), pl.max("url")).sort("delta") except (KeyError, ColumnNotFoundError): # pragma: no cover - return pd.Series(dtype='float'), pd.Series(dtype='float'), 0, pd.Series(dtype='float'), pd.Series(dtype='float') + return pl.Series(dtype=pl.Float64), pl.Series(dtype=pl.Float64), 0, pl.Series(dtype=pl.Float64), pl.Series(dtype=pl.Float64) df = df.filter(pl.col("cp")>20) diff --git a/rowers/dataroutines.py b/rowers/dataroutines.py index dad0dfc6..9f9e571c 100644 --- a/rowers/dataroutines.py +++ b/rowers/dataroutines.py @@ -32,6 +32,7 @@ import zipfile import os from rowers.models import strokedatafields import polars as pl +import polars.selectors as cs from polars.exceptions import ( ColumnNotFoundError, SchemaError, ComputeError, InvalidOperationError @@ -176,6 +177,19 @@ columndict = { 'cumdist': 'cum_dist', } +def remove_nulls_pl(data): + data = data.lazy().fill_nan(None) + data = data.select(cs.by_dtype(pl.NUMERIC_DTYPES)).collect() + data = data[[s.name for s in data if not s.is_infinite().sum()]] + data = data[[s.name for s in data if not (s.null_count() == data.height)]] + if not data.is_empty(): + try: + data = data.drop_nulls() + except: + pass + + return data + def get_video_data(w, groups=['basic'], mode='water'): modes = [mode, 'both', 'basic'] @@ -798,14 +812,11 @@ def clean_df_stats_pl(datadf, workstrokesonly=True, ignorehr=True, datadf = datadf.filter( pl.col("spm") >=0, - pl.col("efficiency")<=200, pl.col("spm")>=10, pl.col("pace")<=300*1000., - pl.col("efficiency")>=0, pl.col("pace")>=60*1000, pl.col("power")<=5000, pl.col("spm")<=120, - pl.col("wash")>=1 ) @@ -823,7 +834,10 @@ def clean_df_stats_pl(datadf, workstrokesonly=True, ignorehr=True, datadf = datadf.filter(pl.col("rhythm")>=0, pl.col("rhythm")<=70, pl.col("power")>=20, + pl.col("efficiency")<=200, pl.col("drivelength")>=0.5, + pl.col("wash")>=1, + pl.col("efficiency")>=0, pl.col("forceratio")>=0.2, pl.col("forceratio")<=1.0, pl.col("drivespeed")>=0.5, @@ -833,6 +847,7 @@ def clean_df_stats_pl(datadf, workstrokesonly=True, ignorehr=True, pl.col("catch")<=-30) + # workoutstateswork = [1, 4, 5, 8, 9, 6, 7] workoutstatesrest = [3] # workoutstatetransition = [0, 2, 10, 11, 12, 13] @@ -850,6 +865,8 @@ def clean_df_stats_pl(datadf, workstrokesonly=True, ignorehr=True, if ratio < 0.01 or after[workoutid] < 2: return data_orig + + return datadf @@ -1541,9 +1558,8 @@ def getsmallrowdata_pl(columns, ids=[], doclean=True, workstrokesonly=True, comp workstrokesonly=workstrokesonly, for_chart=for_chart) - data = data.fill_nan(None).drop_nulls() - return data + data = remove_nulls_pl(data) if not df.is_empty(): df = df.fill_nan(None).drop_nulls() diff --git a/rowers/interactiveplots.py b/rowers/interactiveplots.py index 80517b36..c5d79779 100644 --- a/rowers/interactiveplots.py +++ b/rowers/interactiveplots.py @@ -1588,19 +1588,19 @@ def interactive_cum_flex_chart2(theworkouts, promember=0, except (KeyError, ColumnNotFoundError): yparam1 = 'None' - datadf = datadf.fill_nan(None).drop_nulls() + datadf = dataprep.remove_nulls_pl(datadf) # test if we have drive energy try: # pragma: no cover _ = datadf['driveenergy'].mean() - except KeyError: # pragma: no cover - datadf['driveenergy'] = 500. + except (KeyError, ColumnNotFoundError): # pragma: no cover + datadf = datadf.with_columns((pl.lit(500)).alias("driveenergy")) # test if we have power try: # pragma: no cover _ = datadf['power'].mean() - except KeyError: # pragma: no cover - datadf['power'] = 50. + except (KeyError, ColumnNotFoundError): # pragma: no cover + datadf = datadf.with_columns((pl.lit(50)).alias("power")) yparamname1 = axlabels[yparam1] if yparam2 != 'None': @@ -1687,8 +1687,8 @@ def interactive_flexchart_stacked(id, r, xparam='time', columns = [name for name, d in metrics.rowingmetrics] columns_basic = [name for name, d in metrics.rowingmetrics if d['group'] == 'basic'] - columns = columns + ['spm', 'driveenergy', 'distance'] - columns_basic = columns_basic + ['spm', 'driveenergy', 'distance'] + columns = columns + ['spm', 'driveenergy', 'distance','workoutid','workoutstate'] + columns_basic = columns_basic + ['spm', 'driveenergy', 'distance','workoutid','workoutstate'] rowdata = pd.DataFrame() row = Workout.objects.get(id=id) @@ -1763,7 +1763,7 @@ def interactive_flexchart_stacked(id, r, xparam='time', rowdata = rowdata.with_columns(y4=pl.col("time")) rowdata = rowdata.with_columns((pl.col("y4")).alias(yparam4)) - + # replace nans rowdata = rowdata.fill_nan(0) @@ -1783,7 +1783,7 @@ def interactive_flexchart_stacked(id, r, xparam='time', 'metrics': metrics_list, } - script, div = get_chart("/stacked", chart_data) + script, div = get_chart("/stacked", chart_data, debug=False) return script, div diff --git a/rowers/tests/mocks.py b/rowers/tests/mocks.py index 80a12eb1..012dddf2 100644 --- a/rowers/tests/mocks.py +++ b/rowers/tests/mocks.py @@ -272,7 +272,7 @@ def mocked_fetchcperg(*args, **kwargs): return df -import pandas as pd + def mocked_read_df_sql(id): # pragma: no cover df = pd.read_csv('rowers/tests/testdata/fake_strokedata.csv') @@ -310,7 +310,7 @@ def mocked_getrowdata_uh(*args, **kwargs): # pragma: no cover return df, row def mocked_getsmallrowdata_uh(*args, **kwargs): # pragma: no cover - df = pd.read_csv('rowers/tests/testdata/uhfull.csv') + df = pl.read_csv('rowers/tests/testdata/uhfull.csv') return df @@ -390,7 +390,7 @@ def mocked_read_cols_df_sql(*args, **kwargs): def mock_workout_summaries(*args, **kwargs): - df = pd.read_csv('rowers/tests/testdata/workout_summaries.csv') + df = pl.read_csv('rowers/tests/testdata/workout_summaries.csv') return df def mocked_read_df_cols_sql_multi(ids, columns, convertnewtons=True): # pragma: no cover diff --git a/rowers/tests/test_unit_tests.py b/rowers/tests/test_unit_tests.py index 5e57721c..113faeb6 100644 --- a/rowers/tests/test_unit_tests.py +++ b/rowers/tests/test_unit_tests.py @@ -582,7 +582,7 @@ class DataPrepTests(TestCase): age = dataprep.calculate_age(born,today=today) self.assertEqual(age,49) - @patch('rowers.dataprep.getsmallrowdata_db',side_effect=mocked_getsmallrowdata_uh) + @patch('rowers.dataprep.getsmallrowdata_pl',side_effect=mocked_getsmallrowdata_uh) def test_goldmedalstandard(self,mocked_getsmallrowdata_uh): maxvalue, delta = dataprep.calculate_goldmedalstandard(self.r,self.wuh_otw) records = CalcAgePerformance.objects.filter( diff --git a/rowers/tests/testdata/testdata.tcx.gz b/rowers/tests/testdata/testdata.tcx.gz index 132867e1..66430507 100644 Binary files a/rowers/tests/testdata/testdata.tcx.gz and b/rowers/tests/testdata/testdata.tcx.gz differ diff --git a/rowers/utils.py b/rowers/utils.py index 7cb1413b..95e5fb78 100644 --- a/rowers/utils.py +++ b/rowers/utils.py @@ -440,17 +440,19 @@ def wavg(group, avg_name, weight_name): """ try: d = group[avg_name] - except KeyError: + except (KeyError, ColumnNotFoundError): return 0 try: w = group[weight_name] - except KeyError: + except (KeyError, ColumnNotFoundError): return d.mean() try: return (d * w).sum() / w.sum() except ZeroDivisionError: # pragma: no cover return d.mean() + return 0 + from string import Formatter def totaltime_sec_to_string(totaltime, shorten=False): diff --git a/rowers/views/analysisviews.py b/rowers/views/analysisviews.py index cbab1c80..86ed09d3 100644 --- a/rowers/views/analysisviews.py +++ b/rowers/views/analysisviews.py @@ -932,12 +932,16 @@ def boxplotdata(workouts, options): datadf = dataprep.clean_df_stats_pl(datadf, workstrokesonly=workstrokesonly) - datadf = datadf.filter( - pl.col("spm")>spmmin, - pl.col("spm")workmin, - pl.col("driveenergy")spmmin, + pl.col("spm")workmin, + pl.col("driveenergy")