histo converted to polars

2024-04-08 20:50:08 +02:00
parent 6d549e3b8b
commit fd46732b6e
3 changed files with 116 additions and 20 deletions
--- a/rowers/dataroutines.py
+++ b/rowers/dataroutines.py
@@ -1414,6 +1414,37 @@ def getrowdata_db(id=0, doclean=False, convertnewtons=True,

    return data, row

+def getrowdata_pl(id=0, doclean=False, convertnewtons=True,
+                  checkefficiency=True, for_chart=False):
+    """Return the stroke data of workout ``id`` as a polars DataFrame.
+
+    The data is read with ``read_df_sql(id, polars=True)``.  When that frame
+    is empty, or has no ``time`` column, the data is rebuilt from the raw
+    row data with ``dataprep(..., polars=True)``.
+
+    Args:
+        id: Workout id.
+        doclean: When true, pass the result through ``clean_df_stats``.
+        convertnewtons: Not used by this function; kept so the signature
+            lines up with ``getrowdata_db``.
+        checkefficiency: When exactly ``True``, recompute the ``efficiency``
+            column through ``add_efficiency_pl`` if it is missing, or if its
+            mean is 0 while the mean of ``power`` is not.
+        for_chart: Forwarded to ``clean_df_stats``.
+
+    Returns:
+        A ``(data, row)`` tuple.  ``data`` is a polars DataFrame (empty when
+        nothing could be loaded); ``row`` is what ``getrowdata`` or
+        ``Workout.objects.get`` returned for ``id``.
+    """
+    # polars reports a missing column with ColumnNotFoundError, not KeyError.
+    missing_column = (KeyError, pl.exceptions.ColumnNotFoundError)
+
+    data = read_df_sql(id, polars=True)
+    try:
+        # Difference between consecutive ``time`` values.
+        data = data.with_columns(pl.col("time").diff().alias("deltat"))
+    except missing_column:  # pragma: no cover
+        data = pl.DataFrame()
+
+    if data.is_empty():
+        rowdata, row = getrowdata(id=id)
+        if not rowdata.empty: # pragma: no cover
+            data = dataprep(rowdata.df, id=id, bands=True,
+                            barchart=True, otwpower=True, polars=True)
+        else:
+            data = pl.DataFrame()  # returning empty dataframe
+    else:
+        row = Workout.objects.get(id=id)
+
+    if checkefficiency is True and not data.is_empty():
+        try:
+            if data['efficiency'].mean() == 0 and data['power'].mean() != 0:  # pragma: no cover
+                # add_efficiency_pl only takes ``id``.
+                data = add_efficiency_pl(id=id)
+        except missing_column:  # pragma: no cover
+            data = add_efficiency_pl(id=id)
+
+    if doclean:  # pragma: no cover
+        # NOTE(review): clean_df_stats is not visible here -- confirm that it
+        # accepts a polars DataFrame.
+        data = clean_df_stats(data, ignorehr=True, for_chart=for_chart)
+
+    return data, row
+
 # Fetch a subset of the data from the DB

 def getsmallrowdata_pl(columns, ids=[], doclean=True, workstrokesonly=True, compute=True,
@@ -1707,7 +1738,28 @@ def read_cols_df_sql(ids, columns, convertnewtons=True):
 # Read stroke data from the DB for a Workout ID. Returns a pandas dataframe


-def read_df_sql(id):
+def read_df_sql(id, polars=False):
+    if polars:
+        try:
+            f = 'media/strokedata_{id}.parquet.gz'.format(id=id)
+            df = pd.read_parquet(f)
+        except (IsADirectoryError, FileNotFoundError, OSError, ArrowInvalid, IndexError):  # pragma: no cover
+            rowdata, row = getrowdata(id=id)
+            try:
+                shutil.rmtree(f)
+            except:
+                pass
+            if rowdata and len(rowdata.df):
+                _ = dataprep(rowdata.df, id=id,
+                             bands=True, otwpower=True, barchart=True,
+                             polars=True)
+                try:
+                    df = pl.read_parquet(f, columns=columns)
+                except (OSError, ArrowInvalid, IndexError):
+                    pass
+        df = df.fill_nan(None).drop_nulls()
+
+        return df
    try:
        f = 'media/strokedata_{id}.parquet.gz'.format(id=id)
        df = pd.read_parquet(f)
@@ -1802,6 +1854,13 @@ def fix_newtons(id=0, limit=3000):  # pragma: no cover
        pass


+def remove_invalid_columns_pl(df):  # pragma: no cover
+    """Return ``df`` without the columns whose name is not in ``allowedcolumns``.
+
+    Args:
+        df: A polars DataFrame.
+
+    Returns:
+        A polars DataFrame holding only the columns named in
+        ``allowedcolumns``.
+    """
+    # Compare column *names*: ``df.get_columns()`` yields Series objects,
+    # which can neither be looked up in ``allowedcolumns`` nor dropped by name.
+    invalid = [name for name in df.columns if name not in allowedcolumns]
+    if invalid:
+        df = df.drop(invalid)
+
+    return df
+
 def remove_invalid_columns(df):  # pragma: no cover
    for c in df.columns:
        if c not in allowedcolumns:
@@ -1809,6 +1868,36 @@ def remove_invalid_columns(df):  # pragma: no cover

    return df

+def add_efficiency_pl(id=0):  # pragma: no cover
+    """Recompute the ``efficiency`` column of a workout's stroke data.
+
+    The stroke data is loaded with ``getrowdata_pl`` and ``efficiency`` is
+    set to ``100 * ergpw / power``, where ``ergpw = 2.8 * velo**3`` and
+    ``velo = 500 / (pace / 1000)``.  Columns not listed in ``allowedcolumns``
+    are removed, non-finite float values are replaced by the previous valid
+    value, and, when ``id`` is not 0, the stored stroke data is replaced by
+    the result.
+
+    Args:
+        id: Workout id.
+
+    Returns:
+        The updated polars DataFrame.  An empty frame is returned unchanged
+        and nothing is deleted or written in that case.
+    """
+    rowdata, _ = getrowdata_pl(id=id,
+                               doclean=False,
+                               convertnewtons=False,
+                               checkefficiency=False)
+    if rowdata.is_empty():
+        # Nothing to compute; keep whatever is stored for this workout.
+        return rowdata
+
+    pace = pl.col("pace") / 1.0e3
+    velo = 500. / pace
+    ergpw = 2.8 * velo**3
+    efficiency = 100. * ergpw / pl.col("power")
+    rowdata = rowdata.with_columns(efficiency.alias("efficiency"))
+
+    rowdata = remove_invalid_columns_pl(rowdata)
+
+    # Turn +/-inf and NaN in float columns into nulls (a ``when`` without
+    # ``otherwise`` yields null), then forward-fill the nulls of all columns.
+    floats = pl.col(pl.Float32, pl.Float64)
+    rowdata = rowdata.with_columns(pl.when(floats.is_finite()).then(floats))
+    rowdata = rowdata.fill_null(strategy="forward")
+
+    delete_strokedata(id)
+
+    if id != 0:
+        rowdata = rowdata.with_columns(pl.lit(id).alias("workoutid"))
+        filename = 'media/strokedata_{id}.parquet.gz'.format(id=id)
+        rowdata.write_parquet(filename, compression='gzip')
+
+    return rowdata
+

 def add_efficiency(id=0):  # pragma: no cover
    rowdata, row = getrowdata_db(id=id,
@@ -2144,7 +2233,10 @@ def dataprep(rowdatadf, id=0, bands=True, barchart=True, otwpower=True,
                os.remove(filename)
                df.to_parquet(filename, engine='fastparquet', compression='GZIP')

-
+    if polars:
+        pldf = pl.from_pandas(data)
+        return pldf
+                
    return data


--- a/rowers/interactiveplots.py
+++ b/rowers/interactiveplots.py
@@ -469,18 +469,20 @@ def interactive_forcecurve(theworkouts):

    columns = ['catch', 'slip', 'wash', 'finish', 'averageforce',
               'peakforceangle', 'peakforce', 'spm', 'distance',
-               'workoutstate', 'driveenergy', 'cumdist']
+               'workoutstate', 'driveenergy', 'cumdist', 'workoutid']
+    columns = columns + [name for name, d in metrics.rowingmetrics]

-    rowdata = dataprep.getsmallrowdata_db(columns, ids=ids,
+    
+    rowdata = dataprep.getsmallrowdata_pl(columns, ids=ids,
                                          workstrokesonly=False)

-    rowdata.dropna(axis=1, how='all', inplace=True)
-    rowdata.dropna(axis=0, how='any', inplace=True)
+    rowdata = rowdata.fill_nan(None).drop_nulls()

-    if rowdata.empty:
+
+    if rowdata.is_empty():
        return "", "No Valid Data Available"

-    data_dict = rowdata.to_dict("records")
+    data_dict = rowdata.to_dicts()

    thresholdforce = 100. if 'x' in boattype else 200.
    
@@ -490,7 +492,7 @@ def interactive_forcecurve(theworkouts):
        'thresholdforce': thresholdforce,
    }

-    script, div = get_chart("/forcecurve", chart_data)
+    script, div = get_chart("/forcecurve", chart_data, debug=False)
    return script, div


@@ -822,25 +824,27 @@ def interactive_histoall(theworkouts, histoparam, includereststrokes,

    ids = [int(w.id) for w in theworkouts]

+    columns = [name for name, d in metrics.rowingmetrics]+['spm', 'driveenergy', 'distance', 'workoutstate', 'workoutid']
+
    workstrokesonly = not includereststrokes
-    rowdata = dataprep.getsmallrowdata_db(
-        [histoparam], ids=ids, doclean=True, workstrokesonly=workstrokesonly)
+    rowdata = dataprep.getsmallrowdata_pl(
+        columns, ids=ids, doclean=True, workstrokesonly=workstrokesonly)

-    rowdata.dropna(axis=0, how='any', inplace=True)
+    rowdata = rowdata.fill_nan(None).drop_nulls()

-    rowdata = dataprep.filter_df(rowdata, 'spm', spmmin, largerthan=True)
-    rowdata = dataprep.filter_df(rowdata, 'spm', spmmax, largerthan=False)
+    #rowdata = dataprep.filter_df(rowdata, 'spm', spmmin, largerthan=True)
+    #rowdata = dataprep.filter_df(rowdata, 'spm', spmmax, largerthan=False)

-    rowdata = dataprep.filter_df(
-        rowdata, 'driveenergy', workmin, largerthan=True)
-    rowdata = dataprep.filter_df(
-        rowdata, 'driveenergy', workmax, largerthan=False)
+    #rowdata = dataprep.filter_df(
+    #    rowdata, 'driveenergy', workmin, largerthan=True)
+    #rowdata = dataprep.filter_df(
+    #    rowdata, 'driveenergy', workmax, largerthan=False)

-    if rowdata.empty:
+    if rowdata.is_empty():
        return "", "No Valid Data Available"

    try:
-        histopwr = rowdata[histoparam].values
+        histopwr = rowdata[histoparam].to_numpy()
    except KeyError:
        return "", "No data"
    if len(histopwr) == 0:  # pragma: no cover
--- a/rowers/tests/testdata/testdata.tcx.gz
+++ b/rowers/tests/testdata/testdata.tcx.gz