improved data cleaning

2019-10-31 17:17:03 +01:00
parent f0a2100bb8
commit f183579237
2 changed files with 302 additions and 270 deletions
--- a/rowers/dataprep.py
+++ b/rowers/dataprep.py
@@ -353,7 +353,6 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
                   ignoreadvanced=False):
    # clean data remove zeros and negative values

-
    # bring metrics which have negative values to positive domain
    if len(datadf)==0:
        return datadf
@@ -378,11 +377,30 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
    except (KeyError,TypeError) as e:
        pass

+
    try:
        datadf = datadf.clip(lower=0)
    except TypeError:
        pass

+    # names of the advanced metrics columns, checked below when guessing ignoreadvanced
+    advancedcols = [
+        'rhythm',
+        'power',
+        'drivelength',
+        'forceratio',
+        'drivespeed',
+        'driveenergy',
+        'catch',
+        'finish',
+        'averageforce',
+        'peakforce',
+        'slip',
+        'wash',
+        'peakforceangle',
+        'effectiveangle',
+    ]
+
    datadf.replace(to_replace=0, value=np.nan, inplace=True)
    #    datadf = datadf.map_partitions(lambda df:df.replace(to_replace=0,value=np.nan))

@@ -392,6 +410,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
    except (TypeError,KeyError) as e:
        pass

+
    # return from positive domain to negative
    try:
        datadf['catch'] = -datadf['catch']
@@ -422,12 +441,14 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
    except (KeyError,TypeError):
        pass

+
    try:
        mask = datadf['efficiency'] > 200.
        datadf.mask(mask,inplace=True)
    except (KeyError,TypeError):
        pass

+
    try:
        mask = datadf['spm'] < 10
        datadf.mask(mask,inplace=True)
@@ -459,15 +480,23 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
    except (KeyError,TypeError):
        pass

+
    try:
        mask = datadf['wash'] > 1
        datadf.loc[mask, 'wash'] = np.nan
    except (KeyError,TypeError):
        pass

+    # guess ignoreadvanced: set it when any advanced metric column has a std() of 0 or NaN
+    if not ignoreadvanced:
+        for metric in advancedcols:
+            sum = datadf[metric].std()
+            if sum == 0 or np.isnan(sum):
+                ignoreadvanced = True
+
    if not ignoreadvanced:
        try:
-            mask = datadf['rhythm'] < 5
+            mask = datadf['rhythm'] < 0
            datadf.mask(mask,inplace=True)
        except (KeyError,TypeError):
            pass
@@ -532,6 +561,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
        except (KeyError,TypeError):
            pass

+
    workoutstateswork = [1, 4, 5, 8, 9, 6, 7]
    workoutstatesrest = [3]
    workoutstatetransition = [0, 2, 10, 11, 12, 13]