improved data cleaning

2019-10-31 17:17:03 +01:00
parent f0a2100bb8
commit f183579237
2 changed files with 302 additions and 270 deletions
@@ -353,7 +353,6 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
                   ignoreadvanced=False):
    # clean data: remove zeros and negative values
    # bring metrics that have negative values into the positive domain
    if len(datadf)==0:
        return datadf
@@ -378,11 +377,30 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
    except (KeyError,TypeError) as e:
        pass
    try:
        datadf = datadf.clip(lower=0)
    except TypeError:
        pass
    # protect advanced metrics columns
    advancedcols = [
        'rhythm',
        'power',
        'drivelength',
        'forceratio',
        'drivespeed',
        'driveenergy',
        'catch',
        'finish',
        'averageforce',
        'peakforce',
        'slip',
        'wash',
        'peakforceangle',
        'effectiveangle',
    ]
    datadf.replace(to_replace=0, value=np.nan, inplace=True)
    #    datadf = datadf.map_partitions(lambda df:df.replace(to_replace=0,value=np.nan))
@@ -392,6 +410,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
    except (TypeError,KeyError) as e:
        pass
    # return metrics from the positive domain back to the negative domain
    try:
        datadf['catch'] = -datadf['catch']
@@ -422,12 +441,14 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
    except (KeyError,TypeError):
        pass
    try:
        mask = datadf['efficiency'] > 200.
        datadf.mask(mask,inplace=True)
    except (KeyError,TypeError):
        pass
    try:
        mask = datadf['spm'] < 10
        datadf.mask(mask,inplace=True)
@@ -459,15 +480,23 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
    except (KeyError,TypeError):
        pass
    try:
        mask = datadf['wash'] > 1
        datadf.loc[mask, 'wash'] = np.nan
    except (KeyError,TypeError):
        pass
    # try to guess ignoreadvanced: set it when any advanced metric column
    # has a standard deviation that is zero or NaN
    if not ignoreadvanced:
        for metric in advancedcols:
            sum = datadf[metric].std()
            if sum == 0 or np.isnan(sum):
                ignoreadvanced = True
    if not ignoreadvanced:
        try:
-            mask = datadf['rhythm'] < 5
+            mask = datadf['rhythm'] < 0
            datadf.mask(mask,inplace=True)
        except (KeyError,TypeError):
            pass
@@ -532,6 +561,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
        except (KeyError,TypeError):
            pass
    workoutstateswork = [1, 4, 5, 8, 9, 6, 7]
    workoutstatesrest = [3]
    workoutstatetransition = [0, 2, 10, 11, 12, 13]