Private
Public Access
1
0

improved data cleaning

This commit is contained in:
Sander Roosendaal
2019-10-31 17:17:03 +01:00
parent f0a2100bb8
commit f183579237
2 changed files with 302 additions and 270 deletions

View File

@@ -353,7 +353,6 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
ignoreadvanced=False):
# clean data: remove zeros and negative values
# bring metrics which have negative values into the positive domain
if len(datadf)==0:
return datadf
@@ -378,11 +377,30 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
except (KeyError,TypeError) as e:
pass
try:
datadf = datadf.clip(lower=0)
except TypeError:
pass
# protect advanced metrics columns
advancedcols = [
'rhythm',
'power',
'drivelength',
'forceratio',
'drivespeed',
'driveenergy',
'catch',
'finish',
'averageforce',
'peakforce',
'slip',
'wash',
'peakforceangle',
'effectiveangle',
]
datadf.replace(to_replace=0, value=np.nan, inplace=True)
# datadf = datadf.map_partitions(lambda df:df.replace(to_replace=0,value=np.nan))
@@ -392,6 +410,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
except (TypeError,KeyError) as e:
pass
# return the sign-flipped metrics from the positive domain back to the negative domain
try:
datadf['catch'] = -datadf['catch']
@@ -422,12 +441,14 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
except (KeyError,TypeError):
pass
try:
mask = datadf['efficiency'] > 200.
datadf.mask(mask,inplace=True)
except (KeyError,TypeError):
pass
try:
mask = datadf['spm'] < 10
datadf.mask(mask,inplace=True)
@@ -459,15 +480,23 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
except (KeyError,TypeError):
pass
try:
mask = datadf['wash'] > 1
datadf.loc[mask, 'wash'] = np.nan
except (KeyError,TypeError):
pass
# try to guess ignoreadvanced
if not ignoreadvanced:
for metric in advancedcols:
sum = datadf[metric].std()
if sum == 0 or np.isnan(sum):
ignoreadvanced = True
if not ignoreadvanced:
try:
mask = datadf['rhythm'] < 5
mask = datadf['rhythm'] < 0
datadf.mask(mask,inplace=True)
except (KeyError,TypeError):
pass
@@ -532,6 +561,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
except (KeyError,TypeError):
pass
workoutstateswork = [1, 4, 5, 8, 9, 6, 7]
workoutstatesrest = [3]
workoutstatetransition = [0, 2, 10, 11, 12, 13]