improved data cleaning
This commit is contained in:
@@ -353,7 +353,6 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
||||
ignoreadvanced=False):
|
||||
# clean data: remove zeros and negative values
|
||||
|
||||
|
||||
# bring metrics which have negative values to positive domain
|
||||
if len(datadf)==0:
|
||||
return datadf
|
||||
@@ -378,11 +377,30 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
||||
except (KeyError,TypeError) as e:
|
||||
pass
|
||||
|
||||
|
||||
try:
|
||||
datadf = datadf.clip(lower=0)
|
||||
except TypeError:
|
||||
pass
|
||||
|
||||
# protect advanced metrics columns
|
||||
advancedcols = [
|
||||
'rhythm',
|
||||
'power',
|
||||
'drivelength',
|
||||
'forceratio',
|
||||
'drivespeed',
|
||||
'driveenergy',
|
||||
'catch',
|
||||
'finish',
|
||||
'averageforce',
|
||||
'peakforce',
|
||||
'slip',
|
||||
'wash',
|
||||
'peakforceangle',
|
||||
'effectiveangle',
|
||||
]
|
||||
|
||||
datadf.replace(to_replace=0, value=np.nan, inplace=True)
|
||||
# datadf = datadf.map_partitions(lambda df:df.replace(to_replace=0,value=np.nan))
|
||||
|
||||
@@ -392,6 +410,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
||||
except (TypeError,KeyError) as e:
|
||||
pass
|
||||
|
||||
|
||||
# return metrics from the positive domain back to the negative domain
|
||||
try:
|
||||
datadf['catch'] = -datadf['catch']
|
||||
@@ -422,12 +441,14 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
||||
except (KeyError,TypeError):
|
||||
pass
|
||||
|
||||
|
||||
try:
|
||||
mask = datadf['efficiency'] > 200.
|
||||
datadf.mask(mask,inplace=True)
|
||||
except (KeyError,TypeError):
|
||||
pass
|
||||
|
||||
|
||||
try:
|
||||
mask = datadf['spm'] < 10
|
||||
datadf.mask(mask,inplace=True)
|
||||
@@ -459,15 +480,23 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
||||
except (KeyError,TypeError):
|
||||
pass
|
||||
|
||||
|
||||
try:
|
||||
mask = datadf['wash'] > 1
|
||||
datadf.loc[mask, 'wash'] = np.nan
|
||||
except (KeyError,TypeError):
|
||||
pass
|
||||
|
||||
# try to guess ignoreadvanced
|
||||
if not ignoreadvanced:
|
||||
for metric in advancedcols:
|
||||
sum = datadf[metric].std()
|
||||
if sum == 0 or np.isnan(sum):
|
||||
ignoreadvanced = True
|
||||
|
||||
if not ignoreadvanced:
|
||||
try:
|
||||
mask = datadf['rhythm'] < 5
|
||||
mask = datadf['rhythm'] < 0
|
||||
datadf.mask(mask,inplace=True)
|
||||
except (KeyError,TypeError):
|
||||
pass
|
||||
@@ -532,6 +561,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
||||
except (KeyError,TypeError):
|
||||
pass
|
||||
|
||||
|
||||
workoutstateswork = [1, 4, 5, 8, 9, 6, 7]
|
||||
workoutstatesrest = [3]
|
||||
workoutstatetransition = [0, 2, 10, 11, 12, 13]
|
||||
|
||||
Reference in New Issue
Block a user