improved data cleaning
This commit is contained in:
@@ -353,7 +353,6 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
|||||||
ignoreadvanced=False):
|
ignoreadvanced=False):
|
||||||
# clean data remove zeros and negative values
|
# clean data remove zeros and negative values
|
||||||
|
|
||||||
|
|
||||||
# bring metrics which have negative values to positive domain
|
# bring metrics which have negative values to positive domain
|
||||||
if len(datadf)==0:
|
if len(datadf)==0:
|
||||||
return datadf
|
return datadf
|
||||||
@@ -378,11 +377,30 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
|||||||
except (KeyError,TypeError) as e:
|
except (KeyError,TypeError) as e:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
datadf = datadf.clip(lower=0)
|
datadf = datadf.clip(lower=0)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# protect advanced metrics columns
|
||||||
|
advancedcols = [
|
||||||
|
'rhythm',
|
||||||
|
'power',
|
||||||
|
'drivelength',
|
||||||
|
'forceratio',
|
||||||
|
'drivespeed',
|
||||||
|
'driveenergy',
|
||||||
|
'catch',
|
||||||
|
'finish',
|
||||||
|
'averageforce',
|
||||||
|
'peakforce',
|
||||||
|
'slip',
|
||||||
|
'wash',
|
||||||
|
'peakforceangle',
|
||||||
|
'effectiveangle',
|
||||||
|
]
|
||||||
|
|
||||||
datadf.replace(to_replace=0, value=np.nan, inplace=True)
|
datadf.replace(to_replace=0, value=np.nan, inplace=True)
|
||||||
# datadf = datadf.map_partitions(lambda df:df.replace(to_replace=0,value=np.nan))
|
# datadf = datadf.map_partitions(lambda df:df.replace(to_replace=0,value=np.nan))
|
||||||
|
|
||||||
@@ -392,6 +410,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
|||||||
except (TypeError,KeyError) as e:
|
except (TypeError,KeyError) as e:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
# return from positive domain to negative
|
# return from positive domain to negative
|
||||||
try:
|
try:
|
||||||
datadf['catch'] = -datadf['catch']
|
datadf['catch'] = -datadf['catch']
|
||||||
@@ -422,12 +441,14 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
|||||||
except (KeyError,TypeError):
|
except (KeyError,TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['efficiency'] > 200.
|
mask = datadf['efficiency'] > 200.
|
||||||
datadf.mask(mask,inplace=True)
|
datadf.mask(mask,inplace=True)
|
||||||
except (KeyError,TypeError):
|
except (KeyError,TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['spm'] < 10
|
mask = datadf['spm'] < 10
|
||||||
datadf.mask(mask,inplace=True)
|
datadf.mask(mask,inplace=True)
|
||||||
@@ -459,15 +480,23 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
|||||||
except (KeyError,TypeError):
|
except (KeyError,TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['wash'] > 1
|
mask = datadf['wash'] > 1
|
||||||
datadf.loc[mask, 'wash'] = np.nan
|
datadf.loc[mask, 'wash'] = np.nan
|
||||||
except (KeyError,TypeError):
|
except (KeyError,TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# try to guess ignoreadvanced
|
||||||
|
if not ignoreadvanced:
|
||||||
|
for metric in advancedcols:
|
||||||
|
sum = datadf[metric].std()
|
||||||
|
if sum == 0 or np.isnan(sum):
|
||||||
|
ignoreadvanced = True
|
||||||
|
|
||||||
if not ignoreadvanced:
|
if not ignoreadvanced:
|
||||||
try:
|
try:
|
||||||
mask = datadf['rhythm'] < 5
|
mask = datadf['rhythm'] < 0
|
||||||
datadf.mask(mask,inplace=True)
|
datadf.mask(mask,inplace=True)
|
||||||
except (KeyError,TypeError):
|
except (KeyError,TypeError):
|
||||||
pass
|
pass
|
||||||
@@ -532,6 +561,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
|||||||
except (KeyError,TypeError):
|
except (KeyError,TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
workoutstateswork = [1, 4, 5, 8, 9, 6, 7]
|
workoutstateswork = [1, 4, 5, 8, 9, 6, 7]
|
||||||
workoutstatesrest = [3]
|
workoutstatesrest = [3]
|
||||||
workoutstatetransition = [0, 2, 10, 11, 12, 13]
|
workoutstatetransition = [0, 2, 10, 11, 12, 13]
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user