Private
Public Access
1
0

improved data cleaning

This commit is contained in:
Sander Roosendaal
2019-10-31 17:17:03 +01:00
parent f0a2100bb8
commit f183579237
2 changed files with 302 additions and 270 deletions

View File

@@ -353,7 +353,6 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
ignoreadvanced=False): ignoreadvanced=False):
# clean data remove zeros and negative values # clean data remove zeros and negative values
# bring metrics which have negative values to positive domain # bring metrics which have negative values to positive domain
if len(datadf)==0: if len(datadf)==0:
return datadf return datadf
@@ -378,11 +377,30 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
except (KeyError,TypeError) as e: except (KeyError,TypeError) as e:
pass pass
try: try:
datadf = datadf.clip(lower=0) datadf = datadf.clip(lower=0)
except TypeError: except TypeError:
pass pass
# protect advanced metrics columns
advancedcols = [
'rhythm',
'power',
'drivelength',
'forceratio',
'drivespeed',
'driveenergy',
'catch',
'finish',
'averageforce',
'peakforce',
'slip',
'wash',
'peakforceangle',
'effectiveangle',
]
datadf.replace(to_replace=0, value=np.nan, inplace=True) datadf.replace(to_replace=0, value=np.nan, inplace=True)
# datadf = datadf.map_partitions(lambda df:df.replace(to_replace=0,value=np.nan)) # datadf = datadf.map_partitions(lambda df:df.replace(to_replace=0,value=np.nan))
@@ -392,6 +410,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
except (TypeError,KeyError) as e: except (TypeError,KeyError) as e:
pass pass
# return from positive domain to negative # return from positive domain to negative
try: try:
datadf['catch'] = -datadf['catch'] datadf['catch'] = -datadf['catch']
@@ -422,12 +441,14 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
except (KeyError,TypeError): except (KeyError,TypeError):
pass pass
try: try:
mask = datadf['efficiency'] > 200. mask = datadf['efficiency'] > 200.
datadf.mask(mask,inplace=True) datadf.mask(mask,inplace=True)
except (KeyError,TypeError): except (KeyError,TypeError):
pass pass
try: try:
mask = datadf['spm'] < 10 mask = datadf['spm'] < 10
datadf.mask(mask,inplace=True) datadf.mask(mask,inplace=True)
@@ -459,15 +480,23 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
except (KeyError,TypeError): except (KeyError,TypeError):
pass pass
try: try:
mask = datadf['wash'] > 1 mask = datadf['wash'] > 1
datadf.loc[mask, 'wash'] = np.nan datadf.loc[mask, 'wash'] = np.nan
except (KeyError,TypeError): except (KeyError,TypeError):
pass pass
# try to guess ignoreadvanced
if not ignoreadvanced:
for metric in advancedcols:
sum = datadf[metric].std()
if sum == 0 or np.isnan(sum):
ignoreadvanced = True
if not ignoreadvanced: if not ignoreadvanced:
try: try:
mask = datadf['rhythm'] < 5 mask = datadf['rhythm'] < 0
datadf.mask(mask,inplace=True) datadf.mask(mask,inplace=True)
except (KeyError,TypeError): except (KeyError,TypeError):
pass pass
@@ -532,6 +561,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
except (KeyError,TypeError): except (KeyError,TypeError):
pass pass
workoutstateswork = [1, 4, 5, 8, 9, 6, 7] workoutstateswork = [1, 4, 5, 8, 9, 6, 7]
workoutstatesrest = [3] workoutstatesrest = [3]
workoutstatetransition = [0, 2, 10, 11, 12, 13] workoutstatetransition = [0, 2, 10, 11, 12, 13]

View File

@@ -2516,11 +2516,14 @@ def workout_stats_view(request,id=0,message="",successmessage=""):
if (checkworkoutuserview(request.user,row)==False): if (checkworkoutuserview(request.user,row)==False):
raise PermissionDenied('Access Denied') raise PermissionDenied('Access Denied')
datadf = dataprep.clean_df_stats(datadf,workstrokesonly=workstrokesonly) datadf = dataprep.clean_df_stats(datadf,workstrokesonly=workstrokesonly,
ignoreadvanced=False)
if datadf.empty: if datadf.empty:
datadf,row = dataprep.getrowdata_db(id=encoder.decode_hex(id)) datadf,row = dataprep.getrowdata_db(id=encoder.decode_hex(id))
datadf = dataprep.clean_df_stats(datadf,workstrokesonly=False) datadf = dataprep.clean_df_stats(datadf,workstrokesonly=False,
ignoreadvanced=True)
workstrokesonly=False workstrokesonly=False
if datadf.empty: if datadf.empty:
return HttpResponse("CSV data file not found") return HttpResponse("CSV data file not found")
@@ -5372,4 +5375,3 @@ class WorkoutDelete(DeleteView):
raise PermissionDenied('You are not allowed to delete this workout') raise PermissionDenied('You are not allowed to delete this workout')
return obj return obj