Private
Public Access
1
0

Small improvements to dataprep

This commit is contained in:
Sander Roosendaal
2019-10-23 06:48:58 +02:00
parent 8e2f7b0cce
commit 4ea24fa5aa
2 changed files with 35 additions and 29 deletions

View File

@@ -379,6 +379,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
pass pass
datadf.replace(to_replace=0, value=np.nan, inplace=True) datadf.replace(to_replace=0, value=np.nan, inplace=True)
# datadf = datadf.map_partitions(lambda df:df.replace(to_replace=0,value=np.nan))
# bring spm back to real values # bring spm back to real values
try: try:
@@ -406,55 +407,55 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
if not ignorehr: if not ignorehr:
try: try:
mask = datadf['hr'] < 30 mask = datadf['hr'] < 30
datadf.loc[mask, 'hr'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['spm'] < 0 mask = datadf['spm'] < 0
datadf.loc[mask,'spm'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['efficiency'] > 200. mask = datadf['efficiency'] > 200.
datadf.loc[mask, 'efficiency'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['spm'] < 10 mask = datadf['spm'] < 10
datadf.loc[mask, 'spm'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['pace'] / 1000. > 300. mask = datadf['pace'] / 1000. > 300.
datadf.loc[mask, 'pace'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['efficiency'] < 0. mask = datadf['efficiency'] < 0.
datadf.loc[mask, 'efficiency'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['pace'] / 1000. < 60. mask = datadf['pace'] / 1000. < 60.
datadf.loc[mask, 'pace'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['spm'] > 60 mask = datadf['spm'] > 60
datadf.loc[mask, 'spm'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['wash'] < 1 mask = datadf['wash'] > 1
datadf.loc[mask, 'wash'] = np.nan datadf.loc[mask, 'wash'] = np.nan
except KeyError: except KeyError:
pass pass
@@ -462,67 +463,67 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
if not ignoreadvanced: if not ignoreadvanced:
try: try:
mask = datadf['rhythm'] < 5 mask = datadf['rhythm'] < 5
datadf.loc[mask, 'rhythm'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['rhythm'] > 70 mask = datadf['rhythm'] > 70
datadf.loc[mask, 'rhythm'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['power'] < 20 mask = datadf['power'] < 20
datadf.loc[mask, 'power'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['drivelength'] < 0.5 mask = datadf['drivelength'] < 0.5
datadf.loc[mask, 'drivelength'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['forceratio'] < 0.2 mask = datadf['forceratio'] < 0.2
datadf.loc[mask, 'forceratio'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['forceratio'] > 1.0 mask = datadf['forceratio'] > 1.0
datadf.loc[mask, 'forceratio'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['drivespeed'] < 0.5 mask = datadf['drivespeed'] < 0.5
datadf.loc[mask, 'drivespeed'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['drivespeed'] > 4 mask = datadf['drivespeed'] > 4
datadf.loc[mask, 'drivespeed'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['driveenergy'] > 2000 mask = datadf['driveenergy'] > 2000
datadf.loc[mask, 'driveenergy'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['driveenergy'] < 100 mask = datadf['driveenergy'] < 100
datadf.loc[mask, 'driveenergy'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
try: try:
mask = datadf['catch'] > -30. mask = datadf['catch'] > -30.
datadf.loc[mask, 'catch'] = np.nan datadf.mask(mask,inplace=True)
except KeyError: except KeyError:
pass pass
@@ -1772,7 +1773,7 @@ def getrowdata_db(id=0, doclean=False, convertnewtons=True,
# Fetch a subset of the data from the DB # Fetch a subset of the data from the DB
def getsmallrowdata_db(columns, ids=[], doclean=True,workstrokesonly=True): def getsmallrowdata_db(columns, ids=[], doclean=True,workstrokesonly=True,compute=True):
prepmultipledata(ids) prepmultipledata(ids)
csvfilenames = ['media/strokedata_{id}.parquet'.format(id=id) for id in ids] csvfilenames = ['media/strokedata_{id}.parquet'.format(id=id) for id in ids]
@@ -1792,16 +1793,19 @@ def getsmallrowdata_db(columns, ids=[], doclean=True,workstrokesonly=True):
else: else:
df = dd.read_parquet(csvfilenames[0],columns=columns,engine='pyarrow') df = dd.read_parquet(csvfilenames[0],columns=columns,engine='pyarrow')
data = df.compute() df = df.loc[:,~df.columns.duplicated()]
data = data.loc[:,~data.columns.duplicated()]
extracols = []
if doclean: if compute:
data = clean_df_stats(data, ignorehr=True, data = df.compute()
workstrokesonly=workstrokesonly) if doclean:
data = clean_df_stats(data, ignorehr=True,
workstrokesonly=workstrokesonly)
data.dropna(axis=1,how='all',inplace=True) data.dropna(axis=1,how='all',inplace=True)
data.dropna(axis=0,how='any',inplace=True) data.dropna(axis=0,how='any',inplace=True)
return data
return data return df
def getsmallrowdata_db_old(columns, ids=[], doclean=True, workstrokesonly=True): def getsmallrowdata_db_old(columns, ids=[], doclean=True, workstrokesonly=True):
prepmultipledata(ids) prepmultipledata(ids)

View File

@@ -16,6 +16,8 @@ from pandas import DataFrame,Series
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import itertools import itertools
import dask.dataframe as dd
from dask.delayed import delayed
from sqlalchemy import create_engine from sqlalchemy import create_engine
import sqlalchemy as sa import sqlalchemy as sa