Small improvements to dataprep
This commit is contained in:
@@ -379,6 +379,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
datadf.replace(to_replace=0, value=np.nan, inplace=True)
|
datadf.replace(to_replace=0, value=np.nan, inplace=True)
|
||||||
|
# datadf = datadf.map_partitions(lambda df:df.replace(to_replace=0,value=np.nan))
|
||||||
|
|
||||||
# bring spm back to real values
|
# bring spm back to real values
|
||||||
try:
|
try:
|
||||||
@@ -406,55 +407,55 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
|||||||
if not ignorehr:
|
if not ignorehr:
|
||||||
try:
|
try:
|
||||||
mask = datadf['hr'] < 30
|
mask = datadf['hr'] < 30
|
||||||
datadf.loc[mask, 'hr'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['spm'] < 0
|
mask = datadf['spm'] < 0
|
||||||
datadf.loc[mask,'spm'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['efficiency'] > 200.
|
mask = datadf['efficiency'] > 200.
|
||||||
datadf.loc[mask, 'efficiency'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['spm'] < 10
|
mask = datadf['spm'] < 10
|
||||||
datadf.loc[mask, 'spm'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['pace'] / 1000. > 300.
|
mask = datadf['pace'] / 1000. > 300.
|
||||||
datadf.loc[mask, 'pace'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['efficiency'] < 0.
|
mask = datadf['efficiency'] < 0.
|
||||||
datadf.loc[mask, 'efficiency'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['pace'] / 1000. < 60.
|
mask = datadf['pace'] / 1000. < 60.
|
||||||
datadf.loc[mask, 'pace'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['spm'] > 60
|
mask = datadf['spm'] > 60
|
||||||
datadf.loc[mask, 'spm'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['wash'] < 1
|
mask = datadf['wash'] > 1
|
||||||
datadf.loc[mask, 'wash'] = np.nan
|
datadf.loc[mask, 'wash'] = np.nan
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
@@ -462,67 +463,67 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
|
|||||||
if not ignoreadvanced:
|
if not ignoreadvanced:
|
||||||
try:
|
try:
|
||||||
mask = datadf['rhythm'] < 5
|
mask = datadf['rhythm'] < 5
|
||||||
datadf.loc[mask, 'rhythm'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['rhythm'] > 70
|
mask = datadf['rhythm'] > 70
|
||||||
datadf.loc[mask, 'rhythm'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['power'] < 20
|
mask = datadf['power'] < 20
|
||||||
datadf.loc[mask, 'power'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['drivelength'] < 0.5
|
mask = datadf['drivelength'] < 0.5
|
||||||
datadf.loc[mask, 'drivelength'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['forceratio'] < 0.2
|
mask = datadf['forceratio'] < 0.2
|
||||||
datadf.loc[mask, 'forceratio'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['forceratio'] > 1.0
|
mask = datadf['forceratio'] > 1.0
|
||||||
datadf.loc[mask, 'forceratio'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['drivespeed'] < 0.5
|
mask = datadf['drivespeed'] < 0.5
|
||||||
datadf.loc[mask, 'drivespeed'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['drivespeed'] > 4
|
mask = datadf['drivespeed'] > 4
|
||||||
datadf.loc[mask, 'drivespeed'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['driveenergy'] > 2000
|
mask = datadf['driveenergy'] > 2000
|
||||||
datadf.loc[mask, 'driveenergy'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['driveenergy'] < 100
|
mask = datadf['driveenergy'] < 100
|
||||||
datadf.loc[mask, 'driveenergy'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mask = datadf['catch'] > -30.
|
mask = datadf['catch'] > -30.
|
||||||
datadf.loc[mask, 'catch'] = np.nan
|
datadf.mask(mask,inplace=True)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -1772,7 +1773,7 @@ def getrowdata_db(id=0, doclean=False, convertnewtons=True,
|
|||||||
|
|
||||||
# Fetch a subset of the data from the DB
|
# Fetch a subset of the data from the DB
|
||||||
|
|
||||||
def getsmallrowdata_db(columns, ids=[], doclean=True,workstrokesonly=True):
|
def getsmallrowdata_db(columns, ids=[], doclean=True,workstrokesonly=True,compute=True):
|
||||||
prepmultipledata(ids)
|
prepmultipledata(ids)
|
||||||
|
|
||||||
csvfilenames = ['media/strokedata_{id}.parquet'.format(id=id) for id in ids]
|
csvfilenames = ['media/strokedata_{id}.parquet'.format(id=id) for id in ids]
|
||||||
@@ -1792,16 +1793,19 @@ def getsmallrowdata_db(columns, ids=[], doclean=True,workstrokesonly=True):
|
|||||||
else:
|
else:
|
||||||
df = dd.read_parquet(csvfilenames[0],columns=columns,engine='pyarrow')
|
df = dd.read_parquet(csvfilenames[0],columns=columns,engine='pyarrow')
|
||||||
|
|
||||||
data = df.compute()
|
df = df.loc[:,~df.columns.duplicated()]
|
||||||
data = data.loc[:,~data.columns.duplicated()]
|
|
||||||
extracols = []
|
|
||||||
if doclean:
|
if compute:
|
||||||
data = clean_df_stats(data, ignorehr=True,
|
data = df.compute()
|
||||||
workstrokesonly=workstrokesonly)
|
if doclean:
|
||||||
|
data = clean_df_stats(data, ignorehr=True,
|
||||||
|
workstrokesonly=workstrokesonly)
|
||||||
data.dropna(axis=1,how='all',inplace=True)
|
data.dropna(axis=1,how='all',inplace=True)
|
||||||
data.dropna(axis=0,how='any',inplace=True)
|
data.dropna(axis=0,how='any',inplace=True)
|
||||||
|
return data
|
||||||
return data
|
|
||||||
|
return df
|
||||||
|
|
||||||
def getsmallrowdata_db_old(columns, ids=[], doclean=True, workstrokesonly=True):
|
def getsmallrowdata_db_old(columns, ids=[], doclean=True, workstrokesonly=True):
|
||||||
prepmultipledata(ids)
|
prepmultipledata(ids)
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ from pandas import DataFrame,Series
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import itertools
|
import itertools
|
||||||
|
import dask.dataframe as dd
|
||||||
|
from dask.delayed import delayed
|
||||||
|
|
||||||
from sqlalchemy import create_engine
|
from sqlalchemy import create_engine
|
||||||
import sqlalchemy as sa
|
import sqlalchemy as sa
|
||||||
|
|||||||
Reference in New Issue
Block a user