From 622ae44ea64f9ed580c4499ca7e101f99b4c674e Mon Sep 17 00:00:00 2001 From: Sander Roosendaal Date: Tue, 22 Oct 2019 21:40:35 +0200 Subject: [PATCH] using dask --- rowers/dataprep.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/rowers/dataprep.py b/rowers/dataprep.py index 01a649a7..bdc2ef4d 100644 --- a/rowers/dataprep.py +++ b/rowers/dataprep.py @@ -1779,11 +1779,15 @@ def getsmallrowdata_db(columns, ids=[], doclean=True,workstrokesonly=True): data = [] columns = [c for c in columns if c != 'None'] - for f in csvfilenames: - df = dd.read_parquet(f,columns=columns,engine='pyarrow') - data.append(df) + if len(ids)>1: + for f in csvfilenames: + df = dd.read_parquet(f,columns=columns,engine='pyarrow') + data.append(df) - df = dd.concat(data,axis=0) + + df = dd.concat(data,axis=0) + else: + df = dd.read_parquet(csvfilenames[0],columns=columns,engine='pyarrow') data = df.compute() data = data.loc[:,~data.columns.duplicated()] @@ -2443,7 +2447,7 @@ def dataprep(rowdatadf, id=0, bands=True, barchart=True, otwpower=True, data['workoutid'] = id filename = 'media/strokedata_{id}.parquet'.format(id=id) # df = dd.from_pandas(data,npartitions=1) - data.to_parquet(filename,engine='pyarrow') + data.to_parquet(filename,engine='pyarrow',compression='gzip') return data