diff --git a/requirements.txt b/requirements.txt index 76e37d6a..86e57035 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ certifi==2019.3.9 cffi==1.12.2 chardet==3.0.4 Click==7.0 +cloudpickle==1.2.2 colorama==0.4.1 colorclass==2.2.0 cookies==2.2.1 @@ -27,7 +28,7 @@ coreschema==0.0.4 coverage==4.5.3 cryptography==2.6.1 cycler==0.10.0 -dask==1.1.4 +dask==2.6.0 decorator==4.4.0 defusedxml==0.5.0 Django==2.1.7 @@ -39,7 +40,7 @@ django-cookie-law==2.0.1 django-cors-headers==2.5.2 django-countries==5.3.3 django-datetime-widget==0.9.3 -django-debug-toolbar==1.11 +django-debug-toolbar==2.0 django-extensions==2.1.6 django-htmlmin==0.11.0 django-leaflet==0.24.0 @@ -64,8 +65,10 @@ entrypoints==0.3 execnet==1.5.0 factory-boy==2.11.1 Faker==1.0.4 +fastparquet==0.3.2 fitparse==1.1.0 Flask==1.0.2 +fsspec==0.5.2 future==0.17.1 geocoder==1.38.1 geos==0.2.1 @@ -74,6 +77,7 @@ html5lib==1.0.1 htmlmin==0.1.12 HTMLParser==0.0.2 httplib2==0.12.1 +hvplot==0.4.0 icalendar==4.0.3 idna==2.8 image==1.5.27 @@ -99,10 +103,12 @@ jupyterlab-server==0.3.0 keyring==18.0.0 kiwisolver==1.0.1 kombu==4.5.0 +llvmlite==0.30.0 lxml==4.3.2 Markdown==3.0.1 MarkupSafe==1.1.1 matplotlib==3.0.3 +minify==0.1.4 MiniMockTest==0.5 mistune==0.8.4 mock==2.0.0 @@ -111,9 +117,11 @@ mpld3==0.3 mysqlclient==1.4.2.post1 nbconvert==5.4.1 nbformat==4.4.0 +newrelic==5.2.1.129 nose==1.3.7 nose-parameterized==0.6.0 notebook==5.7.6 +numba==0.46.0 numpy==1.16.2 oauth2==1.9.0.post1 oauthlib==3.0.1 @@ -135,6 +143,7 @@ prompt-toolkit==2.0.9 psycopg2==2.8.1 ptyprocess==0.6.0 py==1.8.0 +pyarrow==0.15.0 pycparser==2.19 Pygments==2.3.1 pyparsing==2.3.1 @@ -160,7 +169,7 @@ ratelim==0.1.6 redis==3.2.1 requests==2.21.0 requests-oauthlib==1.2.0 -rowingdata==2.5.4 +rowingdata==2.5.5 rowingphysics==0.5.0 rq==0.13.0 scipy==1.2.1 @@ -179,7 +188,9 @@ terminado==0.8.1 terminaltables==3.1.0 testpath==0.4.2 text-unidecode==1.2 +thrift==0.11.0 timezonefinder==4.0.1 +toolz==0.10.0 tornado==6.0.1 tqdm==4.31.1 traitlets==4.3.2 @@ -196,3 +207,4 @@ xlrd==1.2.0 xmltodict==0.12.0 yamjam==0.1.7 yamllint==1.15.0 +yuicompressor==2.4.8 diff --git a/rowers/dataprep.py b/rowers/dataprep.py index 9d43a330..01a649a7 100644 --- a/rowers/dataprep.py +++ b/rowers/dataprep.py @@ -4,7 +4,6 @@ from __future__ import print_function from __future__ import unicode_literals - # All the data preparation, data cleaning and data mangling should # be defined here from __future__ import unicode_literals, absolute_import @@ -26,6 +25,8 @@ from rowers.tasks import handle_sendemail_unrecognized from rowers.tasks import handle_zip_file from pandas import DataFrame, Series +import dask.dataframe as dd +from dask.delayed import delayed from django.utils import timezone from django.utils.timezone import get_current_timezone @@ -349,7 +350,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True, # clean data remove zeros and negative values # bring metrics which have negative values to positive domain - if datadf.empty: + if len(datadf)==0: return datadf try: datadf['catch'] = -datadf['catch'] @@ -1771,8 +1772,31 @@ def getrowdata_db(id=0, doclean=False, convertnewtons=True, # Fetch a subset of the data from the DB +def getsmallrowdata_db(columns, ids=[], doclean=True,workstrokesonly=True): + prepmultipledata(ids) -def getsmallrowdata_db(columns, ids=[], doclean=True, workstrokesonly=True): + csvfilenames = ['media/strokedata_{id}.parquet'.format(id=id) for id in ids] + data = [] + columns = [c for c in columns if c != 'None'] + + for f in csvfilenames: + df = dd.read_parquet(f,columns=columns,engine='pyarrow') + data.append(df) + + df = dd.concat(data,axis=0) + + data = df.compute() + data = data.loc[:,~data.columns.duplicated()] + extracols = [] + if doclean: + data = clean_df_stats(data, ignorehr=True, + workstrokesonly=workstrokesonly) + data.dropna(axis=1,how='all',inplace=True) + data.dropna(axis=0,how='any',inplace=True) + + return data + +def getsmallrowdata_db_old(columns, ids=[], doclean=True, workstrokesonly=True): prepmultipledata(ids) data,extracols = read_cols_df_sql(ids, columns) if extracols and len(ids)==1: @@ -1850,31 +1874,20 @@ def getrowdata(id=0): # safety net for programming errors elsewhere in the app # Also used heavily when I moved from CSV file only to CSV+Stroke data +import glob def prepmultipledata(ids, verbose=False): - query = sa.text('SELECT DISTINCT workoutid FROM strokedata') - engine = create_engine(database_url, echo=False) + filenames = glob.glob('media/*.parquet') + ids = [id for id in ids if 'media/strokedata_{id}.parquet'.format(id=id) not in filenames] - with engine.connect() as conn, conn.begin(): - res = conn.execute(query) - res = list(itertools.chain.from_iterable(res.fetchall())) - conn.close() - engine.dispose() - - try: - ids2 = [int(id) for id in ids] - except ValueError: - ids2 = ids - - res = list(set(ids2) - set(res)) - for id in res: + for id in ids: rowdata, row = getrowdata(id=id) if verbose: print(id) if rowdata and len(rowdata.df): data = dataprep(rowdata.df, id=id, bands=True, barchart=True, otwpower=True) - return res + return ids # Read a set of columns for a set of workout ids, returns data as a # pandas dataframe @@ -2292,19 +2305,6 @@ def dataprep(rowdatadf, id=0, bands=True, barchart=True, otwpower=True, except KeyError: rowdatadf[' ElapsedTime (sec)'] = rowdatadf['TimeStamp (sec)'] - if barchart: - # time increments for bar chart - time_increments = rowdatadf.loc[:, ' ElapsedTime (sec)'].diff() - try: - time_increments.iloc[0] = time_increments.iloc[1] - except (KeyError, IndexError): - time_increments.iloc[0] = 1. - - time_increments = 0.5 * time_increments + 0.5 * np.abs(time_increments) - x_right = (t2 + time_increments.apply(lambda x: timedeltaconv(x))) - - data['x_right'] = x_right - if empower: try: wash = rowdatadf.loc[:, 'wash'] @@ -2441,12 +2441,10 @@ def dataprep(rowdatadf, id=0, bands=True, barchart=True, otwpower=True, # write data if id given if id != 0: data['workoutid'] = id + filename = 'media/strokedata_{id}.parquet'.format(id=id) +# df = dd.from_pandas(data,npartitions=1) + data.to_parquet(filename,engine='pyarrow') - engine = create_engine(database_url, echo=False) - with engine.connect() as conn, conn.begin(): - data.to_sql('strokedata', engine, if_exists='append', index=False) - conn.close() - engine.dispose() return data