Store per-workout stroke data as parquet files (read with dask) instead of the strokedata SQL table

2019-10-22 21:34:03 +02:00
parent 6de6a0dae1
commit 7f2c68a903
2 changed files with 50 additions and 40 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,6 +19,7 @@ certifi==2019.3.9
 cffi==1.12.2
 chardet==3.0.4
 Click==7.0
+cloudpickle==1.2.2
 colorama==0.4.1
 colorclass==2.2.0
 cookies==2.2.1
@@ -27,7 +28,7 @@ coreschema==0.0.4
 coverage==4.5.3
 cryptography==2.6.1
 cycler==0.10.0
-dask==1.1.4
+dask==2.6.0
 decorator==4.4.0
 defusedxml==0.5.0
 Django==2.1.7
@@ -39,7 +40,7 @@ django-cookie-law==2.0.1
 django-cors-headers==2.5.2
 django-countries==5.3.3
 django-datetime-widget==0.9.3
-django-debug-toolbar==1.11
+django-debug-toolbar==2.0
 django-extensions==2.1.6
 django-htmlmin==0.11.0
 django-leaflet==0.24.0
@@ -64,8 +65,10 @@ entrypoints==0.3
 execnet==1.5.0
 factory-boy==2.11.1
 Faker==1.0.4
+fastparquet==0.3.2
 fitparse==1.1.0
 Flask==1.0.2
+fsspec==0.5.2
 future==0.17.1
 geocoder==1.38.1
 geos==0.2.1
@@ -74,6 +77,7 @@ html5lib==1.0.1
 htmlmin==0.1.12
 HTMLParser==0.0.2
 httplib2==0.12.1
+hvplot==0.4.0
 icalendar==4.0.3
 idna==2.8
 image==1.5.27
@@ -99,10 +103,12 @@ jupyterlab-server==0.3.0
 keyring==18.0.0
 kiwisolver==1.0.1
 kombu==4.5.0
+llvmlite==0.30.0
 lxml==4.3.2
 Markdown==3.0.1
 MarkupSafe==1.1.1
 matplotlib==3.0.3
+minify==0.1.4
 MiniMockTest==0.5
 mistune==0.8.4
 mock==2.0.0
@@ -111,9 +117,11 @@ mpld3==0.3
 mysqlclient==1.4.2.post1
 nbconvert==5.4.1
 nbformat==4.4.0
+newrelic==5.2.1.129
 nose==1.3.7
 nose-parameterized==0.6.0
 notebook==5.7.6
+numba==0.46.0
 numpy==1.16.2
 oauth2==1.9.0.post1
 oauthlib==3.0.1
@@ -135,6 +143,7 @@ prompt-toolkit==2.0.9
 psycopg2==2.8.1
 ptyprocess==0.6.0
 py==1.8.0
+pyarrow==0.15.0
 pycparser==2.19
 Pygments==2.3.1
 pyparsing==2.3.1
@@ -160,7 +169,7 @@ ratelim==0.1.6
 redis==3.2.1
 requests==2.21.0
 requests-oauthlib==1.2.0
-rowingdata==2.5.4
+rowingdata==2.5.5
 rowingphysics==0.5.0
 rq==0.13.0
 scipy==1.2.1
@@ -179,7 +188,9 @@ terminado==0.8.1
 terminaltables==3.1.0
 testpath==0.4.2
 text-unidecode==1.2
+thrift==0.11.0
 timezonefinder==4.0.1
+toolz==0.10.0
 tornado==6.0.1
 tqdm==4.31.1
 traitlets==4.3.2
@@ -196,3 +207,4 @@ xlrd==1.2.0
 xmltodict==0.12.0
 yamjam==0.1.7
 yamllint==1.15.0
+yuicompressor==2.4.8
--- a/rowers/dataprep.py
+++ b/rowers/dataprep.py
@@ -4,7 +4,6 @@ from __future__ import print_function
 from __future__ import unicode_literals


-
 # All the data preparation, data cleaning and data mangling should
 # be defined here
 from __future__ import unicode_literals, absolute_import
@@ -26,6 +25,8 @@ from rowers.tasks import handle_sendemail_unrecognized
 from rowers.tasks import handle_zip_file

 from pandas import DataFrame, Series
+import dask.dataframe as dd
+from dask.delayed import delayed

 from django.utils import timezone
 from django.utils.timezone import get_current_timezone
@@ -349,7 +350,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
    # clean data remove zeros and negative values

    # bring metrics which have negative values to positive domain
-    if datadf.empty:
+    if len(datadf)==0:
        return datadf
    try:
        datadf['catch'] = -datadf['catch']
@@ -1771,8 +1772,31 @@ def getrowdata_db(id=0, doclean=False, convertnewtons=True,

 # Fetch a subset of the data from the DB

+def getsmallrowdata_db(columns, ids=[], doclean=True,workstrokesonly=True):
+    prepmultipledata(ids)

-def getsmallrowdata_db(columns, ids=[], doclean=True, workstrokesonly=True):
+    csvfilenames = ['media/strokedata_{id}.parquet'.format(id=id) for id in ids]
+    data = []
+    columns = [c for c in columns if c != 'None']
+
+    for f in csvfilenames:
+        df = dd.read_parquet(f,columns=columns,engine='pyarrow')
+        data.append(df)
+
+    df = dd.concat(data,axis=0)
+
+    data = df.compute()
+    data = data.loc[:,~data.columns.duplicated()]
+    extracols = []
+    if doclean:
+        data = clean_df_stats(data, ignorehr=True,
+                              workstrokesonly=workstrokesonly)
+        data.dropna(axis=1,how='all',inplace=True)
+        data.dropna(axis=0,how='any',inplace=True)
+
+    return data
+
+def getsmallrowdata_db_old(columns, ids=[], doclean=True, workstrokesonly=True):
    prepmultipledata(ids)
    data,extracols = read_cols_df_sql(ids, columns)
    if extracols and len(ids)==1:
@@ -1850,31 +1874,20 @@ def getrowdata(id=0):
 # safety net for programming errors elsewhere in the app
 # Also used heavily when I moved from CSV file only to CSV+Stroke data

+import glob

 def prepmultipledata(ids, verbose=False):
-    query = sa.text('SELECT DISTINCT workoutid FROM strokedata')
-    engine = create_engine(database_url, echo=False)
+    filenames = glob.glob('media/*.parquet')
+    ids = [id for id in ids if 'media/strokedata_{id}.parquet'.format(id=id) not in filenames]

-    with engine.connect() as conn, conn.begin():
-        res = conn.execute(query)
-        res = list(itertools.chain.from_iterable(res.fetchall()))
-    conn.close()
-    engine.dispose()
-
-    try:
-        ids2 = [int(id) for id in ids]
-    except ValueError:
-        ids2 = ids
-
-    res = list(set(ids2) - set(res))
-    for id in res:
+    for id in ids:
        rowdata, row = getrowdata(id=id)
        if verbose:
            print(id)
        if rowdata and len(rowdata.df):
            data = dataprep(rowdata.df, id=id, bands=True,
                            barchart=True, otwpower=True)
-    return res
+    return ids

 # Read a set of columns for a set of workout ids, returns data as a
 # pandas dataframe
@@ -2292,19 +2305,6 @@ def dataprep(rowdatadf, id=0, bands=True, barchart=True, otwpower=True,
    except KeyError:
        rowdatadf[' ElapsedTime (sec)'] = rowdatadf['TimeStamp (sec)']

-    if barchart:
-        # time increments for bar chart
-        time_increments = rowdatadf.loc[:, ' ElapsedTime (sec)'].diff()
-        try:
-            time_increments.iloc[0] = time_increments.iloc[1]
-        except (KeyError, IndexError):
-            time_increments.iloc[0] = 1.
-            
-        time_increments = 0.5 * time_increments + 0.5 * np.abs(time_increments)
-        x_right = (t2 + time_increments.apply(lambda x: timedeltaconv(x)))
-
-        data['x_right'] = x_right
-
    if empower:
        try:
            wash = rowdatadf.loc[:, 'wash']
@@ -2441,12 +2441,10 @@ def dataprep(rowdatadf, id=0, bands=True, barchart=True, otwpower=True,
    # write data if id given
    if id != 0:
        data['workoutid'] = id
+        filename = 'media/strokedata_{id}.parquet'.format(id=id)
+#        df = dd.from_pandas(data,npartitions=1)
+        data.to_parquet(filename,engine='pyarrow')

-        engine = create_engine(database_url, echo=False)
-        with engine.connect() as conn, conn.begin():
-            data.to_sql('strokedata', engine, if_exists='append', index=False)
-        conn.close()
-        engine.dispose()
    return data