Private
Public Access
1
0
This commit is contained in:
Sander Roosendaal
2019-10-22 21:34:03 +02:00
parent 6de6a0dae1
commit 7f2c68a903
2 changed files with 50 additions and 40 deletions

View File

@@ -19,6 +19,7 @@ certifi==2019.3.9
cffi==1.12.2 cffi==1.12.2
chardet==3.0.4 chardet==3.0.4
Click==7.0 Click==7.0
cloudpickle==1.2.2
colorama==0.4.1 colorama==0.4.1
colorclass==2.2.0 colorclass==2.2.0
cookies==2.2.1 cookies==2.2.1
@@ -27,7 +28,7 @@ coreschema==0.0.4
coverage==4.5.3 coverage==4.5.3
cryptography==2.6.1 cryptography==2.6.1
cycler==0.10.0 cycler==0.10.0
dask==1.1.4 dask==2.6.0
decorator==4.4.0 decorator==4.4.0
defusedxml==0.5.0 defusedxml==0.5.0
Django==2.1.7 Django==2.1.7
@@ -39,7 +40,7 @@ django-cookie-law==2.0.1
django-cors-headers==2.5.2 django-cors-headers==2.5.2
django-countries==5.3.3 django-countries==5.3.3
django-datetime-widget==0.9.3 django-datetime-widget==0.9.3
django-debug-toolbar==1.11 django-debug-toolbar==2.0
django-extensions==2.1.6 django-extensions==2.1.6
django-htmlmin==0.11.0 django-htmlmin==0.11.0
django-leaflet==0.24.0 django-leaflet==0.24.0
@@ -64,8 +65,10 @@ entrypoints==0.3
execnet==1.5.0 execnet==1.5.0
factory-boy==2.11.1 factory-boy==2.11.1
Faker==1.0.4 Faker==1.0.4
fastparquet==0.3.2
fitparse==1.1.0 fitparse==1.1.0
Flask==1.0.2 Flask==1.0.2
fsspec==0.5.2
future==0.17.1 future==0.17.1
geocoder==1.38.1 geocoder==1.38.1
geos==0.2.1 geos==0.2.1
@@ -74,6 +77,7 @@ html5lib==1.0.1
htmlmin==0.1.12 htmlmin==0.1.12
HTMLParser==0.0.2 HTMLParser==0.0.2
httplib2==0.12.1 httplib2==0.12.1
hvplot==0.4.0
icalendar==4.0.3 icalendar==4.0.3
idna==2.8 idna==2.8
image==1.5.27 image==1.5.27
@@ -99,10 +103,12 @@ jupyterlab-server==0.3.0
keyring==18.0.0 keyring==18.0.0
kiwisolver==1.0.1 kiwisolver==1.0.1
kombu==4.5.0 kombu==4.5.0
llvmlite==0.30.0
lxml==4.3.2 lxml==4.3.2
Markdown==3.0.1 Markdown==3.0.1
MarkupSafe==1.1.1 MarkupSafe==1.1.1
matplotlib==3.0.3 matplotlib==3.0.3
minify==0.1.4
MiniMockTest==0.5 MiniMockTest==0.5
mistune==0.8.4 mistune==0.8.4
mock==2.0.0 mock==2.0.0
@@ -111,9 +117,11 @@ mpld3==0.3
mysqlclient==1.4.2.post1 mysqlclient==1.4.2.post1
nbconvert==5.4.1 nbconvert==5.4.1
nbformat==4.4.0 nbformat==4.4.0
newrelic==5.2.1.129
nose==1.3.7 nose==1.3.7
nose-parameterized==0.6.0 nose-parameterized==0.6.0
notebook==5.7.6 notebook==5.7.6
numba==0.46.0
numpy==1.16.2 numpy==1.16.2
oauth2==1.9.0.post1 oauth2==1.9.0.post1
oauthlib==3.0.1 oauthlib==3.0.1
@@ -135,6 +143,7 @@ prompt-toolkit==2.0.9
psycopg2==2.8.1 psycopg2==2.8.1
ptyprocess==0.6.0 ptyprocess==0.6.0
py==1.8.0 py==1.8.0
pyarrow==0.15.0
pycparser==2.19 pycparser==2.19
Pygments==2.3.1 Pygments==2.3.1
pyparsing==2.3.1 pyparsing==2.3.1
@@ -160,7 +169,7 @@ ratelim==0.1.6
redis==3.2.1 redis==3.2.1
requests==2.21.0 requests==2.21.0
requests-oauthlib==1.2.0 requests-oauthlib==1.2.0
rowingdata==2.5.4 rowingdata==2.5.5
rowingphysics==0.5.0 rowingphysics==0.5.0
rq==0.13.0 rq==0.13.0
scipy==1.2.1 scipy==1.2.1
@@ -179,7 +188,9 @@ terminado==0.8.1
terminaltables==3.1.0 terminaltables==3.1.0
testpath==0.4.2 testpath==0.4.2
text-unidecode==1.2 text-unidecode==1.2
thrift==0.11.0
timezonefinder==4.0.1 timezonefinder==4.0.1
toolz==0.10.0
tornado==6.0.1 tornado==6.0.1
tqdm==4.31.1 tqdm==4.31.1
traitlets==4.3.2 traitlets==4.3.2
@@ -196,3 +207,4 @@ xlrd==1.2.0
xmltodict==0.12.0 xmltodict==0.12.0
yamjam==0.1.7 yamjam==0.1.7
yamllint==1.15.0 yamllint==1.15.0
yuicompressor==2.4.8

View File

@@ -4,7 +4,6 @@ from __future__ import print_function
from __future__ import unicode_literals from __future__ import unicode_literals
# All the data preparation, data cleaning and data mangling should # All the data preparation, data cleaning and data mangling should
# be defined here # be defined here
from __future__ import unicode_literals, absolute_import from __future__ import unicode_literals, absolute_import
@@ -26,6 +25,8 @@ from rowers.tasks import handle_sendemail_unrecognized
from rowers.tasks import handle_zip_file from rowers.tasks import handle_zip_file
from pandas import DataFrame, Series from pandas import DataFrame, Series
import dask.dataframe as dd
from dask.delayed import delayed
from django.utils import timezone from django.utils import timezone
from django.utils.timezone import get_current_timezone from django.utils.timezone import get_current_timezone
@@ -349,7 +350,7 @@ def clean_df_stats(datadf, workstrokesonly=True, ignorehr=True,
# clean data remove zeros and negative values # clean data remove zeros and negative values
# bring metrics which have negative values to positive domain # bring metrics which have negative values to positive domain
if datadf.empty: if len(datadf)==0:
return datadf return datadf
try: try:
datadf['catch'] = -datadf['catch'] datadf['catch'] = -datadf['catch']
@@ -1771,9 +1772,32 @@ def getrowdata_db(id=0, doclean=False, convertnewtons=True,
# Fetch a subset of the data from the DB # Fetch a subset of the data from the DB
def getsmallrowdata_db(columns, ids=None, doclean=True, workstrokesonly=True):
    """Load a subset of stroke-data columns for a set of workouts.

    The data is read from the per-workout parquet files
    ``media/strokedata_{id}.parquet``. ``prepmultipledata`` is called first
    so that missing files get a chance to be generated.

    Args:
        columns: Names of the columns to read. Entries equal to the string
            ``'None'`` are dropped before reading.
        ids: Workout ids to load. ``None`` or an empty sequence yields an
            empty DataFrame.
        doclean: When true, the combined frame is passed through
            ``clean_df_stats`` and rows/columns with missing values are
            dropped.
        workstrokesonly: Forwarded to ``clean_df_stats``.

    Returns:
        A pandas DataFrame holding the requested columns of all workouts
        that have a parquet file, concatenated row-wise with duplicated
        column names removed.
    """
    import os

    ids = [] if ids is None else list(ids)
    if not ids:
        # dd.concat refuses an empty list; hand back an empty frame instead.
        return DataFrame()

    prepmultipledata(ids)

    columns = [c for c in columns if c != 'None']
    filenames = [
        'media/strokedata_{id}.parquet'.format(id=workoutid)
        for workoutid in ids
    ]

    # prepmultipledata skips workouts whose row data is empty, so a file may
    # still be absent at this point; only read the ones that exist.
    frames = [
        dd.read_parquet(filename, columns=columns, engine='pyarrow')
        for filename in filenames
        if os.path.exists(filename)
    ]
    if not frames:
        return DataFrame()

    data = dd.concat(frames, axis=0).compute()
    data = data.loc[:, ~data.columns.duplicated()]

    if doclean:
        data = clean_df_stats(data, ignorehr=True,
                              workstrokesonly=workstrokesonly)
        # NOTE(review): the reviewed text had lost its indentation; these two
        # dropna calls are assumed to belong to the doclean branch - confirm.
        data.dropna(axis=1, how='all', inplace=True)
        data.dropna(axis=0, how='any', inplace=True)

    return data
def getsmallrowdata_db_old(columns, ids=[], doclean=True, workstrokesonly=True):
prepmultipledata(ids)
data,extracols = read_cols_df_sql(ids, columns) data,extracols = read_cols_df_sql(ids, columns)
if extracols and len(ids)==1: if extracols and len(ids)==1:
w = Workout.objects.get(id=ids[0]) w = Workout.objects.get(id=ids[0])
@@ -1850,31 +1874,20 @@ def getrowdata(id=0):
# safety net for programming errors elsewhere in the app # safety net for programming errors elsewhere in the app
# Also used heavily when I moved from CSV file only to CSV+Stroke data # Also used heavily when I moved from CSV file only to CSV+Stroke data
import glob
def prepmultipledata(ids, verbose=False):
    """Make sure a stroke-data parquet file exists for each workout id.

    Safety net for programming errors elsewhere in the app: every id whose
    ``media/strokedata_{id}.parquet`` file is missing is re-processed with
    ``getrowdata`` and ``dataprep``.

    Args:
        ids: Iterable of workout ids to check.
        verbose: When true, print each id that is being re-processed.

    Returns:
        List of the ids whose parquet file was missing when this function
        was called (whether or not a file could be generated for them).
    """
    import os

    # Test each path directly rather than comparing against glob output:
    # glob returns OS-native separators and costs a linear scan per id.
    missing = [
        workoutid for workoutid in ids
        if not os.path.exists(
            'media/strokedata_{id}.parquet'.format(id=workoutid)
        )
    ]

    for workoutid in missing:
        rowdata, _ = getrowdata(id=workoutid)
        if verbose:
            print(workoutid)
        if rowdata and len(rowdata.df):
            # Called for its side effect only; the returned frame is not
            # needed here.
            dataprep(rowdata.df, id=workoutid, bands=True,
                     barchart=True, otwpower=True)

    return missing
# Read a set of columns for a set of workout ids, returns data as a # Read a set of columns for a set of workout ids, returns data as a
# pandas dataframe # pandas dataframe
@@ -2292,19 +2305,6 @@ def dataprep(rowdatadf, id=0, bands=True, barchart=True, otwpower=True,
except KeyError: except KeyError:
rowdatadf[' ElapsedTime (sec)'] = rowdatadf['TimeStamp (sec)'] rowdatadf[' ElapsedTime (sec)'] = rowdatadf['TimeStamp (sec)']
if barchart:
# time increments for bar chart
time_increments = rowdatadf.loc[:, ' ElapsedTime (sec)'].diff()
try:
time_increments.iloc[0] = time_increments.iloc[1]
except (KeyError, IndexError):
time_increments.iloc[0] = 1.
time_increments = 0.5 * time_increments + 0.5 * np.abs(time_increments)
x_right = (t2 + time_increments.apply(lambda x: timedeltaconv(x)))
data['x_right'] = x_right
if empower: if empower:
try: try:
wash = rowdatadf.loc[:, 'wash'] wash = rowdatadf.loc[:, 'wash']
@@ -2441,12 +2441,10 @@ def dataprep(rowdatadf, id=0, bands=True, barchart=True, otwpower=True,
# write data if id given # write data if id given
if id != 0: if id != 0:
data['workoutid'] = id data['workoutid'] = id
filename = 'media/strokedata_{id}.parquet'.format(id=id)
# df = dd.from_pandas(data,npartitions=1)
data.to_parquet(filename,engine='pyarrow')
engine = create_engine(database_url, echo=False)
with engine.connect() as conn, conn.begin():
data.to_sql('strokedata', engine, if_exists='append', index=False)
conn.close()
engine.dispose()
return data return data