'Python SQLAlchemy structure and averaged data retrieval

I have a database using Flask-sqlalchemy and I want to store and retrieve (averaged) temperature and humidity data of a sensor. I've written the class below that stores both together, since I figured it was one sensor and the values are valid for the same timestep they were measured.

My question is, how can I make this performant?

Specifically I found that the retrieval of data takes a long time ~5s for some hundered points. I know that it's programmed very inefficiently - how do I improve this mindfully? The only thing I really need is a function to retrieve measurements in a time-range or in the order that I put them in, which is equivalent and a moving average or blockwise averages. The bulk of the time is consumed by: TempHum.query.filter(TempHum.timestamp>starttime)

The main focus is: how do I efficiently structure the database and how to retrieve (and filter) the data efficiently.

#!/usr/bin/python3

from flask_sqlalchemy import SQLAlchemy
from datetime import datetime, timedelta
from math import floor

db = SQLAlchemy()

# Define a Table in the database
class TempHum(db.Model):
    __tablename__  = 'tempHumMeasurements'
    # Here we define columns for the table
    # Each column is also a Python attribute.
    id = db.Column(db.Integer, primary_key = True)
    timestamp = db.Column(db.DateTime(timezone=True))
    temperature = db.Column('temperature', db.Float)
    humidity = db.Column('humidity', db.Float)
    location = db.Column('location', db.String(32))

    def __init__(self, **kwargs):
        super(TempHum, self).__init__(**kwargs)
        # do custom initialization here
        #now.isoformat(timespec="seconds")
        self.timestamp = datetime.now().replace(microsecond=0)
        self.location = 'room1'

    def __repr__(self):
        return "<TempHum {}>".format(self.timestamp.strftime("%d %m %Y, %H:%M"))

    def getNSamples(n):
        return TempHum.query.\
            order_by(TempHum.timestamp.desc()).\
            limit(n).\
            from_self().order_by(TempHum.timestamp.asc()).all()

    def getSampledNDays(n):
        starttime = datetime.now() - timedelta(days=n)
        meas = TempHum.query.\
            filter(TempHum.timestamp>starttime).\
            order_by(TempHum.timestamp.asc()).all()
        return meas1
    

# ___________
# helper functions

    def mean(lst):
        return sum(lst)/len(lst)


    def averageData(measures):
        mid = floor(len(measures)/2)
        meantemp = TempHum.mean([tmp.temperature for tmp in measures])
        meanhum = TempHum.mean([tmp.humidity for tmp in measures])
        means = TempHum(temperature=meantemp,humidity=meanhum)
        means.timestamp = measures[mid].timestamp
        return means


    def downsample(arr,average_mins):
    # if values have the correct sampling time they are averaged
    # over average_count values, otherwise all data is taken into account
    # arr is an array of TempHum objects that should be downsampled
        average_time = timedelta(minutes=average_mins)
        buffer = []
        output_array = []
        for k,date in enumerate(arr):
            if not buffer: # buffer first one if buffer is empty
                buffer.append(arr[k])
                continue
            timestep = arr[k].timestamp-buffer[0].timestamp
            if timestep<average_time:
                # values within the window
                buffer.append(arr[k])
            else:
                averaged = TempHum.averageData(buffer)
                output_array.append(averaged)
                buffer.clear()
                buffer.append(arr[k])
        return output_array;



class HeaterState(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    timestamp = db.Column(db.DateTime(timezone=True))
    heating = db.Column('heating', db.Boolean)

    def __init__(self, **kwargs):
        super(HeaterState, self).__init__(**kwargs)
        self.timestamp = datetime.now().replace(microsecond=0)

    def __repr__(self):
        return "<HeaterState {}>".format(self.timestamp.strftime("%d %m %Y, %H:%M"))

    def getSampledNDays(n):
        starttime = datetime.now() - timedelta(days=n)
        meas = HeaterState.query.\
            filter(HeaterState.timestamp>starttime).\
            order_by(HeaterState.timestamp.asc()).all()
        return meas


Solution 1:[1]

Have you tried adding an index on the timestamp column?

Something like this:

class TempHum(db.Model):
    ...
    timestamp = db.Column(db.DateTime(timezone=True), db_index=True) 

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Chris Sears