PolyCortex · conorato · Oct 22, 2020 · Oct 6, 2020 · Oct 6, 2020 · Oct 7, 2020
diff --git a/.editorconfig b/.editorconfig
@@ -8,6 +8,9 @@ indent_size = 2
 insert_final_newline = true
 trim_trailing_whitespace = true
 
+[*.py]
+indent_size = 4
+
 [*.md]
 max_line_length = off
 trim_trailing_whitespace = false
diff --git a/ai/feature_extraction.ipynb b/ai/feature_extraction.ipynb
diff --git a/ai/prediction_openbci.ipynb b/ai/prediction_openbci.ipynb
diff --git a/backend/app.py b/backend/app.py
@@ -3,22 +3,37 @@
 from waitress import serve
 from http import HTTPStatus
 
+from classification.file_loading import get_raw_array
+from classification.predict import predict
+from classification.exceptions import ClassificationError
+from classification.config.constants import Sex, ALLOWED_FILE_EXTENSIONS
 
 app = Flask(__name__)
 
 
 def allowed_file(filename):
-    ALLOWED_EXTENSIONS = {'txt', 'csv'}
-    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+    return filename.lower().endswith(ALLOWED_FILE_EXTENSIONS)
 
 
 @app.route("/")
 def status():
     return ""
 
 
-@app.route('/analyze_sleep', methods=['POST'])
+@app.route('/analyze-sleep', methods=['POST'])
 def analyze_sleep():
+    """
+    Request payload example
+    {
+        "file": File(...),
+        "device": "CYTON",
+        "sex": "F",
+        "age": "23",
+        "stream_start": 1602895800000,
+        "bedtime": 1602898320000,
+        "wakeup": 1602931800000
+    }
+    """
     if 'file' not in request.files:
         return 'Missing file', HTTPStatus.BAD_REQUEST
     file = request.files['file']
@@ -29,9 +44,28 @@ def analyze_sleep():
     if not allowed_file(file.filename):
         return 'File format not allowed', HTTPStatus.BAD_REQUEST
 
-    file_content = file.read()
     form_data = request.form.to_dict()
 
+    try:
+        age = int(form_data['age'])
+        sex = Sex[form_data['sex']]
+        stream_start = int(form_data['stream_start'])
+        bedtime = int(form_data['bedtime'])
+        wakeup = int(form_data['wakeup'])
+    except (KeyError, ValueError):
+        return 'Missing or invalid request parameters', HTTPStatus.BAD_REQUEST
+
+    try:
+        raw_array = get_raw_array(file)
+        predict(raw_array, info={
+            'sex': sex,
+            'age': age,
+            'in_bed_seconds': bedtime - stream_start,
+            'out_of_bed_seconds': wakeup - stream_start
+        })
+    except ClassificationError as e:
+        return e.message, HTTPStatus.BAD_REQUEST
+
     with open("assets/mock_response.json", "r") as mock_response_file:
         return mock_response_file.read()
 

diff --git a/backend/assets/mock_response.json b/backend/assets/mock_response.json
@@ -1,5 +1,5 @@
 {
-  "board": "CYTHON",
+  "board": "CYTON",
   "subject": {
     "age": 28,
     "sex": "F"

diff --git a/backend/classification/config/constants.py b/backend/classification/config/constants.py
@@ -0,0 +1,31 @@
+from enum import Enum
+
+
+class Sex(Enum):
+    # Member values are based on the subject description file (see its header):
+    #  https://physionet.org/content/sleep-edfx/1.0.0/SC-subjects.xls
+    F = 1
+    M = 2
+
+
+ALLOWED_FILE_EXTENSIONS = ('.txt', '.csv')  # tuple, as it is given as is to str.endswith
+
+EEG_CHANNELS = [  # feature extraction goes through the channels in this order
+    'EEG Fpz-Cz',
+    'EEG Pz-Oz'
+]
+
+EPOCH_DURATION = 30  # presumably in seconds - TODO confirm
+FILE_MINIMUM_DURATION = EPOCH_DURATION  # i.e. the duration of a single epoch
+
+DATASET_SAMPLE_RATE = 100  # presumably in Hz, as for the rates below - TODO confirm
+OPENBCI_CYTON_SAMPLE_RATE = 250
+OPENBCI_GANGLION_SAMPLE_RATE = 200
+
+AGE_FEATURE_BINS = [  # [min, max] ages, both included; the bin index is the age feature
+    [12, 49],
+    [50, 59],
+    [60, 84],
+    [85, 125]
+]
+ACCEPTED_AGE_RANGE = [AGE_FEATURE_BINS[0][0], AGE_FEATURE_BINS[-1][-1]]  # [12, 125]
diff --git a/backend/classification/exceptions.py b/backend/classification/exceptions.py
@@ -0,0 +1,13 @@
+class ClassificationError(Exception):
+    """Base class for the errors that can occur during classification"""
+    message = "An error occurred while calculating sleep stages."
+
+
+class TimestampsError(ClassificationError):
+    """Raised when timestamps are incoherent or don't fit with the provided file"""
+    message = "Received file, stream start time, bedtime or wakeup time are incoherent"
+
+
+class FileSizeError(ClassificationError):
+    """Raised when the received file is either too big or too small"""
+    message = "Received file is either too big or too small"
diff --git a/backend/classification/features/__init__.py b/backend/classification/features/__init__.py
@@ -0,0 +1,29 @@
+import numpy as np
+
+from classification.features.extraction import (
+    get_eeg_features,
+    get_non_eeg_features,
+)
+
+
+def get_features(signal, info):
+    """Builds the complete feature matrix of a recording
+    Input
+    -------
+    signal: instance of mne.io.RawArray
+        Should contain 2 channels (1: FPZ-CZ, 2: PZ-OZ)
+    info: dict which should contain the following keys
+        - sex: instance of Sex enum
+        - age: indicates the subject's age
+        - in_bed_seconds: timespan, in seconds, between the start
+            of the recording and the moment the subject went to bed
+        - out_of_bed_seconds: timespan, in seconds, between the start
+            of the recording and the moment the subject got out of bed
+
+    Returns
+    -------
+    Array of size (nb_epochs, nb_features): categorical features, then EEG ones
+    """
+    eeg = get_eeg_features(signal, info['in_bed_seconds'], info['out_of_bed_seconds'])
+    categorical = get_non_eeg_features(info['age'], info['sex'], len(eeg))
+    return np.concatenate((categorical, eeg), axis=1)
diff --git a/backend/classification/features/constants.py b/backend/classification/features/constants.py
@@ -0,0 +1,32 @@
+from classification.config.constants import (
+    DATASET_SAMPLE_RATE,
+    EPOCH_DURATION,
+)
+
+NYQUIST_FREQ = DATASET_SAMPLE_RATE / 2  # half of the dataset sample rate
+
+DELTA = "delta"  # names of the frequency subbands, used as keys below
+THETA = "theta"
+ALPHA = "alpha"
+SIGMA = "sigma"
+BETA = "beta"
+
+FREQ_BANDS_RANGE = {  # [fmin, fmax[ of each subband (presumably in Hz - TODO confirm)
+    DELTA: [0.5, 4.5],
+    THETA: [4.5, 8.5],
+    ALPHA: [8.5, 11.5],
+    SIGMA: [11.5, 15.5],
+    BETA: [15.5, 30]
+}
+
+FREQ_BANDS_ORDERS = {  # presumably the order of each subband's filter - TODO confirm
+    DELTA: 5,
+    THETA: 8,
+    ALPHA: 9,
+    SIGMA: 9,
+    BETA: 14
+}
+
+DATASET_HIGH_PASS_FREQ = 0.5  # presumably in Hz - TODO confirm
+HIGH_PASS_FILTER_ORDER = 6
+HIGH_PASS_MAX_RIPPLE_DB = 0.2  # in dB, going by the name - TODO confirm
diff --git a/backend/classification/features/extraction.py b/backend/classification/features/extraction.py
@@ -0,0 +1,63 @@
+"""Feature extraction tools based off a two channel EEG recording"""
+import numpy as np
+
+from classification.config.constants import (
+    EEG_CHANNELS,
+    AGE_FEATURE_BINS,
+)
+from classification.features.pipeline import get_feature_union
+from classification.features.preprocessing import preprocess
+
+
+def get_eeg_features(raw_data, in_bed_seconds, out_of_bed_seconds):
+    """Extracts the continuous features of every EEG channel
+    Input
+    -------
+    raw_data: MNE.Raw object with signals with or without annotations
+    in_bed_seconds: timespan, in seconds, between the start of the
+        recording and the moment the subject went to bed
+    out_of_bed_seconds: timespan, in seconds, between the start of the
+        recording and the moment the subject got out of bed
+
+    Returns
+    -------
+    Array of size (nb_epochs, nb_continuous_features), in which the
+    features of each channel are put side by side, in channel order
+    """
+    extractor = get_feature_union()
+    features_by_channel = []
+
+    for channel in EEG_CHANNELS:
+        preprocessed = preprocess(raw_data, channel, in_bed_seconds, out_of_bed_seconds)
+        channel_features = extractor.transform(preprocessed)
+        nb_epochs, nb_features = channel_features.shape[:2]
+        features_by_channel.append(channel_features)
+        print(
+            f"Done extracting {nb_features} features "
+            f"on {nb_epochs} epochs for {channel}\n"
+        )
+
+    return np.hstack(features_by_channel)
+
+
+def get_non_eeg_features(age, sex, nb_epochs):
+    """Returns the categorical feature matrix
+    Input
+    -------
+    age: Age of the subject; must fall within one of AGE_FEATURE_BINS
+    sex: Sex of the subject (member of the Sex enum)
+    nb_epochs: corresponds to the nb of epochs which will be analyzed.
+
+    Returns
+    -------
+    Array of size (nb_epochs, nb_categorical_features), whose rows all hold
+    the same [sex, age category] values because they concern the same subject.
+    Raises ValueError if the age does not fall within any age bin.
+    """
+    age_category = next((
+        index for index, (age_min, age_max) in enumerate(AGE_FEATURE_BINS)
+        if age_min <= age <= age_max
+    ), None)
+    if age_category is None:  # a bare StopIteration would hide the actual cause
+        raise ValueError(f"Age {age} is outside of the supported age bins")
+    return np.tile([sex.value, age_category], (nb_epochs, 1))
diff --git a/backend/classification/features/pipeline/__init__.py b/backend/classification/features/pipeline/__init__.py
@@ -0,0 +1,19 @@
+from sklearn.pipeline import FeatureUnion
+
+from classification.features.pipeline.time_domain import (
+    get_time_domain_pipeline,
+)
+from classification.features.pipeline.frequency_domain import (
+    get_frequency_domain_pipeline,
+)
+from classification.features.pipeline.time_subband import (
+    get_subband_feature_union,
+)
+
+
+def get_feature_union():
+    """Returns the union of the time, frequency and subband time domain extractors"""
+    time_domain = ('time_domain', get_time_domain_pipeline())
+    frequency_domain = ('frequency_domain', get_frequency_domain_pipeline())
+    subband = ('subband_time_domain', get_subband_feature_union())
+    return FeatureUnion([time_domain, frequency_domain, subband], n_jobs=1)
diff --git a/backend/classification/features/pipeline/frequency_domain.py b/backend/classification/features/pipeline/frequency_domain.py
@@ -0,0 +1,113 @@
+import numpy as np
+from sklearn.pipeline import FeatureUnion, Pipeline
+from sklearn.preprocessing import FunctionTransformer
+
+from classification.features.pipeline.utils import (
+    get_psds_from_epochs,
+)
+from classification.features.constants import (
+    FREQ_BANDS_RANGE,
+)
+
+
+def _get_mean_psds(psds_with_freqs, are_relative=False, freq_bands=None):
+    """EEG power band feature extraction.
+    Input
+    -------
+    psds_with_freqs: tuple which contains
+            - (nb_epochs, nb_chan=1, nb_freqs) psds amplitudes
+            - (nb_freqs,) corresponding frequency values
+    are_relative: boolean which indicates if the mean band powers
+        for each subband are relative to the total power or not.
+    freq_bands: dict which gives the [fmin, fmax[ range of each subband.
+        Defaults to FREQ_BANDS_RANGE.
+
+    Returns
+    -------
+    X : numpy array of shape [nb_epochs, nb_subbands]
+        Mean power of each subband, for each epoch.
+    """
+    psds, freqs = psds_with_freqs[0], psds_with_freqs[1]
+    bands = FREQ_BANDS_RANGE if freq_bands is None else freq_bands
+
+    if are_relative:
+        # Not in place: the feature union shares this array with its other transformers
+        psds = psds / np.sum(psds, axis=-1, keepdims=True)
+
+    return np.concatenate([
+        psds[:, :, (freqs >= fmin) & (freqs < fmax)].mean(axis=-1).reshape(len(psds), -1)
+        for fmin, fmax in bands.values()
+    ], axis=1)
+
+
+def _get_sefd_on_all_epochs(psds_with_freqs):
+    """Spectral edge frequency difference (SEFd) on all epochs
+    Input
+    -------
+    psds_with_freqs: tuple which contains
+            - (nb_epochs, nb_chan=1, nb_freqs) psds amplitudes
+            - (nb_freqs,) corresponding frequency values
+
+    Returns
+    -------
+    List of nb_epochs single item lists. Each item is, for one epoch and
+    over the frequencies from 8 (included) to 16 (excluded), the difference
+    between the frequencies under which cumulates 95 and 50 percent of
+    the power.
+    """
+    SUBBAND_FREQ_SEFD = [8., 16.]
+    CUMUL_POWER_RATIO = [0.50, 0.95]
+
+    psds = psds_with_freqs[0]
+    freqs = psds_with_freqs[1]
+
+    # Only the channel axis is dropped. A bare squeeze() would also drop the
+    # epoch axis of a recording made of a single epoch, which would then
+    # break the subband selection below.
+    psds = psds.reshape(len(psds), psds.shape[-1])
+
+    is_in_subband = (
+        (freqs >= SUBBAND_FREQ_SEFD[0]) & (freqs < SUBBAND_FREQ_SEFD[1])
+    )
+    psds = psds[:, is_in_subband]
+    freqs = freqs[is_in_subband]
+
+    cumul_power = np.cumsum(psds, axis=-1)
+    total_power = np.sum(psds, axis=-1, keepdims=True)
+
+    def get_edge_freqs(power_ratio):
+        """Returns, for each epoch, the first frequency at which
+        the cumulated power reaches the given ratio of the total power"""
+        # argmax gives the index of the first True value of each row
+        return freqs[np.argmax(cumul_power >= power_ratio * total_power, axis=-1)]
+
+    # Each edge is searched on its own, so that an epoch whose power reaches
+    # both ratios within the same frequency bin gets a difference of 0
+    # instead of failing on an undefined lower edge.
+    sefd = get_edge_freqs(CUMUL_POWER_RATIO[1]) - get_edge_freqs(CUMUL_POWER_RATIO[0])
+
+    return sefd[:, np.newaxis].tolist()
+
+
+def get_frequency_domain_pipeline():
+    """Returns the pipeline which extracts the frequency domain features
+
+    Its input first goes through get_psds_from_epochs. The absolute and
+    relative mean power of the subbands, as well as the SEFd, are then
+    extracted from that result and put side by side by the feature union.
+    """
+    def as_transformer(func, **kw_args):
+        # Keyword arguments go through kw_args rather than through a lambda,
+        # because a pipeline holding a lambda cannot be pickled.
+        return FunctionTransformer(func, validate=False, kw_args=kw_args or None)
+
+    return Pipeline([
+        ('get_psds_from_epochs', as_transformer(get_psds_from_epochs)),
+        ('frequency_domain_features', FeatureUnion([
+            ('absolute_mean_power_band', as_transformer(_get_mean_psds)),
+            ('relative_mean_power_band', as_transformer(
+                _get_mean_psds, are_relative=True
+            )),
+            ('sefd', as_transformer(_get_sefd_on_all_epochs))
+        ], n_jobs=1))
+    ])