Example: Use of Upper and Lower bound as error estimation

This example is an extension to Example: Use of CMRC with different settings where we will prove how the upper and lower bound of the loss are an unbiased estimator of the error. The models are trained with different number of cases ranging from 10% to 80% of the data and then are tested with 20% of the samples. The graphs show how in most of the cases the error is between those bounds which proves the potential of this feature of the MRCs. The results are for a MRC(phi = 'fourier', loss = '0-1', s = 1)

Note

Note that there is an additional dataset related to COVID-19 patients that is available upon requesting to HM Hospitales here. More information about this dataset can be found in the COVID example

# Import needed modules
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.model_selection import RepeatedStratifiedKFold

from MRCpy import MRC
from MRCpy.datasets import *


sns.set_style("whitegrid")
sns.set_context("paper")
warnings.filterwarnings("ignore")


def load_covid(norm=False, array=True):
    data_consensus = pd.read_csv("data/data_consensus.csv", sep=";")

    variable_dict = {
        "CD0000AGE": "Age",
        "CORE": "PATIENT_ID",
        "CT000000U": "Urea",
        "CT00000BT": "Bilirubin",
        "CT00000NA": "Sodium",
        "CT00000TP": "Proth_time",
        "CT0000COM": "Com",
        "CT0000LDH": "LDH",
        "CT0000NEU": "Neutrophils",
        "CT0000PCR": "Pro_C_Rea",
        "CT0000VCM": "Med_corp_vol",
        "CT000APTT": "Ceph_time",
        "CT000CHCM": "Mean_corp_Hgb",
        "CT000EOSP": "Eosinophils%",
        "CT000LEUC": "Leukocytes",
        "CT000LINP": "Lymphocytes%",
        "CT000NEUP": "Neutrophils%",
        "CT000PLAQ": "Platelet_count",
        "CTHSDXXRATE": "Rate",
        "CTHSDXXSAT": "Sat",
        "ED0DISWHY": "Status",
        "F_INGRESO/ADMISSION_D_ING/INPAT": "Fecha_admision",
        "SEXO/SEX": "Sexo",
    }
    data_consensus = data_consensus.rename(columns=variable_dict)
    if norm:
        x_consensus = data_consensus[
            data_consensus.columns.difference(["Status", "PATIENT_ID"])
        ][:]
        std_scale = preprocessing.StandardScaler().fit(x_consensus)
        x_consensus_std = std_scale.transform(x_consensus)
        dataframex_consensus = pd.DataFrame(
            x_consensus_std, columns=x_consensus.columns
        )
        data_consensus.reset_index(drop=True, inplace=True)
        data_consensus = pd.concat(
            [dataframex_consensus, data_consensus[["Status"]]], axis=1
        )
    data_consensus = data_consensus[data_consensus.columns.difference(
        ["PATIENT_ID"])]
    X = data_consensus[data_consensus.columns.difference(
        ["Status", "PATIENT_ID"])]
    y = data_consensus["Status"]
    if array:
        X = X.to_numpy()
        y = y.to_numpy()
    return X, y


def getUpperLowerdf(train_size, X, y, cv, paramsMRC, smote=True):
    """
    Parameters
    ----------
    train_size : array
        Array of different training sizes to train the model.
    cv : CrossValidator
        Cross validator.
    paramsMRC : TYPE
        Parameters for the MRCs.
    smote : Bool, optional
        Class imbalance corrector, set to false to disable. The default is
        True.
    Returns
    -------
    table : dataFrame
        Dataframe with the results of the training for each training size.

    """
    if smote:
        smotefit = SMOTE(sampling_strategy="auto")
        X, y = smotefit.fit_resample(X, y)
    table = pd.DataFrame()
    for train_set in train_size:
        for j, (train_index, test_index) in enumerate(cv.split(X, y)):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            random_indices = np.random.choice(
                X_train.shape[0], size=int(X.shape[0] * train_set),
                replace=False,
            )
            X_train = X_train[random_indices, :]
            y_train = y_train[random_indices]
            std_scale = preprocessing.StandardScaler().fit(X_train, y_train)
            X_train = std_scale.transform(X_train)
            X_test = std_scale.transform(X_test)
            start_time = time.time()
            MRC_model = MRC(phi="fourier", s=1, **
                            paramsMRC).fit(X_train, y_train)
            train_time = time.time() - start_time
            auxtable = pd.DataFrame(
                columns=["Error", "Upper", "Lower", "iteration", "train_size",
                         "Time", ],
                index=range(0, 1),
            )
            auxtable["train_size"] = train_set
            auxtable["iteration"] = j
            auxtable["Error"] = 1 - MRC_model.score(X_test, y_test)
            auxtable["Time"] = train_time
            auxtable["Upper"] = MRC_model.get_upper_bound()
            auxtable["Lower"] = MRC_model.get_lower_bound()

            table = table.append(auxtable, ignore_index=True)
    return table


# Data sets
loaders = [
    load_mammographic,
    load_haberman,
    load_indian_liver,
    load_diabetes,
    load_credit,
    load_covid,
]

dataName = [
    "mammographic",
    "haberman",
    "indian_liver",
    "diabetes",
    "credit",
    "COVID",
]
paramsMRC = {
    "deterministic": False,
    "fit_intercept": False,
    "use_cvx": True,
    "loss": "0-1",
}
train = np.arange(0.1, 0.81, 0.1)

Cross test validation

5 fold repeated Stratified Cross validation is performed where each of the fold is trained with 80% of the data and then tested with the remaining 20%

n_splits = 5
n_repeats = 10
cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats,
                             random_state=1)

Results

We will present the results for the 6 datasets. For more information about the dataset refer to the MRCpy documentation of the loaders. In the results we can see how the upper and lower bounds get closer when the training size is increased. Furthermore, the standard deviation of both bounds is reduced significantly.

Mammographic

X, y = load_mammographic()
table = getUpperLowerdf(train, X, y, cv, paramsMRC)
# dataframes.append(table)
# plotUpperLower(table)
means = table[table.columns.difference(["iteration"])].groupby(
    "train_size").mean()
std = table[table.columns.difference(["iteration"])].groupby(
    "train_size").std()
for column in means.columns:
    means[column] = (
        means[column].round(3).astype(str) + " ± " + std[column].round(
            3).astype(str)
    )
means[["Error", "Upper", "Lower", "Time"]]

	Error	Upper	Lower	Time
train_size
0.1	0.206 ± 0.028	0.214 ± 0.031	0.104 ± 0.03	0.189 ± 0.012
0.2	0.195 ± 0.027	0.216 ± 0.025	0.134 ± 0.027	0.336 ± 0.016
0.3	0.2 ± 0.032	0.215 ± 0.017	0.145 ± 0.02	0.482 ± 0.029
0.4	0.202 ± 0.031	0.215 ± 0.018	0.154 ± 0.022	0.579 ± 0.027
0.5	0.191 ± 0.032	0.214 ± 0.015	0.158 ± 0.019	0.672 ± 0.029
0.6	0.201 ± 0.038	0.22 ± 0.014	0.168 ± 0.017	0.782 ± 0.033
0.7	0.201 ± 0.03	0.217 ± 0.013	0.171 ± 0.016	0.877 ± 0.028
0.8	0.206 ± 0.03	0.219 ± 0.012	0.175 ± 0.013	0.965 ± 0.043

fig, ax = plt.subplots()
sns.lineplot(data=table, x="train_size", y="Error", label="Test Error", ax=ax)
sns.lineplot(
    data=table,
    x="train_size",
    y="Upper",
    color="red",
    label="Upper bound",
    linestyle="dotted",
    ax=ax,
)
sns.lineplot(
    data=table,
    x="train_size",
    y="Lower",
    color="green",
    label="Lower bound",
    linestyle="dotted",
    ax=ax,
)
plt.suptitle("Mammographic")
plt.show()

Haberman

X, y = load_haberman()
table = getUpperLowerdf(train, X, y, cv, paramsMRC)
means = table[table.columns.difference(
    ["iteration"])].groupby("train_size").mean()
std = table[table.columns.difference(
    ["iteration"])].groupby("train_size").std()
for column in means.columns:
    means[column] = (
        means[column].round(3).astype(
            str) + " ± " + std[column].round(3).astype(str)
    )
means[["Error", "Upper", "Lower", "Time"]]

	Error	Upper	Lower	Time
train_size
0.1	0.414 ± 0.061	0.36 ± 0.037	0.139 ± 0.053	0.101 ± 0.005
0.2	0.384 ± 0.057	0.376 ± 0.026	0.206 ± 0.042	0.171 ± 0.007
0.3	0.385 ± 0.045	0.376 ± 0.019	0.241 ± 0.03	0.251 ± 0.008
0.4	0.369 ± 0.044	0.381 ± 0.017	0.254 ± 0.026	0.319 ± 0.01
0.5	0.36 ± 0.045	0.375 ± 0.013	0.255 ± 0.022	0.388 ± 0.017
0.6	0.351 ± 0.041	0.379 ± 0.011	0.266 ± 0.019	0.467 ± 0.017
0.7	0.35 ± 0.048	0.377 ± 0.009	0.272 ± 0.016	0.514 ± 0.02
0.8	0.357 ± 0.042	0.378 ± 0.007	0.278 ± 0.011	0.57 ± 0.023

fig, ax = plt.subplots()
sns.lineplot(data=table, x="train_size", y="Error", label="Test Error", ax=ax)
sns.lineplot(
    data=table,
    x="train_size",
    y="Upper",
    color="red",
    label="Upper bound",
    linestyle="dotted",
    ax=ax,
)
sns.lineplot(
    data=table,
    x="train_size",
    y="Lower",
    color="green",
    label="Lower bound",
    linestyle="dotted",
    ax=ax,
)
plt.suptitle("Haberman")
plt.show()

Indian liver

X, y = load_indian_liver()

table = getUpperLowerdf(train, X, y, cv, paramsMRC)
means = table[table.columns.difference(
    ["iteration"])].groupby("train_size").mean()
std = table[table.columns.difference(
    ["iteration"])].groupby("train_size").std()
for column in means.columns:
    means[column] = (
        means[column].round(3).astype(str) + " ± " +
        std[column].round(3).astype(str)
    )
means[["Error", "Upper", "Lower", "Time"]]

	Error	Upper	Lower	Time
train_size
0.1	0.383 ± 0.036	0.344 ± 0.03	0.163 ± 0.031	0.159 ± 0.005
0.2	0.367 ± 0.035	0.358 ± 0.024	0.223 ± 0.029	0.299 ± 0.011
0.3	0.356 ± 0.038	0.364 ± 0.015	0.252 ± 0.02	0.438 ± 0.025
0.4	0.353 ± 0.034	0.361 ± 0.012	0.266 ± 0.015	0.539 ± 0.018
0.5	0.341 ± 0.035	0.36 ± 0.012	0.273 ± 0.015	0.668 ± 0.02
0.6	0.348 ± 0.031	0.361 ± 0.009	0.279 ± 0.01	0.805 ± 0.023
0.7	0.344 ± 0.035	0.361 ± 0.007	0.284 ± 0.008	0.941 ± 0.028
0.8	0.346 ± 0.033	0.36 ± 0.006	0.288 ± 0.007	1.062 ± 0.028

fig, ax = plt.subplots()
sns.lineplot(data=table, x="train_size", y="Error", label="Test Error", ax=ax)
sns.lineplot(
    data=table,
    x="train_size",
    y="Upper",
    color="red",
    label="Upper bound",
    linestyle="dotted",
    ax=ax,
)
sns.lineplot(
    data=table,
    x="train_size",
    y="Lower",
    color="green",
    label="Lower bound",
    linestyle="dotted",
    ax=ax,
)
plt.suptitle("Indian Liver")
plt.show()

diabetes

X, y = load_diabetes()

table = getUpperLowerdf(train, X, y, cv, paramsMRC)
means = table[table.columns.difference(
    ["iteration"])].groupby("train_size").mean()
std = table[table.columns.difference(
    ["iteration"])].groupby("train_size").std()
for column in means.columns:
    means[column] = (
        means[column].round(3).astype(str) + " ± " +
        std[column].round(3).astype(str)
    )
means[["Error", "Upper", "Lower", "Time"]]

	Error	Upper	Lower	Time
train_size
0.1	0.332 ± 0.034	0.292 ± 0.027	0.093 ± 0.032	0.187 ± 0.006
0.2	0.31 ± 0.037	0.302 ± 0.021	0.173 ± 0.029	0.356 ± 0.012
0.3	0.3 ± 0.038	0.309 ± 0.013	0.201 ± 0.016	0.537 ± 0.02
0.4	0.298 ± 0.032	0.31 ± 0.013	0.217 ± 0.012	0.654 ± 0.021
0.5	0.299 ± 0.033	0.311 ± 0.01	0.23 ± 0.013	0.816 ± 0.023
0.6	0.296 ± 0.029	0.311 ± 0.008	0.236 ± 0.011	0.981 ± 0.03
0.7	0.295 ± 0.031	0.31 ± 0.006	0.238 ± 0.008	1.152 ± 0.048
0.8	0.301 ± 0.033	0.31 ± 0.005	0.24 ± 0.008	1.299 ± 0.031

fig, ax = plt.subplots()
sns.lineplot(data=table, x="train_size", y="Error", label="Test Error", ax=ax)
sns.lineplot(
    data=table,
    x="train_size",
    y="Upper",
    color="red",
    label="Upper bound",
    linestyle="dotted",
    ax=ax,
)
sns.lineplot(
    data=table,
    x="train_size",
    y="Lower",
    color="green",
    label="Lower bound",
    linestyle="dotted",
    ax=ax,
)
plt.suptitle("Diabetes")
plt.show()

credit

X, y = load_credit()

table = getUpperLowerdf(train, X, y, cv, paramsMRC)
means = table[table.columns.difference(
    ["iteration"])].groupby("train_size").mean()
std = table[table.columns.difference(
    ["iteration"])].groupby("train_size").std()
for column in means.columns:
    means[column] = (
        means[column].round(3).astype(str) + " ± " +
        std[column].round(3).astype(str)
    )
means[["Error", "Upper", "Lower", "Time"]]

	Error	Upper	Lower	Time
train_size
0.1	0.225 ± 0.032	0.203 ± 0.024	0.016 ± 0.014	0.147 ± 0.006
0.2	0.195 ± 0.03	0.207 ± 0.017	0.049 ± 0.017	0.288 ± 0.008
0.3	0.182 ± 0.024	0.211 ± 0.016	0.073 ± 0.018	0.414 ± 0.014
0.4	0.176 ± 0.026	0.205 ± 0.014	0.082 ± 0.016	0.525 ± 0.02
0.5	0.18 ± 0.023	0.205 ± 0.011	0.095 ± 0.013	0.653 ± 0.022
0.6	0.174 ± 0.022	0.202 ± 0.008	0.099 ± 0.008	0.766 ± 0.02
0.7	0.173 ± 0.029	0.2 ± 0.008	0.105 ± 0.008	0.891 ± 0.028
0.8	0.174 ± 0.026	0.198 ± 0.008	0.11 ± 0.007	1.01 ± 0.026

fig, ax = plt.subplots()
sns.lineplot(data=table, x="train_size", y="Error", label="Test Error", ax=ax)
sns.lineplot(
    data=table,
    x="train_size",
    y="Upper",
    color="red",
    label="Upper bound",
    linestyle="dotted",
    ax=ax,
)
sns.lineplot(
    data=table,
    x="train_size",
    y="Lower",
    color="green",
    label="Lower bound",
    linestyle="dotted",
    ax=ax,
)
plt.suptitle("Credit")
plt.show()

COVID

table = pd.read_csv('data/table.csv')
means = table[table.columns.difference(
    ["iteration"])].groupby("train_size").mean()
std = table[table.columns.difference(
    ["iteration"])].groupby("train_size").std()
for column in means.columns:
    means[column] = (
        means[column].round(3).astype(str) + " ± " +
        std[column].round(3).astype(str)
    )
means[["Error", "Upper", "Lower", "Time"]]

	Error	Upper	Lower	Time
train_size
0.1	0.259 ± 0.019	0.273 ± 0.012	0.113 ± 0.019	0.531 ± 0.033
0.2	0.256 ± 0.017	0.275 ± 0.009	0.154 ± 0.013	1.028 ± 0.03
0.3	0.258 ± 0.018	0.278 ± 0.006	0.174 ± 0.009	1.518 ± 0.047
0.4	0.253 ± 0.016	0.277 ± 0.007	0.182 ± 0.01	2.034 ± 0.053
0.5	0.253 ± 0.016	0.278 ± 0.005	0.19 ± 0.007	2.659 ± 0.078
0.6	0.256 ± 0.017	0.277 ± 0.005	0.192 ± 0.006	3.121 ± 0.118
0.7	0.256 ± 0.019	0.277 ± 0.005	0.196 ± 0.005	3.836 ± 0.153
0.8	0.252 ± 0.015	0.276 ± 0.005	0.198 ± 0.006	4.144 ± 0.11

fig, ax = plt.subplots()
sns.lineplot(data=table, x="train_size", y="Error", label="Test Error", ax=ax)
sns.lineplot(
    data=table,
    x="train_size",
    y="Upper",
    color="red",
    label="Upper bound",
    linestyle="dotted",
    ax=ax,
)
sns.lineplot(
    data=table,
    x="train_size",
    y="Lower",
    color="green",
    label="Lower bound",
    linestyle="dotted",
    ax=ax,
)
plt.suptitle("COVID")
plt.show()

Total running time of the script: ( 38 minutes 39.470 seconds)

Gallery generated by Sphinx-Gallery