In [ ]:
# ML prediction pipeline
Model development using different ML methods¶
Model training and validation functions¶
In [23]:
# --- imports ---
import os
import time
# Avoid numba JIT (and its LLVM OpenMP dependency) inside shap
os.environ["SHAP_DISABLE_JIT"] = "1" # shap will use pure-python fallbacks
os.environ["JOBLIB_PROGRESSBAR"] = "0" # turn off joblib's progress bars
# Standard library
import statistics
# Core scientific stack
import numpy as np
import pandas as pd
# Utilities
from tqdm import tqdm # To display progress bars during loops and long computations.
# Scikit-learn with Intel acceleration
from sklearnex import patch_sklearn
patch_sklearn()
# Scikit-learn modules
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# Imbalanced-learn
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import BorderlineSMOTE, SMOTE
# Model interpretation
import shap
# Gradient boosting
from xgboost import XGBClassifier
def _gpu_available():
"""Return True if a CUDA-capable GPU appears available via numba.
Notes
-----
- This is a lightweight check. XGBoost may still error if CUDA runtime is
unavailable or mismatched even when this returns True.
"""
try:
import numba.cuda as _cuda
return _cuda.is_available()
except Exception:
return False
def arraylist2arrayColumn(arrays):
"""Stack a list of 1‑D arrays (possibly different lengths) as columns.
Shorter arrays are **left‑padded with zeros** so that all columns align.
Parameters
----------
arrays : list[np.ndarray]
List of 1‑D arrays with potentially different lengths.
Returns
-------
np.ndarray
2‑D array of shape (max_length, n_arrays) where each input array forms
a column and is left‑padded with zeros to `max_length`.
"""
max_length = max(len(arr) for arr in arrays)
padded_arrays = [np.pad(arr, (max_length - len(arr), 0), constant_values=0) for arr in arrays]
return np.column_stack(padded_arrays)
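# Hedged sanity check (hypothetical arrays): the shorter input is left-padded
# with zeros so both columns share the same length.
_demo_cols = arraylist2arrayColumn([np.array([1.0, 2.0]), np.array([3.0, 4.0, 5.0])])
assert _demo_cols.shape == (3, 2) and (_demo_cols[:, 0] == np.array([0.0, 1.0, 2.0])).all()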
def allocate_feature_list(data, feature_list):
"""Count how often features were selected across folds/iterations.
Parameters
----------
data : pd.DataFrame
Original feature dataframe; used to preserve full feature list/order.
feature_list : list[Iterable[str]]
List where each item is an iterable of selected feature names from one
fold/iteration.
Returns
-------
pd.DataFrame
DataFrame with columns `Feature` and `Count` (selection frequency).
"""
all_features = list(data.columns)
flattened = [f for sub in feature_list for f in sub]
counts = pd.Series(flattened).value_counts()
out = pd.DataFrame({'Feature': all_features})
out['Count'] = out['Feature'].map(counts).fillna(0).astype(int)
return out
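# Hedged example (hypothetical folds): feature "a" selected twice, "b" once.
_demo_freq = allocate_feature_list(pd.DataFrame(columns=["a", "b"]), [["a"], ["a", "b"]])
assert _demo_freq.set_index("Feature")["Count"].to_dict() == {"a": 2, "b": 1}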
def should_apply_pca(x, threshold=1.5):
"""Heuristic to decide if PCA can be applied based on sample/feature ratio. if false, pca is not allowed.
Parameters
----------
x : pd.DataFrame | np.ndarray
Feature matrix.
threshold : float, default 1.5
Apply PCA when n_samples / n_features < `threshold`.
Returns
-------
bool
True if PCA is recommended under the heuristic.
"""
n_samples, n_features = x.shape
return n_samples / n_features < threshold
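# Hedged examples: 10 samples / 20 features (ratio 0.5) triggers PCA;
# 100 samples / 20 features (ratio 5.0) does not.
assert should_apply_pca(np.zeros((10, 20))) is True
assert should_apply_pca(np.zeros((100, 20))) is False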
def _to_2d_array(sv):
"""Normalize SHAP output to shape (n_samples, n_features).
Handles lists (per‑class), 1D vectors, and 3D arrays.
Parameters
----------
sv : array-like | list
Raw SHAP values as returned by various explainers/models.
Returns
-------
np.ndarray
SHAP values as a 2‑D array.
"""
# If SHAP returns a list per class -> take positive class if available
if isinstance(sv, (list, tuple)):
sv = sv[1] if len(sv) > 1 else sv[0]
sv = np.asarray(sv)
# If 3D: (n_samples, n_features, n_outputs). Pick class 1 if available; else 0.
if sv.ndim == 3:
sv = sv[..., 1] if sv.shape[2] > 1 else sv[..., 0]
# If 1D: (n_features,) -> (1, n_features)
if sv.ndim == 1:
sv = sv.reshape(1, -1)
# Now it should be (n_samples, n_features)
return sv
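# Hedged shape checks (hypothetical arrays): per-class lists and 3-D arrays
# both collapse to 2-D, and 1-D vectors become a single row.
assert _to_2d_array([np.zeros((4, 3)), np.ones((4, 3))]).shape == (4, 3)
assert _to_2d_array(np.zeros((4, 3, 2))).shape == (4, 3)
assert _to_2d_array(np.zeros(3)).shape == (1, 3)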
# ---------- model specs ----------
def _get_model_specs(use_gpu=False, n_jobs=5, ratio=1.0):
"""Define available model, grid, selector, and type for the pipeline search.
Parameters
----------
use_gpu : bool, default False
If True, enable GPU‑accelerated XGBoost when available.
n_jobs : int, default 5
Parallelism for estimators that support it.
ratio : float, default 1.0
Passed to XGBoost as `scale_pos_weight` to compensate class imbalance
(usually `neg/pos`).
Returns
-------
dict[str, dict]
Mapping from model name to its specification with keys:
- "clf": estimator instance
- "grid": parameter grid for GridSearchCV
- "selector": feature selector (SelectFromModel)
- "type": one of {"linear", "kernel", "tree"}
"""
rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=200, random_state=42), threshold='median')
specs = {
        # Lasso and ridge are just L1/L2-penalized logistic regressions, so they are left commented out here.
# "lasso": {
# "clf": LogisticRegression(penalty='l1', solver='saga', max_iter=5000),
# "grid": {
# "feature_selection__threshold": ['mean', 'median'],
# "classification__C": [0.01, 0.1, 1.0, 10.0],
# },
# "selector": SelectFromModel(LogisticRegression(penalty='l1', solver='saga', max_iter=5000), threshold='median'),
# "type": "linear"
# },
# "ridge": {
# "clf": LogisticRegression(penalty='l2', solver='lbfgs', max_iter=5000),
# "grid": {
# "feature_selection__threshold": ['mean', 'median'],
# "classification__C": [0.01, 0.1, 1.0, 10.0],
# },
# "selector": SelectFromModel(LogisticRegression(penalty='l2', solver='lbfgs', max_iter=5000), threshold='median'),
# "type": "linear"
# },
"logistic": {
"clf": LogisticRegression(class_weight='balanced', penalty='l2', solver='lbfgs', max_iter=5000),
"grid": {
"feature_selection__threshold": ['median'],
"classification__C": [0.01, 0.1, 1.0, 10.0],
},
"selector": SelectFromModel(LogisticRegression(penalty='l2', solver='lbfgs', max_iter=5000), threshold='median'),
"type": "linear"
},
"svm": {
"clf": SVC(kernel='rbf', class_weight= 'balanced' , probability=True),
"grid": {
"feature_selection__threshold": ['median'],
"classification__C": [0.1, 1],
"classification__gamma": ['auto']
},
"selector": rf_selector,
"type": "kernel"
},
"random_forest": {
"clf": RandomForestClassifier(class_weight='balanced', n_jobs=n_jobs, random_state=42),
"grid": {
"feature_selection__threshold": ['median'],
"classification__n_estimators": [100, 200, 400],
"classification__min_samples_leaf": [1, 2],
"classification__max_features": ['sqrt', 0.5]
},
"selector": SelectFromModel(RandomForestClassifier(n_estimators=200, random_state=42), threshold='median'),
"type": "tree"
},
"decision_tree": {
"clf": DecisionTreeClassifier(class_weight='balanced', random_state=42),
"grid": {
"feature_selection__threshold": ['median'],
"classification__max_depth": [5, 10],
"classification__min_samples_split": [2, 5]
},
"selector": rf_selector,
"type": "tree"
},
"knn": {
"clf": KNeighborsClassifier(),
"grid": {
"feature_selection__threshold": ['median'],
"classification__n_neighbors": [3, 5, 7, 11],
"classification__weights": ['uniform', 'distance'],
"classification__p": [1, 2],
},
"selector": rf_selector,
"type": "kernel"
},
}
xgb_kwargs = dict(
n_estimators=300,
learning_rate=0.05,
subsample=0.9,
colsample_bytree=0.9,
max_depth=4,
reg_lambda=1.0,
reg_alpha=0.0,
eval_metric='logloss',
scale_pos_weight=ratio,
n_jobs=n_jobs,
random_state=42,
# use_label_encoder=False
)
    if use_gpu and _gpu_available():
        xgb_kwargs.update({
            "tree_method": "gpu_hist",      # XGBoost < 2.0 GPU settings
            "predictor": "gpu_predictor"
        })
        # For XGBoost >= 2.0, the equivalent would be:
        # xgb_kwargs.update({
        #     "tree_method": "hist",
        #     "device": "cuda"
        # })
else:
# fast CPU histogram as fallback
xgb_kwargs.update({
"tree_method": "hist",
# "predictor": "auto"
})
specs["xgboost"] = {
"clf": XGBClassifier(**xgb_kwargs),
"grid": {
"feature_selection__threshold": ['median'],
"classification__n_estimators": [200, 400],
"classification__max_depth": [4, 6],
"classification__learning_rate": [0.03, 0.1],
"classification__subsample": [0.8],
"classification__colsample_bytree": [0.8],
},
"selector": SelectFromModel(RandomForestClassifier(n_estimators=200, random_state=42), threshold='median'),
"type": "tree"
}
return specs
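# Hedged sanity check: the spec mapping exposes exactly these model names.
assert set(_get_model_specs().keys()) == {
    "logistic", "svm", "random_forest", "decision_tree", "knn", "xgboost"
}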
# ---------- SHAP utils ----------
def _compute_shap_summary(best_pipeline, x_train, x_test, feature_names, model_type,
bg_max=100, test_max=300):
"""
Compute mean|SHAP| per (selected) feature for the final estimator in best_pipeline.
Returns a DataFrame with columns: Feature, mean_abs_shap.
For SVM/KNN (kernel), uses KernelExplainer with small background/test samples.
"""
"""Compute mean |SHAP| per (selected) feature for the pipeline's classifier.
Parameters
----------
best_pipeline : imblearn.Pipeline
Fitted pipeline with steps [scaler, smote, feature_selection, classification].
x_train, x_test : pd.DataFrame | np.ndarray
Train/test data from the outer fold (before scaling/selection).
feature_names : list[str]
Original feature names for alignment.
model_type : {"tree", "linear", "kernel"}
Determines which SHAP explainer to use.
bg_max : int, default 100
Background sample size for KernelExplainer (used for kernel models).
test_max : int, default 300
Test subsample size for KernelExplainer (used for kernel models).
Returns
-------
pd.DataFrame
Columns: `Feature`, `mean_abs_shap` for **all** original features
(unselected features are filled with 0.0). If SHAP fails, values are NaN.
Notes
-----
- For kernel models (SVM/KNN), full SHAP on all samples is expensive. We
approximate by computing SHAP on a small subset and taking the mean |SHAP|.
- For tree/linear models, TreeExplainer/LinearExplainer is used directly.
"""
# Extract steps
scaler = best_pipeline.named_steps.get('scaler', None)
selector = best_pipeline.named_steps['feature_selection']
est = best_pipeline.named_steps['classification']
# Transform to the estimator's input space (selected features after scaling)
# Xtr = x_train.values if isinstance(x_train, pd.DataFrame) else x_train
# Xte = x_test.values if isinstance(x_test, pd.DataFrame) else x_test
Xtr = x_train if isinstance(x_train, pd.DataFrame) else pd.DataFrame(x_train, columns=feature_names)
Xte = x_test if isinstance(x_test, pd.DataFrame) else pd.DataFrame(x_test, columns=feature_names)
if scaler is not None:
Xtr = scaler.transform(Xtr)
Xte = scaler.transform(Xte)
Xtr_sel = selector.transform(Xtr)
Xte_sel = selector.transform(Xte)
# Map selected column names
support = selector.get_support()
# tqdm.write(f"[{model_type}] selected features: {np.sum(support)} / {len(support)}")
if support is None or np.sum(support) == 0:
sel_names = np.array(feature_names)
else:
sel_names = np.array(feature_names)[support]
# If no features selected (edge case), return NaNs
if Xtr_sel.shape[1] == 0:
return pd.DataFrame({"Feature": feature_names, "mean_abs_shap": np.nan})
# Choose explainer
try:
if model_type == "tree":
explainer = shap.TreeExplainer(est)
shap_vals = explainer.shap_values(Xte_sel)
# binary: list-of-arrays per class; take class 1 if available
if isinstance(shap_vals, list):
# prefer positive class if present
sv = shap_vals[1] if len(shap_vals) > 1 else shap_vals[0]
else:
sv = shap_vals
elif model_type == "linear":
explainer = shap.LinearExplainer(est, Xtr_sel)
sv = explainer.shap_values(Xte_sel)
else: # "kernel" (e.g., SVM, KNN)
# sample small background and test to keep it fast
rng = np.random.default_rng(0)
bg_idx = rng.choice(Xtr_sel.shape[0], size=min(bg_max, Xtr_sel.shape[0]), replace=False)
te_idx = rng.choice(Xte_sel.shape[0], size=min(test_max, Xte_sel.shape[0]), replace=False)
Xbg = Xtr_sel[bg_idx]
Xte_small = Xte_sel[te_idx]
# explain probability of class 1
f = (lambda A: est.predict_proba(A)[:, 1])
explainer = shap.KernelExplainer(f, Xbg) # Kernel SHAP approximates this by replacing missing features with their distribution from the background dataset.
sv_small = explainer.shap_values(Xte_small, nsamples="auto", silent=True)
# expand to full test by mapping means (approx)
sv = np.zeros((Xte_sel.shape[0], Xte_sel.shape[1]))
sv[:] = np.mean(np.abs(sv_small), axis=0) # use mean abs across the small set as proxy
# NOTE: Kernel SHAP exact per-sample is expensive; this proxy summarizes feature impact.
sv = _to_2d_array(sv)
        mean_abs = np.mean(np.abs(sv), axis=0)
df = pd.DataFrame({"Feature": sel_names, "mean_abs_shap": mean_abs})
# ensure all features appear; fill 0 for unselected
out = pd.DataFrame({"Feature": feature_names}).merge(df, on="Feature", how="left").fillna(0.0)
return out
except Exception:
# fallback if SHAP fails
return pd.DataFrame({"Feature": feature_names, "mean_abs_shap": np.nan})
def find_threshold(y_true, p_val, mode="fbeta", beta=2.0, target_sens=None, costs=None):
"""
Pick an operating threshold from validation probs.
modes:
- "fbeta": maximize F_beta (beta>1 favors sensitivity)
- "youden": maximize Youden's J = TPR - FPR
- "target_sens": smallest threshold with sensitivity >= target_sens
- "cost": minimize expected cost given costs={"fp":c_fp,"fn":c_fn}
beta : float, default 2.0
F-beta parameter (used only when mode="fbeta").
target_sens : float | None
Target sensitivity (recall) in [0,1] when mode="target_sens".
costs : dict | None
Required when mode="cost"; keys {"fp", "fn"} specify relative costs for
false positives/negatives.
Returns
-------
float
Chosen threshold in [0,1].
"""
# use all unique prob cutpoints (plus 0 and 1 for safety)
thr_grid = np.unique(np.r_[0.0, p_val, 1.0])
best_thr, best_val = 0.5, -np.inf
if mode == "cost":
# for costs we minimize, so track min instead
best_val, best_thr = np.inf, 0.5
for t in thr_grid:
y_hat = (p_val >= t).astype(int)
tp = np.sum((y_true==1) & (y_hat==1))
fn = np.sum((y_true==1) & (y_hat==0))
tn = np.sum((y_true==0) & (y_hat==0))
fp = np.sum((y_true==0) & (y_hat==1))
sens = tp / (tp + fn) if (tp+fn)>0 else 0.0
spec = tn / (tn + fp) if (tn+fp)>0 else 0.0
if mode == "fbeta":
prec = tp / (tp + fp) if (tp+fp)>0 else 0.0
rec = sens
if prec+rec==0:
val = 0.0
else:
beta2 = beta*beta
val = (1+beta2) * (prec*rec) / (beta2*prec + rec) if (beta2*prec + rec)>0 else 0.0
if val > best_val:
best_val, best_thr = val, t
elif mode == "youden":
val = sens - (1 - spec) # TPR - FPR
if val > best_val:
best_val, best_thr = val, t
elif mode == "target_sens":
            # keep only thresholds achieving the desired sensitivity
if target_sens is None:
raise ValueError("Provide target_sens for mode='target_sens'")
if sens >= target_sens:
# prefer higher specificity among those meeting the target; break ties by higher threshold
val = spec
if val > best_val or (np.isclose(val, best_val) and t > best_thr):
best_val, best_thr = val, t
elif mode == "cost":
if costs is None or not {"fp","fn"} <= set(costs):
raise ValueError("Provide costs={'fp':...,'fn':...} for mode='cost'")
exp_cost = costs["fp"]*fp + costs["fn"]*fn
if exp_cost < best_val:
best_val, best_thr = exp_cost, t
else:
raise ValueError("Unknown mode")
return float(best_thr)
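# Hedged sanity check on a tiny synthetic example (hypothetical values, not
# study data): Youden's J peaks at the cutpoint that best separates the classes.
_demo_y = np.array([0, 0, 1, 1])
_demo_p = np.array([0.10, 0.40, 0.35, 0.80])
assert find_threshold(_demo_y, _demo_p, mode="youden") == 0.35
assert find_threshold(_demo_y, _demo_p, mode="target_sens", target_sens=1.0) == 0.35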
def tune_threshold_on_inner(x_train, y_train, best_pipeline, inner_cv,
mode="fbeta", beta=2.0, target_sens=None, costs=None, n_jobs=1):
"""
Using the best pipeline (hyperparams fixed), get out-of-fold validation probabilities
on x_train via the inner CV, then pick threshold by the chosen rule.
Parameters
----------
x_train : pd.DataFrame | np.ndarray
Features for the outer fold's training split.
y_train : pd.Series | np.ndarray
Binary labels for the outer fold's training split.
best_pipeline : imblearn.Pipeline
Pipeline with hyperparameters fixed (e.g., the GridSearchCV best estimator).
inner_cv : sklearn.model_selection.StratifiedKFold
The same inner CV scheme used during hyperparameter selection.
mode, beta, target_sens, costs : see `find_threshold`.
n_jobs : int, default 1
Parallelism for `cross_val_predict`.
Returns
-------
float
Selected decision threshold.
"""
# get OOF probabilities for the positive class
p_val = cross_val_predict(best_pipeline, x_train, y_train,
cv=inner_cv, method="predict_proba", n_jobs=n_jobs)[:, 1]
    thr = find_threshold(np.asarray(y_train), p_val,
mode=mode, beta=beta, target_sens=target_sens, costs=costs)
return thr
# ---------- main multi-model nested CV with tqdm, scaler, SHAP ----------
def my_nestedcv_multi(
x, y, n_iter=1, n_jobs=5, permutation=False, threshold_mode="youden", pca_predictors=False,
model_names=("svm", "logistic", "random_forest", "decision_tree", "xgboost", "knn"),
shap_kernel_bg_max=30, shap_kernel_test_max=40, use_gpu=True, ratio=1.0
):
"""Run nested CV for multiple models with SMOTE, feature selection, and SHAP.
Workflow (per model)
--------------------
For each outer fold and `n_iter` repetitions:
1) Inner 5× Stratified CV grid search (scoring='recall') over pipeline params
2) Threshold tuning on inner CV OOF probabilities (e.g., Youden)
3) Refit best pipeline on full outer-train with tuned threshold
4) Evaluate on outer-test (ACC, SEN, SPEC, PPV, NPV, AUC + ROC curve)
5) Record selected features and SHAP mean |SHAP| per feature
Parameters
----------
x : pd.DataFrame
Feature matrix. If `pca_predictors=True` and the heuristic triggers,
PCA is applied once to reduce dimensionality.
y : array-like
Binary labels (0/1).
n_iter : int, default 1
How many times to repeat the outer CV (with different shuffles).
n_jobs : int, default 5
Parallelism for GridSearchCV and models that support it.
permutation : bool, default False
If True, permutes labels for a quick sanity check (should degrade performance).
threshold_mode : str, default "youden"
Passed to `tune_threshold_on_inner` ("youden", "fbeta", "target_sens", "cost").
pca_predictors : bool, default False
Whether to optionally apply PCA before CV when samples/features are low.
model_names : tuple[str], default ("svm", "logistic", "random_forest", "decision_tree", "xgboost", "knn")
Subset of models defined in `_get_model_specs` to run.
shap_kernel_bg_max, shap_kernel_test_max : int
Subsample sizes for Kernel SHAP when using kernel models (SVM/KNN).
use_gpu : bool, default True
Try GPU for XGBoost if available.
ratio : float, default 1.0
Class imbalance ratio for XGBoost's `scale_pos_weight`.
Returns
-------
dict
Mapping: model_name -> {
"performance": {
"score": {"accuracy": list, "sensitivity": list, "specificity": list,
"AUC": list, "PPV": list, "NPV": list},
"fpr": list[np.ndarray],
"tpr": list[np.ndarray],
},
"selected_features": pd.DataFrame, # selection counts across folds/iters
"shap_summary": pd.DataFrame # mean of mean|SHAP| across folds/iters
}
Notes
-----
- The pipeline order is critical: **StandardScaler -> SMOTE -> FeatureSelection -> Classifier**.
This ensures SMOTE operates in a standardized space and selection is applied to
the resampled data.
- GridSearchCV uses `scoring='recall'` to prioritize sensitivity on imbalanced data.
"""
    # PCA (optional; applied once pre-CV). NOTE: fitting PCA on the full data before
    # CV can leak information; moving it inside the ImbPipeline would avoid this.
if pca_predictors:
if should_apply_pca(x):
n_components = min(10, x.shape[1])
pca = PCA(n_components=n_components)
x = pd.DataFrame(pca.fit_transform(x), index=x.index, columns=[f"PC{i+1}" for i in range(n_components)])
else:
x = x.copy()
else:
x = x.copy()
# y to Series 0/1
y_series = pd.Series(y).astype(int).reset_index(drop=True)
if permutation:
y_series = y_series.sample(frac=1.0, random_state=None).reset_index(drop=True)
model_specs = _get_model_specs(use_gpu=use_gpu, n_jobs=n_jobs, ratio=ratio)
results_by_model = {}
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True)  # no fixed seed: each repetition reshuffles
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True)
feature_names = list(x.columns)
# Only run models that exist in the spec mapping
pbar_models = [m for m in model_names if m in model_specs]
for name in pbar_models:
spec = model_specs[name]
classifier = spec["clf"]
param_grid = spec["grid"]
feature_selector = spec["selector"]
model_type = spec["type"]
# Build pipeline
# NOTE: StandardScaler is placed **before** SMOTE so that k-NN based resamplers
# and distance metrics are computed in a standardized space.
pipeline = ImbPipeline([
('scaler', StandardScaler(with_mean=True, with_std=True)),
('smote', BorderlineSMOTE(kind='borderline-1', random_state=42)),
# ('smote', SMOTE(random_state=42)),
('feature_selection', feature_selector),
('classification', classifier),
])
overall_outercv_performance = {
"score": {"accuracy": [], "sensitivity": [], "specificity": [], "AUC": [], "PPV": [], "NPV": []},
"fpr": [],
"tpr": [],
}
selected_features_list = []
shap_summaries = [] # collect per-fold mean|SHAP|; aggregate later
pbar_iters = tqdm(range(n_iter), desc=f"{name} • iterations", leave=True)
for _ in pbar_iters:
iteration_outercv = {
"score": {"accuracy": [], "sensitivity": [], "specificity": [], "AUC": [], "PPV": [], "NPV": []},
"fpr": [],
"tpr": [],
}
# pbar_folds = tqdm(list(outer_cv.split(x, y_series)), desc=f"{name} • outer folds", leave=False)
for train_idx, test_idx in outer_cv.split(x, y_series):
x_train, x_test = x.iloc[train_idx], x.iloc[test_idx]
y_train, y_test = y_series.iloc[train_idx], y_series.iloc[test_idx]
            # Inner CV grid search (optimizes recall; another scorer such as roc_auc could be used)
grid_search = GridSearchCV(
estimator=pipeline,
param_grid=param_grid,
cv=inner_cv,
scoring='recall',
n_jobs=n_jobs,
verbose=0 # Silence joblib progress display
)
grid_search.fit(x_train, y_train)
best_pipeline = grid_search.best_estimator_
            # 1) tune the decision threshold on inner-CV out-of-fold probabilities
best_thr = tune_threshold_on_inner(
x_train, y_train, best_pipeline, inner_cv,
mode=threshold_mode, beta=2.0, # e.g., favor sensitivity
# OR: mode="youden"
# OR: mode="target_sens", target_sens=0.80
# OR: mode="cost", costs={"fp":1, "fn":5} # cost of FN is higher
n_jobs=n_jobs)
            # 2) refit on the full outer-train split (GridSearchCV with refit=True has
            #    already done this; the explicit fit keeps the step visible)
best_pipeline.fit(x_train, y_train)
# Selected features (names)
selector = best_pipeline.named_steps['feature_selection']
support = selector.get_support()
if support is None or np.sum(support) == 0:
selected_cols = x.columns
else:
selected_cols = x.columns[support]
selected_features_list.append(selected_cols)
# Evaluate on outer test with tuned threshold
probs = best_pipeline.predict_proba(x_test)[:, 1]
yhat = (probs >= best_thr).astype(int)
acc = accuracy_score(y_test, yhat)
tn, fp, fn, tp = confusion_matrix(y_test, yhat).ravel()
sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
cur_fpr, cur_tpr, _ = roc_curve(y_test, probs)
ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
auc_val = auc(cur_fpr, cur_tpr)
iteration_outercv["score"]["accuracy"].append(acc)
iteration_outercv["score"]["sensitivity"].append(sens)
iteration_outercv["score"]["specificity"].append(spec)
iteration_outercv["score"]["AUC"].append(auc_val)
iteration_outercv["score"]["PPV"].append(ppv)
iteration_outercv["score"]["NPV"].append(npv)
iteration_outercv["fpr"].append(cur_fpr)
iteration_outercv["tpr"].append(cur_tpr)
# SHAP summary for this fold (mean|SHAP| per feature)
shap_df = _compute_shap_summary(
best_pipeline, x_train, x_test, feature_names, model_type,
bg_max=shap_kernel_bg_max, test_max=shap_kernel_test_max
)
shap_summaries.append(shap_df)
# Aggregate metrics across outer folds for this iteration
overall_outercv_performance["score"]["accuracy"].append(statistics.mean(iteration_outercv["score"]["accuracy"]))
overall_outercv_performance["score"]["sensitivity"].append(statistics.mean(iteration_outercv["score"]["sensitivity"]))
overall_outercv_performance["score"]["specificity"].append(statistics.mean(iteration_outercv["score"]["specificity"]))
overall_outercv_performance["score"]["AUC"].append(statistics.mean(iteration_outercv["score"]["AUC"]))
overall_outercv_performance["score"]["PPV"].append(statistics.mean(iteration_outercv["score"]["PPV"]))
overall_outercv_performance["score"]["NPV"].append(statistics.mean(iteration_outercv["score"]["NPV"]))
overall_outercv_performance["fpr"].append(np.mean(arraylist2arrayColumn(iteration_outercv["fpr"]), axis=1))
overall_outercv_performance["tpr"].append(np.mean(arraylist2arrayColumn(iteration_outercv["tpr"]), axis=1))
# Feature selection frequencies across all folds/iterations
selected_features_df = allocate_feature_list(x, selected_features_list)
# aggregate SHAP summaries across all folds/iterations (mean of mean|SHAP|)
if len(shap_summaries) > 0:
shap_concat = pd.concat(shap_summaries, axis=0)
shap_summary = (shap_concat.groupby("Feature", as_index=False)["mean_abs_shap"]
.mean()
.sort_values("mean_abs_shap", ascending=False)
.reset_index(drop=True))
else:
shap_summary = pd.DataFrame({"Feature": feature_names, "mean_abs_shap": np.nan})
results_by_model[name] = {
"performance": overall_outercv_performance,
"selected_features": selected_features_df,
"shap_summary": shap_summary
}
return results_by_model
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
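Optional smoke test (a minimal sketch on synthetic data): the cell below runs the nested CV end to end with a single model just to confirm the wiring; the dataset and settings here are hypothetical and not part of the study.
In [ ]:
# smoke test on synthetic data (hypothetical; safe to skip)
rng = np.random.default_rng(0)
x_demo = pd.DataFrame(rng.normal(size=(60, 8)), columns=[f"f{i}" for i in range(8)])
y_demo = pd.Series(rng.integers(0, 2, size=60))
_ = my_nestedcv_multi(x_demo, y_demo, n_iter=1, n_jobs=1,
                      model_names=("logistic",), use_gpu=False)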
Load data¶
In [24]:
# create predictors and outcome
# Load the data sheet
data = pd.read_excel(r"MLpipeline data.xlsx")
predictors = data.drop(columns=["outcome"])
# Create the outcome variable
y = (data["outcome"]).astype(int)
y_ = np.asarray(y)
n_pos = np.sum(y_ == 1)
n_neg = np.sum(y_ == 0)
ratio = float(n_neg) / float(max(n_pos, 1))  # class-imbalance ratio (neg/pos); pass as ratio= below to weight XGBoost
n_iter = 5 # number of iterations of the outer CV loop
n_jobs = 6 # number of cores for parallel computation
Run the ML pipeline¶
In [25]:
start_time = time.time() # record running time
results = my_nestedcv_multi(
predictors, y,
    n_iter=n_iter,  # repeat the outer CV loop n_iter times
n_jobs=n_jobs,
permutation=False,
threshold_mode="youden",
pca_predictors=False,
# model_names=("lasso","ridge","svm","logistic","random_forest","decision_tree","xgboost","knn"),
model_names=("svm","logistic","random_forest","decision_tree","xgboost","knn"),
use_gpu=False,
ratio=1.0
)
results_permu = my_nestedcv_multi(
predictors, y,
    n_iter=n_iter,  # repeat the outer CV loop n_iter times
n_jobs=n_jobs,
permutation=True,
threshold_mode="youden",
pca_predictors=False,
# model_names=("lasso","ridge","svm","logistic","random_forest","decision_tree","xgboost","knn"),
model_names=("svm","logistic","random_forest","decision_tree","xgboost","knn"),
use_gpu=False,
ratio=1.0
) # permutation run: labels shuffled as a chance-level baseline
end_time = time.time()
runtime = end_time - start_time
print(f"runtime: {runtime}")
svm • iterations: 100% 5/5 [03:00<00:00, 36.01s/it]
logistic • iterations: 100% 5/5 [00:09<00:00, 1.87s/it]
random_forest • iterations: 100% 5/5 [01:16<00:00, 15.32s/it]
decision_tree • iterations: 100% 5/5 [00:39<00:00, 7.97s/it]
xgboost • iterations: 100% 5/5 [01:43<00:00, 20.69s/it]
knn • iterations: 100% 5/5 [04:02<00:00, 48.59s/it]
svm • iterations: 100% 5/5 [03:00<00:00, 36.02s/it]
logistic • iterations: 100% 5/5 [00:09<00:00, 1.85s/it]
random_forest • iterations: 100% 5/5 [01:16<00:00, 15.34s/it]
decision_tree • iterations: 100% 5/5 [00:39<00:00, 7.96s/it]
xgboost • iterations: 100% 5/5 [01:44<00:00, 20.85s/it]
knn • iterations: 100% 5/5 [04:04<00:00, 48.93s/it]
runtime: 1307.1047189235687
In [12]:
# save results
import pickle
# save
with open("ML_results.pkl", "wb") as f:
pickle.dump(results, f)
with open("ML_results_permu.pkl", "wb") as f:
pickle.dump(results_permu, f)
# # load back
# with open("results.pkl", "rb") as f:
# loaded = pickle.load(f)
# print(loaded)
Display the model performance¶
In [26]:
import numpy as np
import pandas as pd
import scipy.stats as st
# ---- permutation-based significance test ----
def mypermutation_ttest(x, y):
"""Compute a simple permutation-style p-value vs. chance for metric arrays.
Parameters
----------
x : array-like
Permutation distribution for a metric (e.g., scores from label shuffles).
Interpreted as the **null** distribution.
y : array-like
Observed metric values under the real labels.
Returns
-------
float
        Proportion of permutation samples greater than the **median** of `y`.
    Notes
    -----
    This is not a formal t-test; a standard permutation test comparing means
    or AUCs with a two-sided tail could be substituted if needed.
    """
permutation_large_sum = sum(x > np.median(y))
permutation_size = len(x)
p_value = permutation_large_sum / permutation_size
return p_value
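# Hedged example (hypothetical values): 1 of 4 permutation scores exceeds the
# observed median (0.70), so p = 0.25.
assert mypermutation_ttest(np.array([0.5, 0.6, 0.7, 0.8]), np.array([0.65, 0.75])) == 0.25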
# ---- helpers for stars / summary formatting ----
def _stars_from_p(p):
"""Map a p-value to significance stars.
"""
if p is None or np.isnan(p):
return ""
if p > 0.05:
return "ns"
elif p <= 0.0001:
return "***"
elif p <= 0.01:
return "**"
else:
return "*"
def _mean_ci_95(arr):
"""Return (mean, [low, high]) using a normal-approximation 95% CI on the mean.
Parameters
----------
arr : array-like
Input values (NaNs ignored).
Returns
-------
tuple
        `(mean, [ci_low, ci_high])` where the CI is computed as a **normal**
        95% CI on the mean using the standard error of the mean (SEM). CI
        endpoints are rounded to 3 decimals; the mean is returned unrounded.
"""
arr = np.asarray(arr, dtype=float)
arr = arr[~np.isnan(arr)]
if arr.size == 0:
return (np.nan, [np.nan, np.nan])
if arr.size == 1:
m = float(arr[0])
return (m, [m, m])
m = float(np.mean(arr))
    # 95% CI for the mean: normal approximation via SEM
ci_lo, ci_hi = st.norm.interval(confidence=0.95, loc=np.mean(arr), scale=st.sem(arr))
return (m, [float(np.round(ci_lo, 3)), float(np.round(ci_hi, 3))])
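# Hedged example (hypothetical values): mean 0.8 with a normal-approximation
# 95% CI of roughly [0.687, 0.913] (mean +/- 1.96 * SEM).
_m, (_lo, _hi) = _mean_ci_95([0.7, 0.8, 0.9])
assert np.isclose(_m, 0.8) and np.isclose(_lo, 0.687) and np.isclose(_hi, 0.913)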
# Map internal metric keys to display names/order
_METRIC_ORDER = [
("accuracy", "ACC"),
("sensitivity", "SENS"),
("specificity", "SPEC"),
("AUC", "AUC"),
("PPV", "PPV"),
("NPV", "NPV"),
]
def build_model_performance_table(
results_by_model,
permutation_results_by_model=None,
models=None,
fmt="string" # "string" -> "mean★, 95%CI([lo, hi])"; or "numbers" -> separate numeric tables
):
"""
Build a single table with rows=metrics and columns=models.
    - results_by_model: dict like results['svm'] = {"performance": {"score": {...}}}
- permutation_results_by_model: same structure; if given, stars are computed vs permutation per metric/model
- models: optional list to control column order; defaults to keys in results_by_model
- fmt: "string" for compact strings in cells, or "numbers" to also return numeric matrices
Returns:
table_df (if fmt="string")
OR
{"display": DataFrame of strings,
"mean": DataFrame of means,
"cilow": DataFrame of CI lows,
"cihigh": DataFrame of CI highs,
"p": DataFrame of p-values,
"stars": DataFrame of stars}
"""
if models is None:
        models = list(results_by_model.keys())
# Prepare containers
    metric_labels = [lab for _, lab in _METRIC_ORDER]
    display = pd.DataFrame(index=metric_labels, columns=models, dtype=object)
    mean_df = pd.DataFrame(index=metric_labels, columns=models, dtype=float)
    lo_df = pd.DataFrame(index=metric_labels, columns=models, dtype=float)
    hi_df = pd.DataFrame(index=metric_labels, columns=models, dtype=float)
    p_df = pd.DataFrame(index=metric_labels, columns=models, dtype=float)
    star_df = pd.DataFrame(index=metric_labels, columns=models, dtype=object)
for model in models:
if model not in results_by_model:
continue
scores = results_by_model[model]["performance"]["score"]
perm_scores = None
if permutation_results_by_model is not None and model in permutation_results_by_model:
perm_scores = permutation_results_by_model[model]["performance"]["score"]
for key, label in _METRIC_ORDER:
data = scores.get(key, [])
mean, ci = _mean_ci_95(data)
# p-value / stars vs permutation (if provided)
pval, stars = (np.nan, "")
if perm_scores is not None and key in perm_scores:
try:
pval = mypermutation_ttest(np.asarray(perm_scores[key]), np.asarray(data))
except Exception:
pval = np.nan
stars = _stars_from_p(pval)
# fill numeric frames
mean_df.loc[label, model] = round(mean, 3) if pd.notna(mean) else np.nan
lo_df.loc[label, model] = ci[0]
hi_df.loc[label, model] = ci[1]
p_df.loc[label, model] = pval
star_df.loc[label, model] = stars
# build display string
if pd.isna(mean):
disp = "NA"
else:
disp = f"{round(mean,3)}{stars}, 95%CI([{ci[0]}, {ci[1]}])"
display.loc[label, model] = disp
if fmt == "numbers":
return {
"display": display,
"mean": mean_df,
"cilow": lo_df,
"cihigh": hi_df,
"p": p_df,
"stars": star_df,
}
else:
return display
In [27]:
models_in_order = ("svm","logistic","random_forest","decision_tree","xgboost","knn")
# Build the performance table: observed results vs. permutation baseline (for stars)
table_predictors = build_model_performance_table(results, results_permu, models_in_order, fmt="string")
print("\npredictors → outcomes")
table_predictors
predictors → outcomes
Out[27]:
| | svm | logistic | random_forest | decision_tree | xgboost | knn |
|---|---|---|---|---|---|---|
| ACC | 0.66ns, 95%CI([0.637, 0.684]) | 0.553ns, 95%CI([0.517, 0.589]) | 0.768***, 95%CI([0.726, 0.81]) | 0.754***, 95%CI([0.73, 0.779]) | 0.729***, 95%CI([0.716, 0.742]) | 0.654***, 95%CI([0.614, 0.695]) |
| SENS | 0.536ns, 95%CI([0.418, 0.654]) | 0.592ns, 95%CI([0.472, 0.712]) | 0.744***, 95%CI([0.656, 0.832]) | 0.544ns, 95%CI([0.513, 0.575]) | 0.712***, 95%CI([0.615, 0.809]) | 0.656ns, 95%CI([0.568, 0.744]) |
| SPEC | 0.709ns, 95%CI([0.655, 0.764]) | 0.538ns, 95%CI([0.467, 0.609]) | 0.777***, 95%CI([0.731, 0.823]) | 0.838***, 95%CI([0.797, 0.878]) | 0.737***, 95%CI([0.694, 0.78]) | 0.654***, 95%CI([0.593, 0.716]) |
| AUC | 0.685ns, 95%CI([0.652, 0.719]) | 0.628ns, 95%CI([0.602, 0.655]) | 0.856***, 95%CI([0.82, 0.892]) | 0.697***, 95%CI([0.678, 0.716]) | 0.811***, 95%CI([0.784, 0.837]) | 0.73***, 95%CI([0.683, 0.778]) |
| PPV | 0.488***, 95%CI([0.431, 0.544]) | 0.349ns, 95%CI([0.313, 0.385]) | 0.6***, 95%CI([0.539, 0.662]) | 0.568***, 95%CI([0.472, 0.664]) | 0.557ns, 95%CI([0.516, 0.598]) | 0.515***, 95%CI([0.439, 0.59]) |
| NPV | 0.819ns, 95%CI([0.784, 0.855]) | 0.783ns, 95%CI([0.738, 0.828]) | 0.895***, 95%CI([0.855, 0.935]) | 0.834***, 95%CI([0.819, 0.849]) | 0.885***, 95%CI([0.855, 0.914]) | 0.85***, 95%CI([0.814, 0.886]) |
Plot the SHAP¶
In [48]:
results["xgboost"]["shap_summary"]
Out[48]:
| | Feature | mean_abs_shap |
|---|---|---|
| 0 | Feature_49 | 0.664357 |
| 1 | Feature_59 | 0.657823 |
| 2 | Feature_47 | 0.524258 |
| 3 | Feature_26 | 0.404132 |
| 4 | Feature_46 | 0.300160 |
| 5 | Feature_41 | 0.288114 |
| 6 | Feature_33 | 0.228888 |
| 7 | Feature_57 | 0.200986 |
| 8 | Feature_35 | 0.194673 |
| 9 | Feature_3 | 0.182435 |
| 10 | Feature_31 | 0.157442 |
| 11 | Feature_27 | 0.140048 |
| 12 | Feature_50 | 0.132195 |
| 13 | Feature_20 | 0.099229 |
| 14 | Feature_16 | 0.097026 |
| 15 | Feature_37 | 0.082143 |
| 16 | Feature_51 | 0.081249 |
| 17 | Feature_21 | 0.077274 |
| 18 | Feature_29 | 0.075925 |
| 19 | Feature_18 | 0.075559 |
| 20 | Feature_39 | 0.071748 |
| 21 | Feature_44 | 0.069582 |
| 22 | Feature_42 | 0.068766 |
| 23 | Feature_17 | 0.061193 |
| 24 | Feature_22 | 0.059763 |
| 25 | Feature_36 | 0.059216 |
| 26 | Feature_2 | 0.055967 |
| 27 | Feature_54 | 0.054189 |
| 28 | Feature_48 | 0.052283 |
| 29 | Feature_45 | 0.049294 |
| 30 | Feature_32 | 0.047828 |
| 31 | Feature_13 | 0.043113 |
| 32 | Feature_4 | 0.037824 |
| 33 | Feature_55 | 0.036640 |
| 34 | Feature_25 | 0.033612 |
| 35 | Feature_1 | 0.031376 |
| 36 | Feature_24 | 0.026758 |
| 37 | Feature_15 | 0.026149 |
| 38 | Feature_40 | 0.021900 |
| 39 | Feature_8 | 0.020529 |
| 40 | Feature_30 | 0.019574 |
| 41 | Feature_6 | 0.016959 |
| 42 | Feature_43 | 0.014200 |
| 43 | Feature_11 | 0.013500 |
| 44 | Feature_5 | 0.013499 |
| 45 | Feature_58 | 0.010371 |
| 46 | Feature_10 | 0.010156 |
| 47 | Feature_34 | 0.009229 |
| 48 | Feature_23 | 0.007678 |
| 49 | Feature_14 | 0.007328 |
| 50 | Feature_28 | 0.006232 |
| 51 | Feature_53 | 0.005224 |
| 52 | Feature_38 | 0.004656 |
| 53 | Feature_19 | 0.003142 |
| 54 | Feature_52 | 0.002915 |
| 55 | Feature_56 | 0.001687 |
| 56 | Feature_9 | 0.000141 |
| 57 | Feature_12 | 0.000000 |
| 58 | Feature_7 | 0.000000 |
In [33]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
def plot_shap_summary_bars(results_by_model, models=None, topk=10, normalize='sum', save=True, save_filename=None):
"""
Plot top-k features per model with a consistent 0–1 scale.
normalize:
- 'sum' -> value = mean_abs_shap / sum(mean_abs_shap) (shares add to 1)
- 'max' -> value = mean_abs_shap / max(mean_abs_shap) (largest bar = 1)
    - None -> raw mean_abs_shap (no normalization)
"""
if models is None:
models = list(results_by_model.keys())
n = len(models)
fig, axes = plt.subplots(nrows=n, ncols=1, figsize=(8, 3*n), constrained_layout=True)
if n == 1:
axes = [axes]
for ax, name in zip(axes, models):
if name not in results_by_model:
ax.axis('off')
continue
df_all = results_by_model[name]["shap_summary"].copy()
# compute normalization using ALL features, not just top-k
if normalize == 'sum':
denom = df_all["mean_abs_shap"].sum()
# avoid division by zero
df_all["norm_value"] = df_all["mean_abs_shap"] / denom if denom > 0 else 0.0
xlabel = "share of total |SHAP|"
xlim = (0, 1)
formatter = mtick.PercentFormatter(1.0) # show as %
elif normalize == 'max':
denom = df_all["mean_abs_shap"].max()
df_all["norm_value"] = df_all["mean_abs_shap"] / denom if denom > 0 else 0.0
xlabel = "normalized |SHAP| (max=1)"
xlim = (0, 1)
formatter = None
else:
df_all["norm_value"] = df_all["mean_abs_shap"]
xlabel = "mean |SHAP|"
xlim = None
formatter = None
# pick top-k for plotting
ss = df_all.sort_values("norm_value", ascending=False).head(topk)
ax.barh(ss["Feature"][::-1], ss["norm_value"][::-1])
ax.set_title(f"{name} — top {topk} features")
ax.set_xlabel(xlabel)
ax.set_ylabel("feature")
if xlim is not None:
ax.set_xlim(*xlim)
if formatter is not None:
ax.xaxis.set_major_formatter(formatter)
if save and save_filename is None:
save_filename = f"SHAP Plot.png"
plt.savefig(save_filename, dpi=300, bbox_inches='tight')
print(f"Figure saved to {save_filename}")
plt.show()
# Example:
plot_shap_summary_bars(
results,
models=("svm","logistic","random_forest","decision_tree","xgboost","knn"),
topk=20,
    normalize=None  # or 'sum' or 'max'
)
Figure saved to SHAP Plot.png
Plot the ROC curve¶
In [53]:
import numpy as np
import matplotlib.pyplot as plt
def _interp_on_grid(fpr, tpr, grid):
"""Interpolate a single ROC curve (FPR, TPR) onto a shared FPR grid.
Parameters
----------
fpr : array-like
False Positive Rate values from an ROC curve, assumed sorted ascending.
tpr : array-like
True Positive Rate values corresponding to each FPR.
grid : array-like
Common grid of FPR points (e.g., from 0 to 1) where TPR should be interpolated.
Returns
-------
np.ndarray
Interpolated TPR values evaluated at the given grid.
Notes
-----
Ensures the curve starts at (0,0) and ends at (1,1) before performing
piecewise-linear interpolation using ``np.interp``.
"""
fpr = np.asarray(fpr, float)
tpr = np.asarray(tpr, float)
if fpr[0] > 0.0:
fpr = np.r_[0.0, fpr]; tpr = np.r_[0.0, tpr]
if fpr[-1] < 1.0:
fpr = np.r_[fpr, 1.0]; tpr = np.r_[tpr, 1.0]
return np.interp(grid, fpr, tpr)
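# Hedged check (hypothetical values): a single-point curve is anchored at
# (0, 0) and (1, 1), so interpolation recovers the identity on this grid.
_demo_grid = np.array([0.0, 0.25, 0.5, 0.75, 1.0])
assert np.allclose(_interp_on_grid([0.5], [0.5], _demo_grid), _demo_grid)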
def _adaptive_fpr_grid(fpr_list):
"""Generate an adaptive FPR grid based on all ROC curves.
Parameters
----------
fpr_list : list of array-like
List of FPR arrays (one per ROC curve / iteration).
Returns
-------
np.ndarray
Sorted, unique array containing the union of all FPR points across iterations,
including 0.0 and 1.0.
Notes
-----
This avoids choosing an arbitrary ``grid_size`` and ensures the mean/SEM
calculations preserve all original ROC anchor points.
"""
all_fpr = np.concatenate([
np.asarray(fpr, float).ravel() for fpr in fpr_list if len(fpr) > 0
])
all_fpr = np.r_[all_fpr, 0.0, 1.0]
grid = np.unique(np.clip(all_fpr, 0.0, 1.0))
return grid
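# Hedged check (hypothetical values): the grid is the sorted union of all
# FPR points plus the endpoints {0, 1}.
assert np.allclose(_adaptive_fpr_grid([[0.2, 0.6], [0.4]]), [0.0, 0.2, 0.4, 0.6, 1.0])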
def plot_mean_roc_with_sem_adaptive(results_by_model, models=None, ax=None,
label_fmt="{model} (AUC≈{auc:.3f})",
save=True, save_filename="ROC_mean_sem.png"):
"""Plot mean ROC curves with SEM bands using an adaptive FPR grid.
Parameters
----------
results_by_model : dict
Output dictionary from ``my_nestedcv_multi`` containing ROC data for each model.
models : list[str] or None, optional
Which model keys to plot. Defaults to all keys in ``results_by_model``.
ax : matplotlib.axes.Axes or None, optional
Axis object to draw on. Creates a new figure if None.
label_fmt : str, optional
Format string for legend labels (receives model and AUC).
save : bool, optional
If True, saves the plot as a PNG file.
save_filename : str, optional
Filename for saving the figure if ``save=True``.
Returns
-------
matplotlib.axes.Axes
The axis containing the plotted ROC curves.
Notes
-----
- Computes the mean and SEM (standard error of the mean) across iterations.
- Uses a data-driven adaptive grid formed from the union of all FPRs.
- Avoids the need for a fixed grid size (e.g., 201 points).
- Draws a shaded SEM band around the mean ROC curve for uncertainty visualization.
"""
if models is None:
models = list(results_by_model.keys())
if ax is None:
fig, ax = plt.subplots()
for model in models:
perf = results_by_model[model]["performance"]
fpr_list = perf["fpr"] # list of arrays (one per iteration)
tpr_list = perf["tpr"]
# Build adaptive grid from all iterations’ FPRs
fpr_grid = _adaptive_fpr_grid(fpr_list)
# Interpolate each iteration onto the adaptive grid
tpr_interp = [
_interp_on_grid(fpr_i, tpr_i, fpr_grid)
for fpr_i, tpr_i in zip(fpr_list, tpr_list)
]
tpr_interp = np.column_stack(tpr_interp) # (len(fpr_grid), n_iter)
# Mean and SEM across iterations
mean_tpr = np.mean(tpr_interp, axis=1)
if tpr_interp.shape[1] > 1:
std_tpr = np.std(tpr_interp, axis=1, ddof=1)
sem_tpr = std_tpr / np.sqrt(tpr_interp.shape[1])
else:
sem_tpr = np.zeros_like(mean_tpr)
        # AUC: report the mean of the per-iteration AUCs from the CV results
        # (np.trapz(mean_tpr, fpr_grid) would instead give the AUC of the mean curve)
        auc_values = results_by_model[model]["performance"]["score"]["AUC"]
        auc_mean = np.mean(auc_values)
# Plot mean + SEM band
ax.plot(fpr_grid, mean_tpr, linewidth=2, label=label_fmt.format(model=model, auc=auc_mean))
ax.fill_between(fpr_grid,
np.clip(mean_tpr - sem_tpr, 0, 1),
np.clip(mean_tpr + sem_tpr, 0, 1),
alpha=0.2, linewidth=0)
ax.plot([0,1],[0,1],'--',linewidth=1,label='Chance')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC (mean across iterations) with SEM bands — adaptive grid")
ax.legend(loc="lower right")
ax.set_xlim(0, 1); ax.set_ylim(0, 1)
if save:
plt.savefig(save_filename, dpi=300, bbox_inches='tight')
print(f"Figure saved to {save_filename}")
plt.show()
return ax
In [55]:
# Plot all models, or select a subset / customize
plot_mean_roc_with_sem_adaptive(results, models=["svm","random_forest","decision_tree","xgboost"])
Figure saved to ROC_mean_sem.png
Out[55]:
<Axes: title={'center': 'ROC (mean across iterations) with SEM bands — adaptive grid'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>