ADDI Alzheimers Detection Challenge
Undersampling+bagging example
To tackle the huge class imbalance, I applied undersampling+bagging with LightGBM.
As I mentioned in https://discourse.aicrowd.com/t/undersampling-bagging-boosting-0-006-in-lb/5636 ,
I'm using the undersampling+bagging technique. Here is an example.
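Before diving into the notebook, here is a minimal sketch of the idea, assuming X and X_test are feature DataFrames and y is the label Series; undersample_bag_predict, majority_label, and n_bags are illustrative names, not the exact pipeline below. Each bag trains one LightGBM model on a balanced subsample (all minority rows plus an equal-sized random draw of majority rows), and the predicted probabilities are averaged over the bags.

import numpy as np
import lightgbm as lgb

def undersample_bag_predict(X, y, X_test, majority_label='normal', n_bags=10, seed=42):
    # One model per bag: all minority rows plus an equal-sized
    # random draw (without replacement) of majority rows.
    rng = np.random.RandomState(seed)
    minority_idx = np.where(y != majority_label)[0]
    majority_idx = np.where(y == majority_label)[0]
    bag_preds = []
    for _ in range(n_bags):
        sampled = rng.choice(majority_idx, size=len(minority_idx), replace=False)
        idx = np.concatenate([minority_idx, sampled])
        model = lgb.LGBMClassifier(random_state=seed)
        model.fit(X.iloc[idx], y.iloc[idx])
        bag_preds.append(model.predict_proba(X_test))
    # Each model trains on balanced data, while averaging across bags
    # still draws on information from most of the majority class.
    return np.mean(bag_preds, axis=0)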
In [1]:
!pip install -q -U aicrowd-cli
In [2]:
%load_ext aicrowd.magic
In [3]:
import os
# Please use an absolute path for the location of the dataset,
# or build a relative one, e.g. os.path.join(os.getcwd(), "test_data/validation.csv")
AICROWD_DATASET_PATH = os.getenv("DATASET_PATH", "/ds_shared_drive/validation.csv")
#AICROWD_TRAINING_PATH = os.getenv("TRAINING_PATH","/ds_shared_drive/train.csv")
#AICROWD_VALIDATION_PATH = os.getenv("VALIDATION_PATH","/ds_shared_drive/validation_ground_truth.csv")
AICROWD_PREDICTIONS_PATH = os.getenv("PREDICTIONS_PATH", "predictions.csv")
AICROWD_ASSETS_DIR = "assets"
AICROWD_API_KEY = ""  # Get your key from https://www.aicrowd.com/participants/me
Install packages 🗃
Please add all package installations in this section.
In [4]:
!pip install numpy pandas lightgbm
Import training data
In [5]:
import numpy as np
import pandas as pd
import nyaggle
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
NUM_FOLD = 5
SEED = 42
target_col = 'diagnosis'
target_values = ['normal','post_alzheimer','pre_alzheimer']
cat_cols = ['intersection_pos_rel_centre']
unique_cols = ['actual_hour_digit', 'actual_minute_digit']
train = pd.read_csv(AICROWD_DATASET_PATH.replace('validation','train'))
train = train[train[target_col].isin(target_values)].copy().reset_index(drop=True)
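As a quick sanity check (not part of the original notebook), printing the class counts makes the imbalance this post is about visible; the normal class should dominate the two Alzheimer's classes:

print(train[target_col].value_counts())
print(train[target_col].value_counts(normalize=True).round(3))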
In [6]:
def missing_data(data):
    # Summarise missing values: absolute count, percentage, and dtype per column.
    total = data.isnull().sum()
    percent = data.isnull().sum() / len(data) * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    tt['Types'] = [str(data[col].dtype) for col in data.columns]
    return np.transpose(tt)
missing_data(train)
Out[6]: (table of per-column missing counts, percentages, and dtypes; output omitted)
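If the summary shows missing values in the single categorical column, intersection_pos_rel_centre, one possible way to handle them (a sketch of my own, not a step from this notebook; LightGBM can also handle categoricals natively) is to fill and label-encode the column with the LabelEncoder imported above:

# Hypothetical preprocessing sketch: fill NaNs with a placeholder category,
# then label-encode so tree models can consume the column as integers.
le = LabelEncoder()
encoded = le.fit_transform(train[cat_cols[0]].fillna('missing'))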
In [7]:
fig, ax = plt.subplots()
g = sns.countplot(train[target_col], order=target_values, palette='viridis')
g.set_xticklabels(target_values)
g.set_yticklabels([])

def show_values_on_bars(axs):
    # Annotate each bar with its height, i.e. the class count.
    def _show_on_single_plot(ax):
        for p in ax.patches:
            _x = p.get_x() + p.get_width() / 2
            _y = p.get_y() + p.get_height()
            value = '{:.0f}'.format(p.get_height())
            ax.text(_x, _y, value, ha="center")
    if isinstance(axs, np.ndarray):
        for idx, ax in np.ndenumerate(axs):
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)

show_values_on_bars(ax)
sns.despine(left=True, bottom=True)
plt.xlabel('')
plt.ylabel('')
plt.title('Distribution of diagnosis', fontsize=20)
plt.tick_params(axis='x', which='major', labelsize=15)
plt.show()
In [8]:
def plot_feature_distribution(df1, df2, df3, label1, label2, label3, features):
    # Overlay the per-class density of each feature on an 8x8 grid of subplots.
    sns.set_style('whitegrid')
    fig, ax = plt.subplots(8, 8, figsize=(20, 25))
    for i, feature in enumerate(features, start=1):
        plt.subplot(8, 8, i)
        sns.distplot(df1[feature], label=label1)
        sns.distplot(df2[feature], label=label2)
        sns.distplot(df3[feature], label=label3)
        plt.xlabel(feature, fontsize=9)
        plt.tick_params(axis='x', which='major', labelsize=6, pad=-6)
        plt.tick_params(axis='y', which='major', labelsize=6)
        plt.legend()
    plt.show()

df_post = train[train[target_col] == 'post_alzheimer'].reset_index(drop=True)
df_pre = train[train[target_col] == 'pre_alzheimer'].reset_index(drop=True)
df_neg = train[train[target_col] == 'normal'].reset_index(drop=True)

features = [c for c in train.columns if c not in ['row_id', target_col] + cat_cols]
plot_feature_distribution(df_neg, df_pre, df_post, 'neg', 'pre', 'post', features[:64])
In [9]:
plot_feature_distribution(df_neg, df_pre, df_post, 'neg', 'pre', 'post', features[64:])