Feature Explorations
ADDI Alzheimer's columns Reference
In [7]:
from IPython.display import HTML
# Inject a small script that hides the notebook's code-input cells
# ('div.input') as soon as the document is ready, and render a link that
# toggles them between hidden and shown.
# NOTE(review): relies on `$` being defined in the page — presumably the
# jQuery bundled with the classic notebook front-end; confirm for other UIs.
HTML(''' <script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
Code hidden , to toggle: <a href="javascript:code_toggle()">here</a>.''')
Out[7]:
In [1]:
import pandas as pd
In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import HTML
import warnings
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")
# Directory prefix prepended to the 'train.csv' / 'validation.csv' file names.
INPUT_DIR = '../data/'
# Maximum number of leading train columns that get their own report section.
COLS_TO_SHOW = 130
def h(content):
    """Render the HTML markup in ``content`` as notebook output."""
    rendered = HTML(content)
    display(rendered)
def target_hist(col):
    """Plot the values of ``col`` from ``tr`` as dots, one series per diagnosis.

    The three diagnosis classes are plotted one after another and the figure
    is then shown. The last title set is the one listing the classes; the
    colours it names are presumably those of matplotlib's default colour
    cycle (verify if the style is customised).
    """
    legend_title = col + ' (blue=normal, orange=post_alzheimer, green=pre_alzheimer)'
    series_titles = (
        ('normal', 'Hist ' + col),
        ('post_alzheimer', 'Hist ' + col),
        ('pre_alzheimer', legend_title),
    )
    for diagnosis, title in series_titles:
        subset = tr[tr['diagnosis'] == diagnosis]
        subset[col].plot(style='.', title=title, figsize=(15, 3))
    plt.show()
def _desc(data, col, label):
d0 = data.describe().reset_index()
d0.columns = [col, label]
return d0.append({col:'unique values', label:data.unique().shape[0]}, ignore_index=True) \
.append({col:'unique values / count', label:np.round(data.unique().shape[0] / data.shape[0], 4)}, ignore_index=True) \
.append({col:'NaNs', label:data.isnull().sum()}, ignore_index=True) \
def desc1(col):
    """Display summary tables for ``col`` in the train and validation frames.

    First shows the ``_desc`` summaries of both frames merged side by side.
    For every column except 'row_id' it then shows the ten most frequent
    train values (missing values replaced by -999) with their counts and
    shares in both frames.
    """
    train_stats = _desc(tr[col], col, 'Train')
    valid_stats = _desc(te[col], col, 'Validation')
    display(train_stats.merge(valid_stats))

    # 'row_id' is the counting key below, so it cannot be profiled itself.
    if col in ['row_id']:
        return

    h('<b>Most popular values (NaN = -999):</b>')
    top_n = 10

    def _value_counts(frame):
        # Number of rows per distinct value of `col`, NaN mapped to -999.
        filled = frame[['row_id', col]].fillna(-999)
        return filled.groupby(col)['row_id'].count().reset_index()

    counts = _value_counts(tr).merge(_value_counts(te), how='left', on=col)
    counts['Share in train'] = np.round(
        counts['row_id_x'] / counts['row_id_x'].sum(), 5)
    counts['Share in validation'] = np.round(
        counts['row_id_y'] / counts['row_id_y'].sum(), 5)

    counts = counts.sort_values('row_id_x', ascending=False).head(top_n)
    # Values absent from the validation frame come out of the left join as NaN.
    counts = counts.fillna(0).reset_index(drop=True)
    counts = counts.rename(
        {'row_id_x': 'Count in train (desc)', 'row_id_y': 'Count in validation'},
        axis=1)
    display(counts)
def hist1(col):
    """Show 70-bin histograms of ``col`` for train and validation side by side."""
    plt.figure(figsize=(15, 3))
    panels = (
        (121, tr, 'Train histogram: '),
        (122, te, 'Validation histogram: '),
    )
    for position, frame, title_prefix in panels:
        plt.subplot(position)
        plt.hist(frame[col], bins=70)
        plt.title(title_prefix + col)
    plt.show()
def barh1(col):
    """Show horizontal bar charts of value counts of ``col`` for both frames.

    Nothing is drawn for the 'row_id' column.
    """
    if col in ['row_id']:
        return

    plt.figure(figsize=(15, 3))
    panels = (
        (121, tr, 'Train value counts: '),
        (122, te, 'Validation value counts: '),
    )
    for position, frame, title_prefix in panels:
        plt.subplot(position)
        frequencies = frame[col].value_counts().sort_values()
        frequencies.plot(kind='barh')
        plt.title(title_prefix + col)
    plt.show()
def corr1(col):
    """Display the columns of ``tr`` most and least correlated with ``col``.

    Correlations are computed between ``col`` and every non-object column of
    the train frame. The table shows the six highest rows followed by the
    five lowest non-NaN rows; column names listed in ``included_cols`` are
    rendered as links to their report section.
    """
    row_limit = None  # set to e.g. 10000 to correlate on the first rows only
    numeric_cols = [name for name in tr.columns if tr[name].dtype != 'object']
    sample = tr.copy() if row_limit is None else tr.head(row_limit)

    corrs = sample[numeric_cols].corrwith(sample[col]).reset_index()
    corrs = corrs.sort_values(0, ascending=False).reset_index(drop=True)
    corrs = corrs.rename({'index': 'Column', 0: 'Correlation with ' + col}, axis=1)

    h('<b>Most correlated values with ' + col + ':</b>')
    extremes = pd.concat([corrs.head(6), corrs.dropna().tail(5)])

    def _as_link(name):
        # Only columns that have a report section get an anchor link.
        if name in included_cols:
            return '<a href="#c_{}">{}</a>'.format(name, name)
        return name

    extremes['Column'] = extremes['Column'].apply(_as_link)
    # escape=False keeps the anchor tags as live markup in the rendered table.
    h(extremes.to_html(escape=False))
def numeric(col):
    """Run the numeric report for ``col``: target plot, histograms, summary, correlations."""
    for report_step in (target_hist, hist1, desc1, corr1):
        report_step(col)
def categorical(col):
    """Run the categorical report for ``col``: value-count bars, then summary tables."""
    for report_step in (barh1, desc1):
        report_step(col)
def proc(col):
    """Emit the report section for one column.

    Writes a heading anchored at ``c_<col>`` with a jump-to-top link, then
    runs the categorical report when the train column has object dtype or at
    most ten distinct values, and the numeric report otherwise.
    """
    heading = '<h3 id="c_' + col + '">' + col + '</h3>'
    jump_link = '<a style="font-size:11px" href="#home">(Jump to top)</a>'
    h(heading + jump_link)

    if tr[col].dtype == 'object' or tr[col].nunique() <= 10:
        categorical(col)
    else:
        numeric(col)
# Load the train and validation splits from INPUT_DIR.
tr = pd.read_csv(INPUT_DIR + 'train.csv')
te = pd.read_csv(INPUT_DIR + 'validation.csv')

# Only the first COLS_TO_SHOW train columns get a report section.
included_cols = list(tr.columns.values[:COLS_TO_SHOW])

# Columns whose link is preceded by '<li>' in the link index below.
split_on = [
    'row_id',
    'missing_digit_1',
    '1 dist from cen',
    'diagnosis',
    'euc_dist_digit_1',
    'area_digit_1',
    'height_digit_1',
    'width_digit_1',
    'variance_width',
]

# Index of links to every column section.
h('<b>Links to column info:</b> ' + ', '.join(
    ('<li>' if col in split_on else '') + '<a href="#c_' + col + '">' + col + '</a>'
    for col in included_cols
))

h('Train features shape: <b>' + str(tr.shape) + '</b>'
  + '<br>Validation features shape: <b>' + str(te.shape) + '</b>')

h('Train features preview:')
display(tr.head(10))

# One report section per column; columns whose name contains "diagnosis"
# are skipped.
for col in included_cols:
    if "diagnosis" in col:
        continue
    proc(col)