Loading

ADDI Alzheimers Detection Challenge

ADDI Alzhemiers columns Reference

Feature Explorations

siddharth

Feature Explorations

ADDI Alzhemiers columns Reference

In [7]:
from IPython.display import HTML

HTML(''' <script>
code_show=true;
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }

 code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
Code hidden , to toggle: <a href="javascript:code_toggle()">here</a>.''')
Out[7]:
Code hidden , to toggle: here.
In [1]:
import pandas as pd
In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import HTML
import warnings
warnings.filterwarnings("ignore")

INPUT_DIR = '../data/'
COLS_TO_SHOW = 130


def h(content):
    display(HTML(content))
    
def target_hist(col):
    tr[tr['diagnosis'] == 'normal'][col].plot(style='.', title='Hist ' + col, figsize=(15, 3))
    tr[tr['diagnosis'] == 'post_alzheimer'][col].plot(style='.', title='Hist ' + col, figsize=(15, 3))
    tr[tr['diagnosis'] == 'pre_alzheimer'][col].plot(style='.',figsize=(15, 3),
                                                     title=col + ' (blue=normal, orange=post_alzheimer, green=pre_alzheimer)')
    plt.show()
    
    
def _desc(data, col, label):
    d0 = data.describe().reset_index()
    d0.columns = [col, label]
    return d0.append({col:'unique values', label:data.unique().shape[0]}, ignore_index=True) \
             .append({col:'unique values / count', label:np.round(data.unique().shape[0] / data.shape[0], 4)}, ignore_index=True) \
             .append({col:'NaNs', label:data.isnull().sum()}, ignore_index=True) \
    
def desc1(col):
    d0 = _desc(tr[col], col, 'Train')
    d3 = _desc(te[col], col, 'Validation')
    dd = d0.merge(d3)
    display(dd)
    
    if col not in ['row_id']:
        h('<b>Most popular values (NaN = -999):</b>')
        N = 10
        d0 = tr[['row_id',col]].fillna(-999).groupby(col)['row_id'].count().reset_index()
        d1 = te[['row_id',col]].fillna(-999).groupby(col)['row_id'].count().reset_index()
        dd = d0.merge(d1, how='left', on=col)
        dd['Share in train'] = np.round(dd['row_id_x'] / dd['row_id_x'].sum(), 5)
        dd['Share in validation'] = np.round(dd['row_id_y'] / dd['row_id_y'].sum(), 5)
        dd = dd.sort_values('row_id_x', ascending=False).head(N).fillna(0).reset_index(drop=True)
        dd = dd.rename({'row_id_x':'Count in train (desc)','row_id_y':'Count in validation'}, axis=1)
        display(dd)

def hist1(col):
    plt.figure(figsize=(15, 3))
    plt.subplot(121)
    plt.hist(tr[col], bins=70);
    plt.title('Train histogram: ' + col);
    plt.subplot(122)
    plt.hist(te[col], bins=70);
    plt.title('Validation histogram: ' + col);
    plt.show()

def barh1(col):
    if col not in ['row_id']:
        plt.figure(figsize=(15, 3))
        plt.subplot(121)
        tr[col].value_counts().sort_values().plot(kind = 'barh')
        plt.title('Train value counts: ' + col);
        plt.subplot(122)
        te[col].value_counts().sort_values().plot(kind = 'barh')
        plt.title('Validation value counts: ' + col);
        plt.show()
        
def corr1(col):
    N = None #10000
    num_vars = [f for f in tr.columns if tr[f].dtype != 'object']
    trx = tr.head(N) if N is not None else tr.copy()
    corrs = trx[num_vars].corrwith(trx[col]).reset_index().sort_values(0, ascending=False).reset_index(drop=True).rename({'index':'Column',0:'Correlation with ' + col}, axis=1)
    h('<b>Most correlated values with ' + col + ':</b>')
    trx = pd.concat([corrs.head(6), corrs.dropna().tail(5)])
    def linkx(val):
        return '<a href="#c_{}">{}</a>'.format(val, val) if val in included_cols else val
    trx['Column'] = trx['Column'].apply(linkx)
    h(trx.to_html(escape=False))
    
def numeric(col):
    target_hist(col)
    hist1(col)
    desc1(col)
    corr1(col) 
    
def categorical(col):
    barh1(col)
    desc1(col)

def proc(col):
    h('<h3 id="c_' + col + '">' + col + '</h3>' + '<a style="font-size:11px" href="#home">(Jump to top)</a>')
    categorical(col) if tr[col].dtype == 'object' or tr[col].nunique()<=10 else numeric(col)
        
tr = pd.read_csv(INPUT_DIR + 'train.csv')
te = pd.read_csv(INPUT_DIR + 'validation.csv')

included_cols = list(tr.columns.values[:COLS_TO_SHOW])
split_on = ['row_id','missing_digit_1','1 dist from cen','diagnosis','euc_dist_digit_1','area_digit_1','height_digit_1','width_digit_1','variance_width']
h('<b>Links to column info:</b> ' + ', '.join([('<li>' if col in split_on else '') + '<a href="#c_' + col + '">' + col + '</a>' for col in included_cols]))

h('Train features shape: <b>' + str(tr.shape) + '</b>' + 
  '<br>Validation features shape: <b>' + str(te.shape) + '</b>')
h('Train features preview:')
display(tr.head(10))

for col in included_cols:
    if "diagnosis" not in col:
        proc(col)
Links to column info:
  • row_id, number_of_digits,
  • missing_digit_1, missing_digit_2, missing_digit_3, missing_digit_4, missing_digit_5, missing_digit_6, missing_digit_7, missing_digit_8, missing_digit_9, missing_digit_10, missing_digit_11, missing_digit_12,
  • 1 dist from cen, 10 dist from cen, 11 dist from cen, 12 dist from cen, 2 dist from cen, 3 dist from cen, 4 dist from cen, 5 dist from cen, 6 dist from cen, 7 dist from cen, 8 dist from cen, 9 dist from cen,
  • euc_dist_digit_1, euc_dist_digit_2, euc_dist_digit_3, euc_dist_digit_4, euc_dist_digit_5, euc_dist_digit_6, euc_dist_digit_7, euc_dist_digit_8, euc_dist_digit_9, euc_dist_digit_10, euc_dist_digit_11, euc_dist_digit_12,
  • area_digit_1, area_digit_2, area_digit_3, area_digit_4, area_digit_5, area_digit_6, area_digit_7, area_digit_8, area_digit_9, area_digit_10, area_digit_11, area_digit_12,
  • height_digit_1, height_digit_2, height_digit_3, height_digit_4, height_digit_5, height_digit_6, height_digit_7, height_digit_8, height_digit_9, height_digit_10, height_digit_11, height_digit_12,
  • width_digit_1, width_digit_2, width_digit_3, width_digit_4, width_digit_5, width_digit_6, width_digit_7, width_digit_8, width_digit_9, width_digit_10, width_digit_11, width_digit_12,
  • variance_width, variance_height, variance_area, deviation_dist_from_mid_axis, between_axis_digits_angle_sum, between_axis_digits_angle_var, between_digits_angle_cw_sum, between_digits_angle_cw_var, between_digits_angle_ccw_sum, between_digits_angle_ccw_var, sequence_flag_cw, sequence_flag_ccw, number_of_hands, hand_count_dummy, hour_hand_length, minute_hand_length, single_hand_length, clockhand_ratio, clockhand_diff, angle_between_hands, deviation_from_centre, intersection_pos_rel_centre, hour_proximity_from_11, minute_proximity_from_2, hour_pointing_digit, actual_hour_digit, minute_pointing_digit, actual_minute_digit, final_rotation_angle, ellipse_circle_ratio, count_defects, percentage_inside_ellipse, pred_tremor, double_major, double_minor, vertical_dist, horizontal_dist, top_area_perc, bottom_area_perc, left_area_perc, right_area_perc, hor_count, vert_count, eleven_ten_error, other_error, time_diff, centre_dot_detect,
  • diagnosis
  • Train features shape: (32777, 122)
    Validation features shape: (362, 121)
    Train features preview:
    row_id number_of_digits missing_digit_1 missing_digit_2 missing_digit_3 missing_digit_4 missing_digit_5 missing_digit_6 missing_digit_7 missing_digit_8 ... bottom_area_perc left_area_perc right_area_perc hor_count vert_count eleven_ten_error other_error time_diff centre_dot_detect diagnosis
    0 S0CIXBKIUEOUBNURP 12.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.526170 0.524975 0.474667 0 0 0 1 -105.0 0.0 normal
    1 IW1Z4Z3H720OPW8LL 12.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.000810 0.516212 0.483330 0 1 0 1 NaN NaN normal
    2 PVUGU14JRSU44ZADT 12.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.488109 0.550606 0.449042 0 0 0 0 0.0 0.0 normal
    3 RW5UTGMB9H67LWJHX 7.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... NaN NaN NaN 1 0 0 1 NaN NaN normal
    4 W0IM2V6F6UP5LYS3E 12.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.512818 0.511865 0.487791 0 1 0 0 0.0 1.0 normal
    5 IR9A4R5TTZJR78ZC8 12.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.513425 0.482235 0.517410 0 0 0 1 495.0 0.0 normal
    6 LS1R4PFJUOVEU0K0E 2.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0 ... 0.510611 0.527788 0.471864 1 0 0 1 NaN NaN post_alzheimer
    7 OQLC2VXVZUNWI31P9 11.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.501412 0.532871 0.466693 1 0 0 1 NaN NaN normal
    8 N0KKCFX9FJG0NSQ1E 4.0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 1.0 ... NaN NaN NaN 1 0 0 1 540.0 0.0 normal
    9 3LDA1Z7RH2HXAKRR1 10.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.491031 0.530640 0.468971 1 1 0 1 NaN NaN normal

    10 rows × 122 columns

    row_id Train Validation
    0 count 32777 362
    1 unique 32777 362
    2 top Q97OXAMNLSONJNDM2 NTZ3OMMX8MDOJZXWS
    3 freq 1 1
    4 unique values 32777 362
    5 unique values / count 1.0 1.0
    6 NaNs 0 0

    number_of_digits

    (Jump to top)
    number_of_digits Train Validation
    0 count 32703.000000 359.000000
    1 mean 10.299422 9.520891
    2 std 2.345710 3.132811
    3 min 1.000000 1.000000
    4 25% 10.000000 8.500000
    5 50% 11.000000 11.000000
    6 75% 12.000000 12.000000
    7 max 17.000000 13.000000
    8 unique values 18.000000 14.000000
    9 unique values / count 0.000500 0.038700
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    number_of_digits Count in train (desc) Count in validation Share in train Share in validation
    0 12.0 12818 122.0 0.39107 0.33702
    1 11.0 7535 74.0 0.22989 0.20442
    2 10.0 4416 42.0 0.13473 0.11602
    3 9.0 2541 27.0 0.07752 0.07459
    4 8.0 1564 12.0 0.04772 0.03315
    5 7.0 932 12.0 0.02843 0.03315
    6 6.0 678 14.0 0.02069 0.03867
    7 5.0 507 11.0 0.01547 0.03039
    8 4.0 460 13.0 0.01403 0.03591
    9 1.0 390 13.0 0.01190 0.03591
    Most correlated values with number_of_digits:
    Column Correlation with number_of_digits
    0 number_of_digits 1.000000
    1 between_digits_angle_cw_sum 0.450802
    2 between_axis_digits_angle_sum 0.398671
    3 between_digits_angle_ccw_sum 0.384744
    4 count_defects 0.337459
    5 sequence_flag_cw 0.304220
    112 missing_digit_3 -0.564346
    113 missing_digit_6 -0.579419
    114 missing_digit_5 -0.597470
    115 missing_digit_4 -0.601174
    116 missing_digit_7 -0.617619

    missing_digit_1

    (Jump to top)
    missing_digit_1 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.221845 0.289694
    2 std 0.415494 0.454253
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 1.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_1 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 25448 255 0.77640 0.70442
    1 1.0 7255 104 0.22134 0.28729
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_2

    (Jump to top)
    missing_digit_2 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.148243 0.178273
    2 std 0.355346 0.383277
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 0.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_2 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 27855 295 0.84983 0.81492
    1 1.0 4848 64 0.14791 0.17680
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_3

    (Jump to top)
    missing_digit_3 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.125096 0.172702
    2 std 0.330832 0.378517
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 0.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_3 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 28612 297 0.87293 0.82044
    1 1.0 4091 62 0.12481 0.17127
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_4

    (Jump to top)
    missing_digit_4 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.166713 0.250696
    2 std 0.372725 0.434019
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 0.500000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_4 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 27251 269 0.83141 0.74309
    1 1.0 5452 90 0.16634 0.24862
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_5

    (Jump to top)
    missing_digit_5 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.202153 0.261838
    2 std 0.401612 0.440249
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 1.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_5 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 26092 265 0.79605 0.73204
    1 1.0 6611 94 0.20170 0.25967
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_6

    (Jump to top)
    missing_digit_6 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.131364 0.192201
    2 std 0.337803 0.394580
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 0.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_6 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 28407 290 0.86667 0.80110
    1 1.0 4296 69 0.13107 0.19061
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_7

    (Jump to top)
    missing_digit_7 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.126839 0.197772
    2 std 0.332797 0.398875
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 0.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_7 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 28555 288 0.87119 0.79558
    1 1.0 4148 71 0.12655 0.19613
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_8

    (Jump to top)
    missing_digit_8 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.120723 0.186630
    2 std 0.325810 0.390158
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 0.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_8 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 28755 292 0.87729 0.80663
    1 1.0 3948 67 0.12045 0.18508
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_9

    (Jump to top)
    missing_digit_9 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.175183 0.289694
    2 std 0.380129 0.454253
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 1.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_9 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 26974 255 0.82296 0.70442
    1 1.0 5729 104 0.17479 0.28729
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_10

    (Jump to top)
    missing_digit_10 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.147418 0.236769
    2 std 0.354527 0.425693
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 0.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_10 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 27882 274 0.85066 0.75691
    1 1.0 4821 85 0.14708 0.23481
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_11

    (Jump to top)
    missing_digit_11 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.168241 0.222841
    2 std 0.374086 0.416733
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 0.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_11 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 27201 279 0.82988 0.77072
    1 1.0 5502 80 0.16786 0.22099
    2 -999.0 74 3 0.00226 0.00829

    missing_digit_12

    (Jump to top)
    missing_digit_12 Train Validation
    0 count 32703.000000 359.000000
    1 mean 0.115158 0.189415
    2 std 0.319217 0.392385
    3 min 0.000000 0.000000
    4 25% 0.000000 0.000000
    5 50% 0.000000 0.000000
    6 75% 0.000000 0.000000
    7 max 1.000000 1.000000
    8 unique values 3.000000 3.000000
    9 unique values / count 0.000100 0.008300
    10 NaNs 74.000000 3.000000
    Most popular values (NaN = -999):
    missing_digit_12 Count in train (desc) Count in validation Share in train Share in validation
    0 0.0 28937 291 0.88284 0.80387
    1 1.0 3766 68 0.11490 0.18785
    2 -999.0 74 3 0.00226 0.00829

    1 dist from cen

    (Jump to top)
    1 dist from cen Train Validation
    0 count 25448.000000 255.000000
    1 mean 361.869732 354.339930
    2 std 50.310698 57.701010
    3 min 3.354102 51.983170
    4 25% 336.580321 330.244742
    5 50% 367.434688 368.160970
    6 75% 393.898464 388.853105
    7 max 618.025889 492.941426
    8 unique values 21148.000000 255.000000
    9 unique values / count 0.645200 0.704400
    10 NaNs 7329.000000 107.000000
    Most popular values (NaN = -999):
    1 dist from cen Count in train (desc) Count in validation Share in train Share in validation
    0 -999.000000 7329 107.0 0.22360 0.54872
    1 380.304155 6 0.0 0.00018 0.00000
    2 383.020887 6 0.0 0.00018 0.00000
    3 400.707187 6 0.0 0.00018 0.00000
    4 362.286144 6 0.0 0.00018 0.00000
    5 353.022662 6 0.0 0.00018 0.00000
    6 371.875315 5 0.0 0.00015 0.00000
    7 393.817851 5 0.0 0.00015 0.00000
    8 391.651950 5 0.0 0.00015 0.00000
    9 370.961588 5 0.0 0.00015 0.00000
    Most correlated values with 1 dist from cen:
    Column Correlation with 1 dist from cen
    0 1 dist from cen 1.000000
    1 2 dist from cen 0.709005
    2 12 dist from cen 0.658284
    3 3 dist from cen 0.542965
    4 11 dist from cen 0.435573
    5 4 dist from cen 0.358795
    111 area_digit_2 -0.269999
    112 height_digit_3 -0.275796
    113 width_digit_12 -0.292792
    114 height_digit_12 -0.301515
    115 area_digit_12 -0.346086

    10 dist from cen

    (Jump to top)
    10 dist from cen Train Validation
    0 count 27882.000000 274.000000
    1 mean 367.418424 362.651154
    2 std 48.060878 61.588089
    3 min 5.852350 37.643060
    4 25% 343.945581 338.489183
    5 50% 372.683512 373.309857
    6 75% 397.112940 401.833946
    7 max 628.776988 505.421853
    8 unique values 22765.000000 275.000000
    9 unique values / count 0.694500 0.759700
    10 NaNs 4895.000000 88.000000
    Most popular values (NaN = -999):
    10 dist from cen Count in train (desc) Count in validation Share in train Share in validation
    0 -999.000000 4895 88.0 0.14934 0.50575
    1 358.793116 6 0.0 0.00018 0.00000
    2 365.735560 6 0.0 0.00018 0.00000
    3 369.040987 6 1.0 0.00018 0.00575
    4 404.515142 5 0.0 0.00015 0.00000
    5 413.559246 5 0.0 0.00015 0.00000
    6 389.403711 5 0.0 0.00015 0.00000
    7 395.501264 5 0.0 0.00015 0.00000
    8 379.692310 5 0.0 0.00015 0.00000
    9 372.320091 5 0.0 0.00015 0.00000
    Most correlated values with 10 dist from cen:
    Column Correlation with 10 dist from cen
    0 10 dist from cen 1.000000
    1 11 dist from cen 0.792786
    2 9 dist from cen 0.700233
    3 12 dist from cen 0.493608
    4 8 dist from cen 0.488509
    5 6 dist from cen 0.424319
    111 between_digits_angle_ccw_sum -0.250501
    112 height_digit_12 -0.267602
    113 area_digit_10 -0.276785
    114 width_digit_10 -0.295011
    115 area_digit_12 -0.296367

    11 dist from cen

    (Jump to top)
    11 dist from cen Train Validation
    0 count 27201.000000 279.000000
    1 mean 368.235873 368.939471
    2 std 48.425983 48.268612
    3 min 11.335784 215.352037
    4 25% 342.212288 343.152948
    5 50% 372.667412 371.481157
    6 75% 399.011278 403.296699
    7 max 613.843832 495.745903
    8 unique values 22258.000000 279.000000
    9 unique values / count 0.679100 0.770700
    10 NaNs 5576.000000 83.000000
    Most popular values (NaN = -999):
    11 dist from cen Count in train (desc) Count in validation Share in train Share in validation
    0 -999.000000 5576 83.0 0.17012 0.47159
    1 350.089274 8 0.0 0.00024 0.00000
    2 392.698994 7 0.0 0.00021 0.00000
    3 410.766661 6 1.0 0.00018 0.00568
    4 378.123326 6 0.0 0.00018 0.00000
    5 371.726042 6 0.0 0.00018 0.00000
    6 397.109871 5 0.0 0.00015 0.00000
    7 382.382335 5 0.0 0.00015 0.00000
    8 373.915097 5 0.0 0.00015 0.00000
    9 348.500359 5 0.0 0.00015 0.00000
    Most correlated values with 11 dist from cen:
    Column Correlation with 11 dist from cen
    0 11 dist from cen 1.000000
    1 10 dist from cen 0.792786
    2 12 dist from cen 0.649639
    3 9 dist from cen 0.534832
    4 1 dist from cen 0.435573
    5 2 dist from cen 0.416683
    111 area_digit_11 -0.281544
    112 area_digit_10 -0.282686
    113 width_digit_10 -0.297822
    114 height_digit_12 -0.302116
    115 area_digit_12 -0.329968

    12 dist from cen

    (Jump to top)
    12 dist from cen Train Validation
    0 count 28937.000000 291.000000
    1 mean 370.796838 370.891134
    2 std 48.005863 56.899557
    3 min 22.102036 94.366308
    4 25% 348.353987 342.058650
    5 50% 377.180328 381.878580
    6 75% 401.186366 406.138700
    7 max 659.571073 571.075520
    8 unique values 21357.000000 292.000000
    9 unique values / count 0.651600 0.806600
    10 NaNs 3840.000000 71.000000
    Most popular values (NaN = -999):
    12 dist from cen Count in train (desc) Count in validation Share in train Share in validation
    0 -999.000000 3840 71.0 0.11716 0.355
    1 386.505175 9 0.0 0.00027 0.000
    2 380.573909 8 0.0 0.00024 0.000
    3 386.000324 7 0.0 0.00021 0.000
    4 370.005405 7 0.0 0.00021 0.000
    5 383.720276 7 0.0 0.00021 0.000
    6 398.130946 7 0.0 0.00021 0.000
    7 360.699667 7 0.0 0.00021 0.000
    8 381.878580 6 1.0 0.00018 0.005
    9 402.657423 6 0.0 0.00018 0.000
    Most correlated values with 12 dist from cen:
    Column Correlation with 12 dist from cen
    0 12 dist from cen 1.000000
    1 1 dist from cen 0.658284
    2 11 dist from cen 0.649639
    3 2 dist from cen 0.561495
    4 10 dist from cen 0.493608
    5 3 dist from cen 0.491578
    111 height_digit_3 -0.319467
    112 area_digit_3 -0.322698
    113 width_digit_12 -0.351274
    114 height_digit_12 -0.412872
    115 area_digit_12 -0.440424

    2 dist from cen

    (Jump to top)
    2 dist from cen Train Validation
    0 count 27855.000000 295.000000
    1 mean 349.116177 340.248534
    2 std 53.313076 58.972478
    3 min 7.905694 95.630800
    4 25% 320.153479 307.196385
    5 50% 353.802911 348.680728
    6 75% 383.428285 379.130664
    7 max 568.624876 486.444498
    8 unique values 22905.000000 295.000000
    9 unique values / count 0.698800 0.814900
    10 NaNs 4922.000000 67.000000
    Most popular values (NaN = -999):
    2 dist from cen Count in train (desc) Count in validation Share in train Share in validation
    0 -999.000000 4922 67.0 0.15017 0.42405
    1 317.539368 7 0.0 0.00021 0.00000
    2 389.168986 6 0.0 0.00018 0.00000
    3 350.708212 6 0.0 0.00018 0.00000
    4 347.940369 6 0.0 0.00018 0.00000
    5 357.220170 6 0.0 0.00018 0.00000
    6 356.863069 6 0.0 0.00018 0.00000
    7 340.009191 5 0.0 0.00015 0.00000
    8 364.900671 5 0.0 0.00015 0.00000
    9 350.058924 5 0.0 0.00015 0.00000
    Most correlated values with 2 dist from cen:
    Column Correlation with 2 dist from cen
    0 2 dist from cen 1.000000
    1 3 dist from cen 0.725259
    2 1 dist from cen 0.709005
    3 12 dist from cen 0.561495
    4 4 dist from cen 0.509722
    5 minute_hand_length 0.441396
    111 height_digit_2 -0.307129
    112 width_digit_12 -0.308594
    113 area_digit_2 -0.309652
    114 height_digit_3 -0.312967
    115 area_digit_12 -0.342764

    3 dist from cen

    (Jump to top)
    3 dist from cen Train Validation
    0 count 28612.000000 297.000000
    1 mean 337.542587 328.683359
    2 std 51.175381 57.556793
    3 min 15.206906 109.592427
    4 25% 308.950947 297.228027
    5 50% 343.432854 338.913337
    6 75% 371.767737 363.792867
    7 max 611.333379 456.444137
    8 unique values 23065.000000 298.000000
    9 unique values / count 0.703700 0.823200
    10 NaNs 4165.000000 65.000000
    Most popular values (NaN = -999):
    3 dist from cen Count in train (desc) Count in validation Share in train Share in validation
    0 -999.000000 4165 65.0 0.12707 0.3869
    1 380.644848 7 0.0 0.00021 0.0000
    2 350.160677 7 0.0 0.00021 0.0000
    3 296.459525 6 0.0 0.00018 0.0000
    4 349.017550 6 0.0 0.00018 0.0000
    5 342.806797 6 0.0 0.00018 0.0000
    6 339.214165 6 0.0 0.00018 0.0000
    7 388.538930 6 0.0 0.00018 0.0000
    8 296.095002 5 0.0 0.00015 0.0000
    9 376.158544 5 0.0 0.00015 0.0000
    Most correlated values with 3 dist from cen:
    Column Correlation with 3 dist from cen
    0 3 dist from cen 1.000000
    1 4 dist from cen 0.761953
    2 2 dist from cen 0.725259
    3 5 dist from cen 0.547068
    4 1 dist from cen 0.542965
    5 12 dist from cen 0.491578
    111 area_digit_3 -0.355528
    112 height_digit_6 -0.373873
    113 area_digit_12 -0.381927
    114 width_digit_12 -0.389002
    115 height_digit_3 -0.416407

    4 dist from cen

    (Jump to top)