Prepare the notebook 🛠¶

In [ ]:

# Download the AIcrowd setup script, run it with bash, and echo whatever it
# printed (one captured line per output line).
cat(
  system(
    'curl -sL https://gitlab.aicrowd.com/jyotish/pricing-game-notebook-scripts/raw/r-functions/r/setup.sh > setup.sh && bash setup.sh',
    intern = TRUE
  ),
  sep = "\n"
)

# Load the helper functions into the session.
# NOTE(review): aicrowd_helpers.R is presumably created by setup.sh and
# presumably defines download_aicrowd_dataset() -- confirm.
source("aicrowd_helpers.R")

# Path of the training CSV that is read later in the notebook.
TRAINING_DATA_PATH <- "training.csv"
AICROWD_API_KEY <- ""  # You can get the key from https://aicrowd.com/participants/me
download_aicrowd_dataset(AICROWD_API_KEY)

In [ ]:

# Session-wide display settings:
#   width = 130 -> wide console so the summary tables fit on one line
#   warn  = -1  -> a negative `warn` makes R ignore all warnings
options(width = 130, warn = -1)

Packages 🗃¶

Install and require here all the packages you need to define your model.

Note: Installing packages the first time might take some time.

In [ ]:

#' Install the packages this notebook depends on.
#'
#' Only packages that are not already present in the library are installed,
#' so re-running the cell after the first run does not download and rebuild
#' everything again. Missing packages are installed with a single vectorised
#' `install.packages()` call.
#'
#' @return `NULL`, invisibly. Called for its side effect.
install_packages <- function() {
  required <- c("skimr", "corrr", "tidyverse")

  # system.file() returns "" for a package that is not installed; unlike
  # requireNamespace() it does not load the package as a side effect.
  install_paths <- vapply(
    required,
    function(pkg) system.file(package = pkg),
    character(1)
  )
  missing_pkgs <- required[!nzchar(install_paths)]

  if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs)
  }
  invisible(NULL)
}
install_packages()

In [ ]:

#' Attach every package the notebook uses.
#'
#' Attaches skimr, corrr and tidyverse, in that order, to the search path.
#'
#' @return Invisibly, the value returned by the last `library()` call.
global_imports <- function() {
  pkgs <- c("skimr", "corrr", "tidyverse")
  for (pkg in pkgs) {
    # character.only = TRUE makes library() use the string held in `pkg`
    attached <- library(pkg, character.only = TRUE)
  }
  invisible(attached)
}
global_imports()

Loading the data 📲¶

In [ ]:

# Read the training CSV located at TRAINING_DATA_PATH into `train_data`.
train_data <- read_csv(file = TRAINING_DATA_PATH)

What does the data look like? 🔍¶

In [ ]:

# Print a skimr summary of train_data: row/column counts, then one table per
# column type with missing counts, completeness and basic distribution stats.
skim(train_data)

── Data Summary ────────────────────────
                           Values    
Name                       train_data
Number of rows             228216    
Number of columns          26        
_______________________              
Column type frequency:               
  character                11        
  numeric                  15        
________________________             
Group variables            None      

── Variable type: character ──────────────────────────────────────────────────────────────────────────────────────────────────────
   skim_variable n_missing complete_rate   min   max empty n_unique whitespace
 1 id_policy             0             1     8     8     0    57054          0
 2 pol_coverage          0             1     3     4     0        4          0
 3 pol_pay_freq          0             1     6     9     0        4          0
 4 pol_payd              0             1     2     3     0        2          0
 5 pol_usage             0             1     7    12     0        4          0
 6 drv_sex1              0             1     1     1     0        2          0
 7 drv_drv2              0             1     2     3     0        2          0
 8 drv_sex2              0             1     1     1     0        3          0
 9 vh_make_model         0             1    16    16     0      975          0
10 vh_fuel               0             1     6     8     0        3          0
11 vh_type               0             1     7    10     0        2          0

── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────
   skim_variable          n_missing complete_rate       mean        sd     p0      p25     p50      p75    p100 hist 
 1 year                           0         1         2.5        1.12     1       1.75     2.5     3.25      4  ▇▇▁▇▇
 2 pol_no_claims_discount         0         1         0.0444     0.118    0       0        0       0         1  ▇▁▁▁▁
 3 pol_duration                   0         1        12.6        8.64     1       5       11      18        44  ▇▅▂▂▁
 4 pol_sit_duration               0         1         4.25       2.62     1       3        4       5        26  ▇▁▁▁▁
 5 drv_age1                       0         1        56.3       15.0     19      45       56      67       104  ▂▇▇▃▁
 6 drv_age_lic1                   0         1        34.1       13.9      1      24       34      44        80  ▂▇▇▃▁
 7 drv_age2                  152896         0.330    48.6       16.3     18      36       47      61       102  ▅▇▆▃▁
 8 drv_age_lic2              152896         0.330    26.7       14.9      1      15       25      38        83  ▇▇▆▂▁
 9 vh_age                         4         1.00     11.1        7.14     1       6       10      15        64  ▇▃▁▁▁
10 vh_speed                    2552         0.989   171.        25.6     95     155      174     183       251  ▁▃▇▃▁
11 vh_value                    2552         0.989 17700.     10536.    1113   11490    16321   22067    101525  ▇▃▁▁▁
12 vh_weight                   2552         0.989  1099.       398.       0     950     1145    1315      2554  ▁▃▇▁▁
13 population                     0         1       571.       673.       0     170      320     610      2550  ▇▂▁▁▁
14 town_surface_area              0         1       205.       163.      18.5    82.1    155.    288.      745. ▇▃▂▁▁
15 claim_amount                   0         1       113.       582.       0       0        0       0     50000  ▇▁▁▁▁

In [35]:

# Narrow the console to 100 characters, then list every column with its type
# and first few values.
options(width = 100)
train_data %>%
  glimpse()

Rows: 228,216
Columns: 26
$ id_policy              <chr> "PL000000", "PL042495", "PL042496", "PL042497", "PL042498", "PL042…
$ year                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ pol_no_claims_discount <dbl> 0.332, 0.000, 0.196, 0.000, 0.000, 0.000, 0.018, 0.000, 0.087, 0.5…
$ pol_coverage           <chr> "Med2", "Med2", "Med1", "Med2", "Med1", "Med2", "Max", "Max", "Med…
$ pol_duration           <dbl> 5, 6, 2, 8, 2, 8, 1, 4, 1, 6, 29, 6, 2, 14, 5, 27, 22, 11, 10, 3, …
$ pol_sit_duration       <dbl> 1, 1, 1, 5, 2, 2, 1, 2, 1, 3, 1, 3, 2, 1, 2, 3, 7, 3, 1, 1, 4, 1, …
$ pol_pay_freq           <chr> "Monthly", "Monthly", "Yearly", "Yearly", "Yearly", "Yearly", "Yea…
$ pol_payd               <chr> "No", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", "No",…
$ pol_usage              <chr> "WorkPrivate", "WorkPrivate", "Retired", "WorkPrivate", "Retired",…
$ drv_sex1               <chr> "M", "M", "M", "F", "F", "F", "M", "F", "M", "M", "M", "F", "M", "…
$ drv_age1               <dbl> 35, 60, 55, 54, 65, 68, 41, 51, 44, 53, 55, 52, 47, 43, 47, 84, 64…
$ drv_age_lic1           <dbl> 16, 41, 35, 31, 38, 46, 20, 7, 22, 34, 34, 32, 25, 21, 22, 60, 45,…
$ drv_drv2               <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", "No"…
$ drv_sex2               <chr> "F", "0", "F", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "…
$ drv_age2               <dbl> 26, NA, 57, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 42, NA, NA, NA…
$ drv_age_lic2           <dbl> 1, NA, 38, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 24, NA, NA, NA,…
$ vh_make_model          <chr> "aparvvfowrjncdhp", "aparvvfowrjncdhp", "iwhqpdfuhrsxyqxe", "kvcdd…
$ vh_age                 <dbl> 8, 10, 8, 4, 13, 16, 1, 28, 12, 14, 15, 12, 15, 12, 5, 1, 17, 19, …
$ vh_fuel                <chr> "Gasoline", "Diesel", "Diesel", "Gasoline", "Gasoline", "Gasoline"…
$ vh_type                <chr> "Tourism", "Tourism", "Commercial", "Tourism", "Tourism", "Tourism…
$ vh_speed               <dbl> 174, 174, 150, 149, 200, 196, 160, 173, 149, 189, 188, 200, 159, 1…
$ vh_value               <dbl> 11040, 11040, 14159, 17233, 19422, 24750, 15245, 13952, 17233, 316…
$ vh_weight              <dbl> 1143, 1143, 1193, 1012, 1315, 1200, 1019, 1112, 1012, 1312, 1305, …
$ population             <dbl> 1270, 1290, 1020, 180, 30, 210, 550, 1760, 140, 810, 120, 50, 870,…
$ town_surface_area      <dbl> 33.1, 51.3, 262.8, 219.7, 70.3, 366.5, 74.0, 103.4, 397.2, 460.7, …
$ claim_amount           <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, …

Let's look at some charts!¶

In [ ]:

# Build the plotting dataset: drop the id_policy column, then turn every
# remaining character column into a factor.
train_clean <- mutate(
  select(train_data, -id_policy),
  across(where(is.character), as.factor)
)

Categorical Variables¶

In [ ]:

# Bar charts of every factor column in train_clean, one facet per variable.
train_clean %>%
  select(where(is.factor)) %>%
  # pivot_longer() replaces the superseded gather(). The factor columns have
  # different level sets, so each one is converted to character before the
  # columns are stacked into a single `value` column.
  pivot_longer(
    cols = everything(),
    names_to = "key",
    values_to = "value",
    values_transform = list(value = as.character)
  ) %>%
  ggplot() +
  geom_bar(mapping = aes(x = value, fill = key), color = "black") +
  # free scales: each facet gets its own x categories and y range
  facet_wrap(~ key, scales = "free") +
  # "none" is the documented value for hiding the legend
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Categorical Variable Distributions")

Numeric Variables¶

In [ ]:

# Histograms of every numeric column in train_clean, one facet per variable.
train_clean %>%
  select(where(is.numeric)) %>%
  # pivot_longer() replaces the superseded gather()
  pivot_longer(
    cols = everything(),
    names_to = "key",
    values_to = "value"
  ) %>%
  ggplot() +
  # bins = 30 is the value stat_bin() falls back to; stating it keeps the
  # plot unchanged and silences the "Pick better value" message
  geom_histogram(mapping = aes(x = value, fill = key),
                 bins = 30,
                 color = "black") +
  # free scales: each facet gets its own x and y range
  facet_wrap(~ key, scales = "free") +
  # ask for only about two x-axis breaks so labels do not overlap in small facets
  scale_x_continuous(n.breaks = 2) +
  # "none" is the documented value for hiding the legend
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Numeric Variable Distributions")

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Correlations¶

In [ ]:

# Network plot of the pairwise correlations between the numeric columns.
# The correlate() arguments spell out the settings reported in the cell
# output (Pearson, pairwise-complete observations).
train_clean %>%
  select(where(is.numeric)) %>%
  corrr::correlate(use = "pairwise.complete.obs", method = "pearson") %>%
  corrr::network_plot(min_cor = 0.2)

Correlation method: 'pearson'
Missing treated using: 'pairwise.complete.obs'

In [ ]:

# Correlation table for the numeric columns: compute the correlations,
# rearrange the variables, shave off the upper triangle, and format the
# values for printing.
# The correlate() arguments spell out the settings reported in the cell
# output (Pearson, pairwise-complete observations).
train_clean %>%
  select(where(is.numeric)) %>%
  corrr::correlate(use = "pairwise.complete.obs", method = "pearson") %>%
  corrr::rearrange() %>%
  corrr::shave() %>%
  corrr::fashion()

Correlation method: 'pearson'
Missing treated using: 'pairwise.complete.obs'

A data.frame: 15 × 16
term	drv_age1	drv_age_lic1	drv_age2	drv_age_lic2	pol_duration	pol_sit_duration	year	vh_age	town_surface_area	population	claim_amount	vh_speed	vh_weight	vh_value	pol_no_claims_discount
<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>	<noquote>
drv_age1
drv_age_lic1	.92
drv_age2	.55	.49
drv_age_lic2	.50	.46	.94
pol_duration	.38	.37	.12	.11
pol_sit_duration	.25	.23	.18	.16	.31
year	.07	.08	.07	.07	.13	.43
vh_age	.07	.05	-.02	-.03	.03	.32	.16
town_surface_area	.03	.03	.02	.03	.03	.01	-.00	.02
population	.02	.00	-.01	-.01	.01	.01	-.00	-.03	.10
claim_amount	-.00	-.00	-.00	-.01	-.01	-.03	-.01	-.08	-.01	.01
vh_speed	-.02	-.02	.05	.04	-.00	-.09	-.00	-.18	-.03	.03	.04
vh_weight	-.03	-.00	.04	.03	-.01	-.08	-.00	-.13	-.02	-.01	.04	.57
vh_value	-.04	-.01	.04	.03	-.07	-.09	-.00	-.09	-.03	-.00	.04	.49	.56
pol_no_claims_discount	-.34	-.36	-.12	-.13	-.22	-.13	-.06	-.04	-.01	.03	.02	.01	-.02	-.03

Insurance pricing game

Insurance Pricing Game EDA in R

Prepare the notebook 🛠¶

Packages 🗃¶

Loading the data 📲¶

What does the data look like? 🔍¶

Let's look at some charts!¶

Categorical Variables¶

Numeric Variables¶

Correlations¶

Content

Comments