Insurance Generalised Linear Models (GLMs) in R

Simon

2021/09/05

Requirement

Most general insurers use Generalised/Generalized Linear Models (GLMs) to develop frequency, severity, and propensity models for insurance pricing. Traditionally actuaries would build these models with Emblem or SAS. The examples here are coded in R and based upon a presentation made to the Institute and Faculty of Actuaries. The implementation here will use the glm function in R.

Suggested references

Required packages

Inputs

Outputs

Load packages and functions

library(broom)
library(dplyr)
library(Hmisc)
library(insuranceData)
library(purrr)

#Return the gini score
#model_input = the model object
#df_input    = the dataframe which is used to score the model
#actuals_input = the vector containing the actual scores for comparison
gini_score <- function(model_input, df_input, actuals_input){
  return(rcorr.cens(predict(model_input, newdata = df_input, type = "response"), actuals_input)[["Dxy"]])
  }
#special case of the above which uses the fitted values from the model object for gini score
gini_score_train <- function(model_input, actuals_input){
  return(rcorr.cens(exp(fitted.values(model_input)),actuals_input)[["Dxy"]])
}

1) Acquire data

# import the data and move into a tibble
library(insuranceData)
data("SingaporeAuto")
df <- as_tibble(SingaporeAuto)
rm(SingaporeAuto)

2) Process data

# SingaporeAuto Data Descriptions
# http://instruction.bus.wisc.edu/jfrees/jfreesbooks/Regression%20Modeling/BookWebDec2010/DataDescriptions.pdf
# 
# SexInsured  Gender of insured, including male (M), female(F) and unspecified (U)
# Female      =1 if female, =0 otherwise
# VehicleType The type of vehicle being insured, such as automobile (A), truck (T), and motorcycle (M)
# PC          =1 if private vehicle, =0 otherwise
# Clm Count   Number of claims during the year
# Exp weights Exposure weight or the fraction of the year that the policy is in effect
# LNWEIGHT    Logarithm of exposure weight
# NCD         No Claims Discount. This is based ont he previous accident record of the policyholder.
#             The higher the discount, the better is the prior accident record.
# AgeCat      The age of the policyholder, in years grouped into seven categories.
#             0-6 indicate age groups 21 and younger, 22-25, 26-35, 36-45, 46-55, 56-65, 66 and over, respectively
# VAgeCat     The age of the vehicle, in years, grouped into seven categories.
#             0-6 indicate groups 0, 1, 2, 3-5, 6-10, 11-15, 16 and older, respectively
# AutoAge0    =1 if private vehicle and VAgeCat = 0, =0 otherwise
# AutoAge1    =1 if private vehicle and VAgeCat = 1, =0 otherwise
# AutoAge2    =1 if private vehicle and VAgeCat = 2, =0 otherwise
# AutoAge     =1 if Private vehicle and VAgeCat = 0, 1 or 2, =0 otherwise
# VAgecat1    VAgeCat with categories 0, 1, and 2 combined

# process and clean the data
df_process <- df %>%
  transmute(SexInsured=factor(SexInsured),
            Female=factor(Female),
            Private=factor(PC),
            NCDCat=factor(NCD),
            AgeCat=factor(AgeCat),
            VAgeCat=factor(VAgeCat),
            Exp_weights,
            Clm_Count,
            Frequency = Clm_Count/Exp_weights
            )

#3) Explore and visualise

a) Summary of the variables in the dataset

summary(df_process)
##  SexInsured Female   Private  NCDCat    AgeCat   VAgeCat   Exp_weights      
##  F: 700     0:6783   0:3641   0 :2010   0:3640   0:3079   Min.   :0.005476  
##  M:3145     1: 700   1:3842   10:1458   2: 141   1: 681   1st Qu.:0.279261  
##  U:3638                       20:1836   3:1476   2: 646   Median :0.503764  
##                               30: 411   4:1516   3: 771   Mean   :0.519859  
##                               40: 349   5: 536   4: 924   3rd Qu.:0.752909  
##                               50:1419   6: 157   5:1164   Max.   :1.000000  
##                                         7:  17   6: 218                     
##    Clm_Count         Frequency      
##  Min.   :0.00000   Min.   : 0.0000  
##  1st Qu.:0.00000   1st Qu.: 0.0000  
##  Median :0.00000   Median : 0.0000  
##  Mean   :0.06989   Mean   : 0.1363  
##  3rd Qu.:0.00000   3rd Qu.: 0.0000  
##  Max.   :3.00000   Max.   :22.8281  
## 

b) Graphs for exposures and claims

#Define the base chart parameters
g <- ggplot(data=df_process)+scale_y_continuous(labels = scales::percent)

#Exposure histogram
g+geom_histogram(aes_string(x="Exp_weights",y="..count../sum(..count..)"),binwidth=.005)+ggtitle("Exposure weights")+labs(x=NULL, y="% records")
#Claims bar chart
geom_claims <- g+geom_bar(aes_string(x="Clm_Count",y="..prop..",weight="Exp_weights/sum(Exp_weights)"))+ggtitle("Claim counts")+labs(x=NULL, y="% exposure")
geom_claims
#Facet the claims bar chart by NCD
geom_claims+facet_grid(.~NCDCat)+ggtitle("Claim counts by NCDCat")

4) Build models

a) Split training and validation

set.seed(12345) # reproducible random numbers
df_sample_fraction <- 0.80
df_training_rows <- sample(1:nrow(df_process), size = nrow(df_process) * df_sample_fraction, replace = FALSE)
df_train <- df_process [df_training_rows,]
df_validation <- df_process [-df_training_rows,]

b) Fit the intercept model

model_intercept <- glm(Clm_Count~1, family=poisson(link=log), data=df_train, offset=log(Exp_weights), y=FALSE, model=FALSE)
tidy(model_intercept)
## # A tibble: 1 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    -1.99    0.0486     -41.0       0
paste("Check average model frequency",exp(model_intercept$coefficients[["(Intercept)"]]),"vs data average frequency",sum(df_train$Clm_Count)/sum(df_train$Exp_weights))
## [1] "Check average model frequency 0.136514036028494 vs data average frequency 0.136514036028435"
glance(model_intercept)
## # A tibble: 1 × 8
##   null.deviance df.null logLik   AIC   BIC deviance df.residual  nobs
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
## 1         2191.    5985 -1497. 2997. 3004.    2191.        5985  5986
head(augment(model_intercept))
## # A tibble: 6 × 8
##   Clm_Count `(offset)` .fitted .resid .std.resid      .hat .sigma     .cooksd
##       <int>      <dbl>   <dbl>  <dbl>      <dbl>     <dbl>  <dbl>       <dbl>
## 1         0     -0.363   -2.35 -0.436     -0.436 0.000224   0.605 0.0000213  
## 2         0     -1.79    -3.78 -0.214     -0.214 0.0000539  0.605 0.00000123 
## 3         0     -1.79    -3.78 -0.214     -0.214 0.0000539  0.605 0.00000123 
## 4         0     -1.38    -3.37 -0.262     -0.262 0.0000813  0.605 0.00000280 
## 5         0     -0.154   -2.15 -0.484     -0.484 0.000277   0.605 0.0000324  
## 6         0     -2.53    -4.52 -0.147     -0.147 0.0000256  0.605 0.000000278

c) Fit a model with all factors

model_full <- glm(Clm_Count~. - Frequency - Exp_weights, family=poisson(link=log), data=df_train, offset=log(Exp_weights), y=FALSE, model=FALSE)
tidy(model_full)
## # A tibble: 22 × 5
##    term        estimate std.error statistic     p.value
##    <chr>          <dbl>     <dbl>     <dbl>       <dbl>
##  1 (Intercept)  -12.1     296.      -0.0407  0.968     
##  2 SexInsuredM    0.211     0.171    1.23    0.217     
##  3 SexInsuredU   10.4     296.       0.0351  0.972     
##  4 Female1       NA        NA       NA      NA         
##  5 Private1      12.1     469.       0.0258  0.979     
##  6 NCDCat10      -0.374     0.142   -2.64    0.00840   
##  7 NCDCat20      -0.435     0.144   -3.03    0.00245   
##  8 NCDCat30      -0.406     0.215   -1.89    0.0589    
##  9 NCDCat40      -0.972     0.302   -3.22    0.00128   
## 10 NCDCat50      -0.697     0.157   -4.43    0.00000946
## # … with 12 more rows
glance(model_full)
## # A tibble: 1 × 8
##   null.deviance df.null logLik   AIC   BIC deviance df.residual  nobs
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
## 1         2191.    5985 -1444. 2930. 3071.    2084.        5965  5986

d) Use stepwise regression to add and remove factors

model_stepwise <- step(model_intercept, scope=list(lower=model_intercept, upper=model_full), direction = "both")
## Start:  AIC=2996.93
## Clm_Count ~ 1
## 
##              Df Deviance    AIC
## + VAgeCat     6   2122.1 2939.8
## + NCDCat      5   2163.6 2979.3
## + Private     1   2175.6 2983.3
## + SexInsured  2   2174.8 2984.5
## + AgeCat      6   2170.4 2988.1
## <none>            2191.2 2996.9
## + Female      1   2191.2 2998.9
## 
## Step:  AIC=2939.83
## Clm_Count ~ VAgeCat
## 
##              Df Deviance    AIC
## + NCDCat      5   2090.4 2918.1
## <none>            2122.1 2939.8
## + Female      1   2121.0 2940.7
## + Private     1   2122.1 2941.8
## + SexInsured  2   2121.0 2942.7
## + AgeCat      6   2116.5 2946.2
## - VAgeCat     6   2191.2 2996.9
## 
## Step:  AIC=2918.13
## Clm_Count ~ VAgeCat + NCDCat
## 
##              Df Deviance    AIC
## <none>            2090.4 2918.1
## + Female      1   2089.4 2919.1
## + Private     1   2089.4 2919.1
## + SexInsured  2   2088.1 2919.8
## + AgeCat      6   2086.8 2926.6
## - NCDCat      5   2122.1 2939.8
## - VAgeCat     6   2163.6 2979.3
tidy(model_stepwise)
## # A tibble: 12 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept) -1.50       0.0972  -15.4    8.84e-54
##  2 VAgeCat1     0.145      0.156     0.925  3.55e- 1
##  3 VAgeCat2     0.361      0.151     2.38   1.72e- 2
##  4 VAgeCat3     0.00372    0.167     0.0222 9.82e- 1
##  5 VAgeCat4    -0.498      0.181    -2.75   5.88e- 3
##  6 VAgeCat5    -1.18       0.216    -5.46   4.76e- 8
##  7 VAgeCat6    -1.59       0.584    -2.73   6.33e- 3
##  8 NCDCat10    -0.372      0.142    -2.63   8.64e- 3
##  9 NCDCat20    -0.439      0.144    -3.05   2.26e- 3
## 10 NCDCat30    -0.387      0.213    -1.82   6.95e- 2
## 11 NCDCat40    -0.955      0.301    -3.18   1.50e- 3
## 12 NCDCat50    -0.689      0.148    -4.65   3.31e- 6
glance(model_stepwise)
## # A tibble: 1 × 8
##   null.deviance df.null logLik   AIC   BIC deviance df.residual  nobs
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
## 1         2191.    5985 -1447. 2918. 2998.    2090.        5974  5986

5) Compare model outputs

a) Declare model objects into a tibble and calculate test statistics and objects

#Declare model objects into a tibble
model_table <- tibble(description = c("Intercept", "Full", "Stepwise"),
                      model = list(model_intercept, model_full, model_stepwise))
#Calculate validation statistics and objects
model_table <- model_table %>%
  mutate(Deviance_train  = model %>% map(glance) %>% map_dbl("deviance"),
         AIC             = model %>% map(glance) %>% map_dbl("AIC"),
         gini_train      = model %>% map_dbl(gini_score_train,                          actuals_input = df_train$Frequency),
         gini_validation = model %>% map_dbl(gini_score      ,df_input = df_validation, actuals_input = df_validation$Frequency)
         )
model_table
## # A tibble: 3 × 6
##   description model  Deviance_train   AIC gini_train gini_validation
##   <chr>       <list>          <dbl> <dbl>      <dbl>           <dbl>
## 1 Intercept   <glm>           2191. 2997.      0.280           0.262
## 2 Full        <glm>           2084. 2930.      0.403           0.346
## 3 Stepwise    <glm>           2090. 2918.      0.396           0.335

b) Determine factor significance

drop1(model_stepwise, test="Chisq")
## Single term deletions
## 
## Model:
## Clm_Count ~ VAgeCat + NCDCat
##         Df Deviance    AIC    LRT  Pr(>Chi)    
## <none>       2090.4 2918.1                     
## VAgeCat  6   2163.6 2979.3 73.213 8.949e-14 ***
## NCDCat   5   2122.1 2939.8 31.702 6.807e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6) Reduce the size of the glm object

Trimming the Fat from glm() Models in R

stripGlmLR = function(cm) {
  cm$y = c()
  cm$model = c()
  
  cm$residuals = c()
  cm$fitted.values = c()
  cm$effects = c()
  cm$qr$qr = c()  
  cm$linear.predictors = c()
  cm$weights = c()
  cm$prior.weights = c()
  cm$data = c()

  
  cm$family$variance = c()
  cm$family$dev.resids = c()
  cm$family$aic = c()
  cm$family$validmu = c()
  cm$family$simulate = c()
  attr(cm$terms,".Environment") = c()
  attr(cm$formula,".Environment") = c()
  
  cm
}
model_stepwise <- stripGlmLR(model_stepwise)