Requirement
Most general insurers use Generalised/Generalized Linear Models (GLMs) to develop frequency, severity, and propensity models for insurance pricing. Traditionally, actuaries have built these models in Emblem or SAS. The examples here are coded in R and are based on a presentation made to the Institute and Faculty of Actuaries. The implementation uses the glm function in R.
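Before working through the dataset below, the general shape of these three model types in glm is sketched here on a small simulated dataset. Everything in this snippet (toy, vehicle_age, avg_cost and so on) is a hypothetical illustration of the usual family, link, offset and weight choices, not part of the worked example that follows.
# Hypothetical sketch: typical glm() calls for frequency, severity and propensity models
set.seed(1)
toy <- data.frame(
  exposure    = runif(500, 0.1, 1),                       # fraction of the year on risk
  vehicle_age = factor(sample(0:3, 500, replace = TRUE))  # one illustrative rating factor
)
toy$claim_count <- rpois(500, lambda = 0.2 * toy$exposure)
# Frequency: Poisson with a log link and a log-exposure offset
glm(claim_count ~ vehicle_age, family = poisson(link = log), offset = log(exposure), data = toy)
# Severity: Gamma with a log link on claim-bearing rows, weighted by claim count
toy_claims <- subset(toy, claim_count > 0)
toy_claims$avg_cost <- rgamma(nrow(toy_claims), shape = 2, rate = 0.001)
glm(avg_cost ~ vehicle_age, family = Gamma(link = log), weights = claim_count, data = toy_claims)
# Propensity (e.g. probability of making a claim): binomial with a logit link
glm(I(claim_count > 0) ~ vehicle_age, family = binomial(link = logit), data = toy)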
Suggested references
- CAS Monograph No. 5: Generalized Linear Models for Insurance Rating, 2nd Edition
- Trimming the Fat from glm() Models in R - The glm function in R is not particularly memory efficient; this article describes some modifications that can be used to reduce memory usage.
- Using PROC GENMOD to find a fair house insurance rate for the Norwegian market
Required packages
- broom - Convert Statistical Objects into Tidy Tibbles
- dplyr - A Grammar of Data Manipulation. dplyr is used in lieu of data.table as it is easier to introduce to new users of R.
- ggplot2 - Create Elegant Data Visualisations Using the Grammar of Graphics. Used for the exploratory charts below.
- Hmisc - Harrell Miscellaneous. Functions from this package are used to determine the Gini score from the fitted models.
- insuranceData - A Collection of Insurance Datasets Useful in Risk Classification in Non-life Insurance
- purrr - Functional Programming Tools
Inputs
- SingaporeAuto - A dataset from the insuranceData package containing a number of rating factors
Outputs
- df_process - The processed data
- g+geom_histogram - Exposure histogram to validate that exposures are calculated monthly
- geom_claims - Claims distribution chart to check the distribution of exposures with claims
- geom_claims+facet_grid - Claims count by No Claims Discount (NCD) to validate that customers with NCD=0 are more likely to have a claim
- model_intercept - Frequency intercept model with no factors
- model_full - Frequency model with all factors included
- model_stepwise - Frequency model with stepwise regression
- model_table - Comparison table of model performance on training and validation data
- drop1(model_stepwise, test="Chisq") - Chi-squared test of factor significance using single deletions, similar to SAS Type III
Load packages and functions
library(broom)
library(dplyr)
library(ggplot2)
library(Hmisc)
library(insuranceData)
library(purrr)
#Return the Gini score (Somers' Dxy rank correlation)
#model_input = the model object
#df_input = the dataframe which is used to score the model
#actuals_input = the vector containing the actual values for comparison
gini_score <- function(model_input, df_input, actuals_input){
  return(rcorr.cens(predict(model_input, newdata = df_input, type = "response"), actuals_input)[["Dxy"]])
}
#Special case of the above which uses the fitted values from the model object for the Gini score.
#Note that fitted.values() on a glm is already on the response scale; Dxy is rank-based,
#so any monotone transformation of the fitted values would give the same score.
gini_score_train <- function(model_input, actuals_input){
  return(rcorr.cens(fitted.values(model_input), actuals_input)[["Dxy"]])
}
1) Acquire data
# Import the data and move it into a tibble (insuranceData was loaded above)
data("SingaporeAuto")
df <- as_tibble(SingaporeAuto)
rm(SingaporeAuto)
2) Process data
# SingaporeAuto Data Descriptions
# http://instruction.bus.wisc.edu/jfrees/jfreesbooks/Regression%20Modeling/BookWebDec2010/DataDescriptions.pdf
#
# SexInsured Gender of insured, including male (M), female (F) and unspecified (U)
# Female =1 if female, =0 otherwise
# VehicleType The type of vehicle being insured, such as automobile (A), truck (T), and motorcycle (M)
# PC =1 if private vehicle, =0 otherwise
# Clm Count Number of claims during the year
# Exp weights Exposure weight or the fraction of the year that the policy is in effect
# LNWEIGHT Logarithm of exposure weight
# NCD No Claims Discount. This is based on the previous accident record of the policyholder.
# The higher the discount, the better the prior accident record.
# AgeCat The age of the policyholder, in years grouped into seven categories.
# 0-6 indicate age groups 21 and younger, 22-25, 26-35, 36-45, 46-55, 56-65, 66 and over, respectively
# VAgeCat The age of the vehicle, in years, grouped into seven categories.
# 0-6 indicate groups 0, 1, 2, 3-5, 6-10, 11-15, 16 and older, respectively
# AutoAge0 =1 if private vehicle and VAgeCat = 0, =0 otherwise
# AutoAge1 =1 if private vehicle and VAgeCat = 1, =0 otherwise
# AutoAge2 =1 if private vehicle and VAgeCat = 2, =0 otherwise
# AutoAge =1 if Private vehicle and VAgeCat = 0, 1 or 2, =0 otherwise
# VAgecat1 VAgeCat with categories 0, 1, and 2 combined
# process and clean the data
df_process <- df %>%
transmute(SexInsured=factor(SexInsured),
Female=factor(Female),
Private=factor(PC),
NCDCat=factor(NCD),
AgeCat=factor(AgeCat),
VAgeCat=factor(VAgeCat),
Exp_weights,
Clm_Count,
Frequency = Clm_Count/Exp_weights
)
3) Explore and visualise
a) Summary of the variables in the dataset
summary(df_process)
## SexInsured Female Private NCDCat AgeCat VAgeCat Exp_weights
## F: 700 0:6783 0:3641 0 :2010 0:3640 0:3079 Min. :0.005476
## M:3145 1: 700 1:3842 10:1458 2: 141 1: 681 1st Qu.:0.279261
## U:3638 20:1836 3:1476 2: 646 Median :0.503764
## 30: 411 4:1516 3: 771 Mean :0.519859
## 40: 349 5: 536 4: 924 3rd Qu.:0.752909
## 50:1419 6: 157 5:1164 Max. :1.000000
## 7: 17 6: 218
## Clm_Count Frequency
## Min. :0.00000 Min. : 0.0000
## 1st Qu.:0.00000 1st Qu.: 0.0000
## Median :0.00000 Median : 0.0000
## Mean :0.06989 Mean : 0.1363
## 3rd Qu.:0.00000 3rd Qu.: 0.0000
## Max. :3.00000 Max. :22.8281
##
b) Graphs for exposures and claims
#Define the base chart parameters
g <- ggplot(data = df_process) + scale_y_continuous(labels = scales::percent)
#Exposure histogram
g + geom_histogram(aes(x = Exp_weights, y = after_stat(count / sum(count))), binwidth = 0.005) + ggtitle("Exposure weights") + labs(x = NULL, y = "% records")
#Claims bar chart
geom_claims <- g + geom_bar(aes(x = Clm_Count, y = after_stat(prop), weight = Exp_weights / sum(Exp_weights))) + ggtitle("Claim counts") + labs(x = NULL, y = "% exposure")
geom_claims
#Facet the claims bar chart by NCD
geom_claims + facet_grid(. ~ NCDCat) + ggtitle("Claim counts by NCDCat")
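The faceted chart can be cross-checked with a one-way table of observed frequency by NCD level. This is a sketch using only the columns already created in df_process; its output is not reproduced here.
# Observed claim frequency by NCD level: total claims divided by total exposure
df_process %>%
  group_by(NCDCat) %>%
  summarise(exposure  = sum(Exp_weights),
            claims    = sum(Clm_Count),
            frequency = claims / exposure) %>%
  arrange(desc(frequency))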
4) Build models
a) Split training and validation
set.seed(12345) # reproducible random numbers
df_sample_fraction <- 0.80
df_training_rows <- sample(1:nrow(df_process), size = nrow(df_process) * df_sample_fraction, replace = FALSE)
df_train <- df_process[df_training_rows, ]
df_validation <- df_process[-df_training_rows, ]
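A quick sanity check on the split (a sketch; its output is not reproduced here) compares row counts, exposure and average frequency between the two sets.
# Compare the training and validation sets after the split
tibble(dataset   = c("train", "validation"),
       rows      = c(nrow(df_train), nrow(df_validation)),
       exposure  = c(sum(df_train$Exp_weights), sum(df_validation$Exp_weights)),
       frequency = c(sum(df_train$Clm_Count) / sum(df_train$Exp_weights),
                     sum(df_validation$Clm_Count) / sum(df_validation$Exp_weights)))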
b) Fit the intercept model
model_intercept <- glm(Clm_Count~1, family=poisson(link=log), data=df_train, offset=log(Exp_weights), y=FALSE, model=FALSE)
tidy(model_intercept)
## # A tibble: 1 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.99 0.0486 -41.0 0
paste("Check average model frequency",exp(model_intercept$coefficients[["(Intercept)"]]),"vs data average frequency",sum(df_train$Clm_Count)/sum(df_train$Exp_weights))
## [1] "Check average model frequency 0.136514036028494 vs data average frequency 0.136514036028435"
glance(model_intercept)
## # A tibble: 1 × 8
## null.deviance df.null logLik AIC BIC deviance df.residual nobs
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 2191. 5985 -1497. 2997. 3004. 2191. 5985 5986
head(augment(model_intercept))
## # A tibble: 6 × 8
## Clm_Count `(offset)` .fitted .resid .std.resid .hat .sigma .cooksd
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 -0.363 -2.35 -0.436 -0.436 0.000224 0.605 0.0000213
## 2 0 -1.79 -3.78 -0.214 -0.214 0.0000539 0.605 0.00000123
## 3 0 -1.79 -3.78 -0.214 -0.214 0.0000539 0.605 0.00000123
## 4 0 -1.38 -3.37 -0.262 -0.262 0.0000813 0.605 0.00000280
## 5 0 -0.154 -2.15 -0.484 -0.484 0.000277 0.605 0.0000324
## 6 0 -2.53 -4.52 -0.147 -0.147 0.0000256 0.605 0.000000278
c) Fit a model with all factors
model_full <- glm(Clm_Count~. - Frequency - Exp_weights, family=poisson(link=log), data=df_train, offset=log(Exp_weights), y=FALSE, model=FALSE)
tidy(model_full)
## # A tibble: 22 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -12.1 296. -0.0407 0.968
## 2 SexInsuredM 0.211 0.171 1.23 0.217
## 3 SexInsuredU 10.4 296. 0.0351 0.972
## 4 Female1 NA NA NA NA
## 5 Private1 12.1 469. 0.0258 0.979
## 6 NCDCat10 -0.374 0.142 -2.64 0.00840
## 7 NCDCat20 -0.435 0.144 -3.03 0.00245
## 8 NCDCat30 -0.406 0.215 -1.89 0.0589
## 9 NCDCat40 -0.972 0.302 -3.22 0.00128
## 10 NCDCat50 -0.697 0.157 -4.43 0.00000946
## # … with 12 more rows
glance(model_full)
## # A tibble: 1 × 8
## null.deviance df.null logLik AIC BIC deviance df.residual nobs
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 2191. 5985 -1444. 2930. 3071. 2084. 5965 5986
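The NA estimate for Female1 arises because Female is a recoding of SexInsured (see the data descriptions above), so the term is aliased in the full model. This can be confirmed with base R; output not shown here.
# Female is fully determined by SexInsured, hence the NA (aliased) coefficient
table(df_train$SexInsured, df_train$Female)
# alias() lists the exact linear dependencies among the model terms
alias(model_full)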
d) Use stepwise regression to add and remove factors
model_stepwise <- step(model_intercept, scope=list(lower=model_intercept, upper=model_full), direction = "both")
## Start: AIC=2996.93
## Clm_Count ~ 1
##
## Df Deviance AIC
## + VAgeCat 6 2122.1 2939.8
## + NCDCat 5 2163.6 2979.3
## + Private 1 2175.6 2983.3
## + SexInsured 2 2174.8 2984.5
## + AgeCat 6 2170.4 2988.1
## <none> 2191.2 2996.9
## + Female 1 2191.2 2998.9
##
## Step: AIC=2939.83
## Clm_Count ~ VAgeCat
##
## Df Deviance AIC
## + NCDCat 5 2090.4 2918.1
## <none> 2122.1 2939.8
## + Female 1 2121.0 2940.7
## + Private 1 2122.1 2941.8
## + SexInsured 2 2121.0 2942.7
## + AgeCat 6 2116.5 2946.2
## - VAgeCat 6 2191.2 2996.9
##
## Step: AIC=2918.13
## Clm_Count ~ VAgeCat + NCDCat
##
## Df Deviance AIC
## <none> 2090.4 2918.1
## + Female 1 2089.4 2919.1
## + Private 1 2089.4 2919.1
## + SexInsured 2 2088.1 2919.8
## + AgeCat 6 2086.8 2926.6
## - NCDCat 5 2122.1 2939.8
## - VAgeCat 6 2163.6 2979.3
tidy(model_stepwise)
## # A tibble: 12 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.50 0.0972 -15.4 8.84e-54
## 2 VAgeCat1 0.145 0.156 0.925 3.55e- 1
## 3 VAgeCat2 0.361 0.151 2.38 1.72e- 2
## 4 VAgeCat3 0.00372 0.167 0.0222 9.82e- 1
## 5 VAgeCat4 -0.498 0.181 -2.75 5.88e- 3
## 6 VAgeCat5 -1.18 0.216 -5.46 4.76e- 8
## 7 VAgeCat6 -1.59 0.584 -2.73 6.33e- 3
## 8 NCDCat10 -0.372 0.142 -2.63 8.64e- 3
## 9 NCDCat20 -0.439 0.144 -3.05 2.26e- 3
## 10 NCDCat30 -0.387 0.213 -1.82 6.95e- 2
## 11 NCDCat40 -0.955 0.301 -3.18 1.50e- 3
## 12 NCDCat50 -0.689 0.148 -4.65 3.31e- 6
glance(model_stepwise)
## # A tibble: 1 × 8
## null.deviance df.null logLik AIC BIC deviance df.residual nobs
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 2191. 5985 -1447. 2918. 2998. 2090. 5974 5986
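Because the model uses a log link, exponentiating the coefficients gives the multiplicative frequency relativities for each level against the base levels (VAgeCat 0 and NCDCat 0). A short sketch using broom and dplyr:
# Convert log-link coefficients into multiplicative relativities
tidy(model_stepwise) %>%
  mutate(relativity = exp(estimate)) %>%
  select(term, estimate, relativity)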
5) Compare model outputs
a) Declare model objects into a tibble and calculate test statistics and objects
#Declare model objects into a tibble
model_table <- tibble(description = c("Intercept", "Full", "Stepwise"),
model = list(model_intercept, model_full, model_stepwise))
#Calculate validation statistics and objects
model_table <- model_table %>%
mutate(Deviance_train = model %>% map(glance) %>% map_dbl("deviance"),
AIC = model %>% map(glance) %>% map_dbl("AIC"),
gini_train = model %>% map_dbl(gini_score_train, actuals_input = df_train$Frequency),
gini_validation = model %>% map_dbl(gini_score ,df_input = df_validation, actuals_input = df_validation$Frequency)
)
model_table
## # A tibble: 3 × 6
## description model Deviance_train AIC gini_train gini_validation
## <chr> <list> <dbl> <dbl> <dbl> <dbl>
## 1 Intercept <glm> 2191. 2997. 0.280 0.262
## 2 Full <glm> 2084. 2930. 0.403 0.346
## 3 Stepwise <glm> 2090. 2918. 0.396 0.335
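The Gini scores rank the models but say nothing about calibration, so a simple actual-versus-expected check on the validation data is a useful supplement. In this sketch Exp_weights is set to 1 in the scoring data, so the prediction is a frequency per exposure-year which is then scaled by each record's actual exposure; output not shown.
# Actual vs expected claim counts on the validation data, one-way by NCD level
df_unit <- mutate(df_validation, Exp_weights = 1)
df_validation %>%
  mutate(expected = predict(model_stepwise, newdata = df_unit, type = "response") * Exp_weights) %>%
  group_by(NCDCat) %>%
  summarise(exposure = sum(Exp_weights),
            actual   = sum(Clm_Count),
            expected = sum(expected)) %>%
  mutate(actual_vs_expected = actual / expected)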
b) Determine factor significance
drop1(model_stepwise, test="Chisq")
## Single term deletions
##
## Model:
## Clm_Count ~ VAgeCat + NCDCat
## Df Deviance AIC LRT Pr(>Chi)
## <none> 2090.4 2918.1
## VAgeCat 6 2163.6 2979.3 73.213 8.949e-14 ***
## NCDCat 5 2122.1 2939.8 31.702 6.807e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
6) Reduce the size of the glm object
This follows the approach described in Trimming the Fat from glm() Models in R (see Suggested references).
# Remove components of a fitted glm object that are not needed for prediction
stripGlmLR = function(cm) {
  # drop the stored data, residuals and fitted values
  cm$y = c()
  cm$model = c()
  cm$residuals = c()
  cm$fitted.values = c()
  cm$effects = c()
  cm$qr$qr = c()
  cm$linear.predictors = c()
  cm$weights = c()
  cm$prior.weights = c()
  cm$data = c()
  # drop family functions that are only needed during fitting
  cm$family$variance = c()
  cm$family$dev.resids = c()
  cm$family$aic = c()
  cm$family$validmu = c()
  cm$family$simulate = c()
  # drop the environments captured by the terms and formula objects
  attr(cm$terms, ".Environment") = c()
  attr(cm$formula, ".Environment") = c()
  cm
}
model_stepwise <- stripGlmLR(model_stepwise)
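The stripped object keeps the coefficients, terms, contrasts, factor levels and the inverse link, so it should still score new data. A rough check (the exact size varies by session, so no output is shown):
# Size of the stripped model object
format(object.size(model_stepwise), units = "Kb")
# The stripped model can still be used to score new data
head(predict(model_stepwise, newdata = df_validation, type = "response"))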