Summary

Mean squared prediction errors for the HDtweedie and TDboost models on the dataCar data, using the original features and the first five principal components.

HDtweedie CV_20: mean squared prediction errors

                     cv1       cv5       cv9      cv13      cv17      mean
Original Data    2377401   1373430   2136314   1532071   1102510   1440613
PCA Data         2378171   1374129   2137165   1532731   1103110   1441284

TDboost CV_20: mean squared prediction errors

                     cv1       cv5       cv9      cv13      cv17      mean
Original Data    2316359   1313726   2072233   1479964   1049088   1386597
PCA Data         2352115   1349163   2103491   1513364   1082238   1418921

(Only every fourth fold is shown; the "mean" column averages over all 20 folds.)

Explanation

The data used in the analysis is “dataCar” from the “insuranceData” package. The purpose of the analysis is to predict the auto insurance premium and to examine whether the results remain robust when the data is reduced by PCA. The analysis also compares the performance of the gradient-boosted Tweedie compound Poisson model (TDboost) with the grouped lasso Tweedie model (HDtweedie).

Output

1. Principal Component Analysis on “dataCar”.

Feature engineering.

data("dataCar")           # "insuranceData" package.
data<-dataCar[,-11]       # Remove ""X_OBSTAT_"" from the dataset.
data<-model.matrix(~ ., data=data)[,-1]
data<-as.data.frame(data)

data$insu.prem <- data$numclaims * data$claimcst0
data <- data %>%
        select(-one_of("claimcst0","numclaims"))
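
As a quick sanity check on the feature engineering, the sketch below verifies the shape of the design matrix (the expected count of 24 columns, i.e., 23 dummy-coded predictors plus insu.prem, is our reading of dataCar's structure, not output from the original analysis):

dim(data)                       # rows of dataCar by 24 columns (23 predictors + response)
stopifnot(ncol(data) == 24)     # matches the group vector used for HDtweedie below
summary(data$insu.prem)         # the response is zero for most policies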

Check correlations in the data using a heatmap

Before doing PCA, we can look at the correlations between the variables. As a rule of thumb, if most of the pairwise correlations are below 0.3 in absolute value, PCA is unlikely to compress the data effectively.

cor_df <- cor(data, method="pearson")   # pairwise Pearson correlations
corrplot(cor_df, order="hclust")        # heatmap with variables grouped by similarity
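
To make the 0.3 rule of thumb concrete, here is a small sketch (the threshold and the use of the upper triangle are the only assumptions) that counts the share of variable pairs with correlation above 0.3 in absolute value:

off_diag <- cor_df[upper.tri(cor_df)]   # one entry per unordered variable pair
mean(abs(off_diag) > 0.3)               # share of "strong" correlations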

We see that, setting aside the claim-related variables, the only strong correlation among the explanatory variables is the negative one between “veh_value” and “veh_age”.

Scree plot and the total variance explained by the five components.

Based on the scree plot, we will use 5 principal components; as the output below shows, they explain about 32% of the total variance.

pca <- prcomp(data, scale. = TRUE, center = TRUE)   # PCA on scaled, centered data
plot(pca$sdev^2, type="b", xlab="PC Number", ylab="Variance", main="Scree Plot")

paste0("Total variance explained by the first five PC is ",
       round( sum(pca$sdev[1:5]^2)/ sum(pca$sdev^2) ,4))
## [1] "Total variance explained by the first five PC is 0.3245"

Generate a function that returns the indices of the rows in the kth fold.

set.seed(22)
n  <- sample(1:nrow(data), nrow(data), replace=FALSE)   # shuffled row indices
kf <- 20                                                # number of folds
k_fold <- function(data, k)   # Function returns the indices of rows in the kth fold.
{
  fold_size <- floor( nrow(data)/kf )
  if (k > 0 && k < kf)
  {
    k_fold <- n[ (fold_size*(k-1)+1):(fold_size*k) ]
  } else if (k == kf) {
    # The last fold also absorbs the leftover rows when nrow(data) %% kf != 0.
    k_fold <- n[ (fold_size*(k-1)+1):nrow(data) ]
  } else {
    k_fold <- NA
  }
  return(k_fold)
}
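
Before using the folds, a quick check (a sketch added here, not part of the original pipeline) confirms that they partition the data, i.e., every row lands in exactly one fold:

all_folds <- unlist(lapply(1:kf, function(k) k_fold(data, k)))
stopifnot(length(all_folds) == nrow(data),   # every row is covered
          anyDuplicated(all_folds) == 0)     # and no row appears twice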

Train the HDtweedie model on the 5 principal components with 20-fold cross-validation.

Note that judging the model on a single train/test split can give a biased estimate of prediction error, so we use k-fold cross-validation instead.

reps <- 20
error.HD.X   <- numeric(reps)
error.HD.pca <- numeric(reps)

for (i in 1:reps)
{
  test_set  <- data[k_fold(data,i),]
  train_set <- data[-k_fold(data,i),]

  test_y  <- test_set  %>% select(insu.prem)
  test_X  <- test_set  %>% select(-one_of("insu.prem"))
  train_y <- train_set %>% select(insu.prem)
  train_X <- train_set %>% select(-one_of("insu.prem"))

  ## Jitter every entry slightly so zero/constant columns do not break
  ## the scaling step inside prcomp().
  test_X  <- as.matrix(test_X)  + rnorm(nrow(test_X)  * ncol(test_X),  0, 1e-7)
  train_X <- as.matrix(train_X) + rnorm(nrow(train_X) * ncol(train_X), 0, 1e-7)

  ## Fit PCA on the training data only (scaled and centered) and project the
  ## test data onto the same rotation, to avoid information leakage.
  pca_fit   <- prcomp(train_X, scale. = TRUE, center = TRUE)
  pca_train <- pca_fit$x[,1:5]
  pca_test  <- predict(pca_fit, newdata = test_X)[,1:5]

  test_y  <- as.matrix(test_y)
  train_y <- as.matrix(train_y)

  ## One group per original variable, so the dummy columns of veh_body (12)
  ## and area (5) are selected or dropped together by the grouped lasso.
  group1 <- c(1,2,3, rep(4,12), 5,6, rep(7,5), 8)

  HD.X   <- cv.HDtweedie(x = train_X, y = train_y, p = 1.5, group = group1,
                         alpha = 1, eps = 1e-4, nlambda = 50,
                         pred.loss = "mse", nfolds = 5)
  HD.pca <- cv.HDtweedie(x = pca_train, y = train_y,
                         lambda = c(1e-8, 1e-9),  ## effectively unpenalized
                         eps = 1e-4, pred.loss = "mse", nfolds = 5)

  ## Predict on the response scale so the errors are comparable to the
  ## observed premiums (and to the TDboost predictions below).
  pred.HD.X   <- predict(HD.X,   type = "response", newx = test_X)
  pred.HD.pca <- predict(HD.pca, type = "response", newx = pca_test, s = 1e-9)

  error.HD.X[i]   <- mean(( pred.HD.X   - test_y )^2)
  error.HD.pca[i] <- mean(( pred.HD.pca - test_y )^2)
}
mean.error.HD.X   <- mean(error.HD.X)
mean.error.HD.pca <- mean(error.HD.pca)
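
From these vectors, the HDtweedie rows of the summary table can be assembled; a minimal sketch (showing folds 1, 5, 9, 13, 17 simply to mirror the table above):

folds <- c(1, 5, 9, 13, 17)
HD_table <- rbind("Original Data" = c(error.HD.X[folds],   mean.error.HD.X),
                  "PCA Data"      = c(error.HD.pca[folds], mean.error.HD.pca))
colnames(HD_table) <- c(paste0("cv", folds), "mean")
round(HD_table)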

Train the TDboost model on the 5 principal components with 20-fold cross-validation.

set.seed(22)
reps <- 20
error.TDB.X   <- numeric(reps)
error.TDB.pca <- numeric(reps)

for (i in 1:reps)
{
  test_set  <- data[k_fold(data,i),]
  train_set <- data[-k_fold(data,i),]

  test_y  <- test_set  %>% select(insu.prem)
  test_X  <- test_set  %>% select(-one_of("insu.prem"))
  train_y <- train_set %>% select(insu.prem)
  train_X <- train_set %>% select(-one_of("insu.prem"))

  ## Jitter every entry slightly so zero/constant columns do not break
  ## the scaling step inside prcomp().
  test_X  <- as.matrix(test_X)  + rnorm(nrow(test_X)  * ncol(test_X),  0, 1e-7)
  train_X <- as.matrix(train_X) + rnorm(nrow(train_X) * ncol(train_X), 0, 1e-7)

  ## As above: fit PCA on the training data and project the test data
  ## onto the same rotation.
  pca_fit   <- prcomp(train_X, scale. = TRUE, center = TRUE)
  pca_train <- pca_fit$x[,1:5]
  pca_test  <- predict(pca_fit, newdata = test_X)[,1:5]

  test_data      <- data.frame(test_y,  test_X)
  train_data     <- data.frame(train_y, train_X)
  pca_train_data <- data.frame(train_y, pca_train)
  pca_test_data  <- data.frame(test_y,  pca_test)

  TDB.X <- TDboost( insu.prem ~ .,
                    distribution = list(name="EDM", alpha=1.5),  # Tweedie index
                    data = train_data,
                    n.trees = 100,
                    n.minobsinnode = 10,
                    shrinkage = 0.001,   # learning rate
                    bag.fraction = 0.5,
                    train.fraction = 1.0,
                    cv.folds = 5
                  )

  TDB.pca <- TDboost( insu.prem ~ .,
                      distribution = list(name="EDM", alpha=1.5),
                      data = pca_train_data,
                      n.trees = 100,
                      n.minobsinnode = 10,
                      shrinkage = 0.001, # learning rate
                      bag.fraction = 0.5,
                      train.fraction = 1.0,
                      cv.folds = 5
                    )

  pred.TDB.X   <- predict(TDB.X,   newdata = test_data,
                          n.trees = 100, type="response")
  pred.TDB.pca <- predict(TDB.pca, newdata = pca_test_data,
                          n.trees = 100, type="response")

  error.TDB.X[i]   <- mean(( pred.TDB.X   - test_y$insu.prem )^2)
  error.TDB.pca[i] <- mean(( pred.TDB.pca - test_y$insu.prem )^2)
}
mean.error.TDB.X   <- mean(error.TDB.X)
mean.error.TDB.pca <- mean(error.TDB.pca)
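
Finally, a short sketch (assuming the four error summaries above are in scope) that quantifies, in relative terms, how much accuracy each model loses when the original features are replaced by the 5 principal components:

rel_HD  <- (mean.error.HD.pca  - mean.error.HD.X)  / mean.error.HD.X
rel_TDB <- (mean.error.TDB.pca - mean.error.TDB.X) / mean.error.TDB.X
round(100 * c(HDtweedie = rel_HD, TDboost = rel_TDB), 2)   # percent increase in MSE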