Summary

Mean squared prediction errors for HDtweedie and TDboost model on Health Insurance data from United States Census Bureau and its 10 Principal components.

HDtweedie CV_20: Mean squared prediction errors
cv1 cv5 cv9 cv13 cv17 mean
Original Data 6065175 6266148 5213060 4604486 5488610 12522153
num of selected Var 2 2 2 2 2 2
10 PCA 6063815 6267014 5220875 4613727 5489220 12526939
Note:
The dimension of data:78424 observation, 470 columns
TDBoost CV_20: Mean squared prediction errrors
cv1 cv5 cv9 cv13 cv17 mean
Original Data 5203467 5274056 4369558 3833882 4655499 11616832
PCA Data 5277328 5394966 4465203 3950445 4765100 11728839
Note:
The dimension of data:78424 observation, 470 columns

Explanation

The data used in the analysis is “Health Insurance Data” United States Census Bureau(2015,2016,2017 and 2018). https://www.census.gov/data/datasets/time-series/demo/health-insurance/cps-asec-split-panel-tests.html

The purpose of the analysis is to predict the home insurance premium and examine whether the PCA can be as effective as the lasso penalization for sparse modeling. The study used HWteedie on 10 Principal components without any penalization and compared the mean squared prediction errors with lasso penalized Hdtweedie model.

The analysis showed that the lasso penalization in HDtweedie selected two variables among 440 columns and its prediction performed almost the same or even better than the performance of the same HDtweedie with 10 principal components but without the lasso penalization. Therefore, the result implies that the HDtweedie with lasso penalization is more effective than the Hdtweedie using PCA.

Also, the study examined whether the PCA can improve the prediction performance of gradient boosted tweedie compound poisson model(TDBoost). There was no sufficeint evidence to support that PCA improve the TDBoost’s prediction power.

The data after PCA is smaller than the original data, so the only case in which PCA improves TDBoost is when there is an overfitting caused by multicollinearity.

However, from the results of variable selections in HDtweedie above, it seems that there is not much of a relationship among variables in the insurance datasets.

The fact that a single variable was selected in HDtweedie for the data implies that not only there isn’t much of a relationship, but also most variable vectors do not contain much information about insurance premiums.

This means that, there is no significant multicollinearity, hence no overfitting problem in insurance datasets and that is why PCA does not improve TDBoost.

Output

Data featuring.

setwd("C:/R directory/Joonwon_tte6")
data15 <- read_sas("pppub15_par.sas7bdat")
data16 <- read_sas("pp-splitpanel-trad16.sas7bdat")
data17 <- read_sas("pp_splitpanel_trad17.sas7bdat")
data18 <- read_sas("pppub18early_18par.sas7bdat")
data<- rbind(data15,data16,data17,data18)
# data.copy <- data
data$Premium <- data$PHIP_VAL
data <- data[data$Premium>=0,] # Premium has to be positive
data<- data[,-c(1)]   ## remove id column
# make the data a numeric matrix.
data<- as.data.frame(data)
for (i in 1:dim(data)[2])
{
  data[,i]<- as.numeric(data[,i])
}
# remove constant columns to avoid singular matrix.
captures<- rep(0, dim(data)[2])  
for (i in 1:dim(data)[2])
{
  captures[i]<- ifelse( length(table(data[,i]))==1,1,0)
}
const.col <- which(captures==1) # constant columns
data<- data[,-const.col]
## setwd("C:/R directory/Joonwon_tte6")
## save.image(“C:/R directory/Joonwon_tte3/workspace.RData”)      work space
## saveRDS(data, file= "Joonwon_tte6.Rds")
## data <- readRDS(file = "Joonwon_tte6.Rds") 
## rmarkdown::pandoc_convert('Joonwon_tte3.html', output = 'Joonwon_tte3.pdf')
# Scree plot for PC selection.
par(mfrow=c(1,2))
pca = prcomp(data, scale. = T, center = T)
plot(pca$sdev^2, type="b", xlab="PC Number", ylab="Variance", main="Scree Plot")

paste0("Total variance explained by the first eight PC's is ",
       round( sum(pca$sdev[1:10]^2)/ sum(pca$sdev^2) ,4))

Generate a function that returns the indices of rows in kth fold.

set.seed(22)
n <- sample(1:nrow(data), nrow(data), replace=FALSE)
kf<- 20
k_fold <- function(data,k) # Function returns the indices of rows in kth fold.
{
  fold_size<- floor( nrow(data)/kf )
  if (k<20 & k>0)
  {
    k_fold <- n[ (fold_size*(k-1)+1):(fold_size*k)]
  }else if(k==20) {
   
    k_fold <- n[ (fold_size*(k-1)+1):nrow(data)]
  }
  else {
    k_fold <-NA
  }
  return(k_fold)   
}

Train the HDtweedie model on 10 Principal components through 20 folds cross validation.

Note that using only one subset of the data for training purposes can make the model biased. So we use k-fold cross validation method.

reps <- 20
error.HD.X <- numeric(20)
error.HD.pca <- numeric(20)
dim.red <- numeric(20)

# data <- data[,-24]  
for (i in 1:reps)
{
  test_set <-  data[k_fold(data,i),]
  train_set <- data[-k_fold(data,i),]
  
  test_y <- test_set %>%
            select(Premium)
  test_X <- test_set %>%
            select(-one_of("Premium"))
  train_y <- train_set %>%
            select(Premium)
  train_X <- train_set %>%
            select(-one_of("Premium"))
  
##  constant columns in train.
  
captures<- rep(0, dim(train_X)[2])
for (j in 1:dim(train_X)[2])
{
  captures[j]<- ifelse( length(table(train_X[,j]))==1,1,0)
}
const.train <- which(captures==1)

## constant columns in test_X.
  
captures<- rep(0, dim(test_X)[2])
for (j in 1:dim(test_X)[2])
  {
captures[j]<- ifelse( length(table(test_X[,j]))==1,1,0)
  }
const.test <- which(captures==1)

##  if remove.train is integer(0), then train_X[,-remove.train] returns 0 columns
# remove.train <- c(const.train)
# remove.test <- c(const.test)
remove.list <- c(const.test,const.train)
if (length(remove.list)>0)
{
  train_X <- train_X[, -remove.list]
  test_X <- test_X[,-remove.list]
}
else {
}
  
  ## PCA
  pca_test =  prcomp(test_X,  scale. = T, center = T) 
  pca_train = prcomp(train_X, scale. = T, center = T) 
  pca_test <-  as.matrix(pca_test$x[ ,1:10])
  pca_train <- as.matrix(pca_train$x[ ,1:10])
  
  test_y <- as.matrix(test_y)
  test_X <- as.matrix(test_X)
  train_y <- as.matrix(train_y)
  train_X <- as.matrix(train_X)
  
  HD.X   <-cv.HDtweedie(x= train_X, y=train_y, p=1.5, 
          alpha=1, eps= 0.0001, nlambda = 50, pred.loss="mse", nfolds=5)
  HD.pca <-cv.HDtweedie(x= pca_train, y=train_y,
                   nlambda = 2,lambda = abs(rnorm(2,0,1e-9)), 
                   ## almost non-penalized
                           eps= 0.0001, pred.loss="mse", nfolds=5, )  
## check sparse model  
lamb.min <- HD.X$lambda.min
lamb.pca <- HD.pca$lambda.min ##(nearly zero)
length(coef(HD.X, s=lamb.min)[coef(HD.X,s=lamb.min)>0])-1 

dim.red[i] <- length(coef(HD.X,s=lamb.min )[coef(HD.X,s=lamb.min)>0])-1

  pred.HD.X   <- predict(HD.X, type="link", newx = test_X, s=lamb.min)
  pred.HD.pca <- predict(HD.pca,type="link",newx= pca_test,s=lamb.pca)
  
  
  error.HD.X[i]    <-  mean(( pred.HD.X - test_y )^2)
  error.HD.pca[i]  <-  mean(( pred.HD.pca - test_y )^2)
}
mean.error.HD.X = mean(error.HD.X)
mean.error.HD.pca = mean(error.HD.pca) 
cat("The number of selected variables in each folds are", dim.red)
error.HD.X
error.HD.pca
mean.error.HD.X
mean.error.HD.pca

Train the TDBoost model on 10 Principal components through 20 folds cross validation.

mean.error.TDB.X 
mean.error.TDB.pca 
error.TDB.X
error.TDB.pca