Summary

Mean squared prediction errors for HDtweedie and TDboost model on Health Insurance data from United States Census Bureau and its 10 Principal components.

HDtweedie CV_20: Mean squared prediction errors
	cv1	cv5	cv9	cv13	cv17	mean
Original Data	6065175	6266148	5213060	4604486	5488610	12522153
num of selected Var	2	2	2	2	2	2
10 PCA	6063815	6267014	5220875	4613727	5489220	12526939
Note:
The dimension of data:78424 observation, 470 columns

TDBoost CV_20: Mean squared prediction errrors
	cv1	cv5	cv9	cv13	cv17	mean
Original Data	5203467	5274056	4369558	3833882	4655499	11616832
PCA Data	5277328	5394966	4465203	3950445	4765100	11728839
Note:
The dimension of data:78424 observation, 470 columns

Explanation

The data used in the analysis is “Health Insurance Data” United States Census Bureau(2015,2016,2017 and 2018). https://www.census.gov/data/datasets/time-series/demo/health-insurance/cps-asec-split-panel-tests.html

The purpose of the analysis is to predict the home insurance premium and examine whether the PCA can be as effective as the lasso penalization for sparse modeling. The study used HWteedie on 10 Principal components without any penalization and compared the mean squared prediction errors with lasso penalized Hdtweedie model.

The analysis showed that the lasso penalization in HDtweedie selected two variables among 440 columns and its prediction performed almost the same or even better than the performance of the same HDtweedie with 10 principal components but without the lasso penalization. Therefore, the result implies that the HDtweedie with lasso penalization is more effective than the Hdtweedie using PCA.

Also, the study examined whether the PCA can improve the prediction performance of gradient boosted tweedie compound poisson model(TDBoost). There was no sufficeint evidence to support that PCA improve the TDBoost’s prediction power.

The data after PCA is smaller than the original data, so the only case in which PCA improves TDBoost is when there is an overfitting caused by multicollinearity.

However, from the results of variable selections in HDtweedie above, it seems that there is not much of a relationship among variables in the insurance datasets.

The fact that a single variable was selected in HDtweedie for the data implies that not only there isn’t much of a relationship, but also most variable vectors do not contain much information about insurance premiums.

This means that, there is no significant multicollinearity, hence no overfitting problem in insurance datasets and that is why PCA does not improve TDBoost.

Output

Data featuring.

setwd("C:/R directory/Joonwon_tte6")
data15 <- read_sas("pppub15_par.sas7bdat")
data16 <- read_sas("pp-splitpanel-trad16.sas7bdat")
data17 <- read_sas("pp_splitpanel_trad17.sas7bdat")
data18 <- read_sas("pppub18early_18par.sas7bdat")
data<- rbind(data15,data16,data17,data18)

# data.copy <- data
data$Premium <- data$PHIP_VAL
data <- data[data$Premium>=0,] # Premium has to be positive
data<- data[,-c(1)]   ## remove id column

# make the data a numeric matrix.
data<- as.data.frame(data)
for (i in 1:dim(data)[2])
{
  data[,i]<- as.numeric(data[,i])
}

# remove constant columns to avoid singular matrix.
captures<- rep(0, dim(data)[2])  
for (i in 1:dim(data)[2])
{
  captures[i]<- ifelse( length(table(data[,i]))==1,1,0)
}
const.col <- which(captures==1) # constant columns
data<- data[,-const.col]
## setwd("C:/R directory/Joonwon_tte6")
## save.image(“C:/R directory/Joonwon_tte3/workspace.RData”)      work space
## saveRDS(data, file= "Joonwon_tte6.Rds")
## data <- readRDS(file = "Joonwon_tte6.Rds") 
## rmarkdown::pandoc_convert('Joonwon_tte3.html', output = 'Joonwon_tte3.pdf')

# Scree plot for PC selection.
par(mfrow=c(1,2))
pca = prcomp(data, scale. = T, center = T)
plot(pca$sdev^2, type="b", xlab="PC Number", ylab="Variance", main="Scree Plot")

paste0("Total variance explained by the first eight PC's is ",
       round( sum(pca$sdev[1:10]^2)/ sum(pca$sdev^2) ,4))

Generate a function that returns the indices of rows in kth fold.

set.seed(22)
n <- sample(1:nrow(data), nrow(data), replace=FALSE)
kf<- 20
k_fold <- function(data,k) # Function returns the indices of rows in kth fold.
{
  fold_size<- floor( nrow(data)/kf )
  if (k<20 & k>0)
  {
    k_fold <- n[ (fold_size*(k-1)+1):(fold_size*k)]
  }else if(k==20) {
   
    k_fold <- n[ (fold_size*(k-1)+1):nrow(data)]
  }
  else {
    k_fold <-NA
  }
  return(k_fold)   
}

Train the HDtweedie model on 10 Principal components through 20 folds cross validation.

Note that using only one subset of the data for training purposes can make the model biased. So we use k-fold cross validation method.

reps <- 20
error.HD.X <- numeric(20)
error.HD.pca <- numeric(20)
dim.red <- numeric(20)

# data <- data[,-24]  
for (i in 1:reps)
{
  test_set <-  data[k_fold(data,i),]
  train_set <- data[-k_fold(data,i),]
  
  test_y <- test_set %>%
            select(Premium)
  test_X <- test_set %>%
            select(-one_of("Premium"))
  train_y <- train_set %>%
            select(Premium)
  train_X <- train_set %>%
            select(-one_of("Premium"))
  
##  constant columns in train.
  
captures<- rep(0, dim(train_X)[2])
for (j in 1:dim(train_X)[2])
{
  captures[j]<- ifelse( length(table(train_X[,j]))==1,1,0)
}
const.train <- which(captures==1)

## constant columns in test_X.
  
captures<- rep(0, dim(test_X)[2])
for (j in 1:dim(test_X)[2])
  {
captures[j]<- ifelse( length(table(test_X[,j]))==1,1,0)
  }
const.test <- which(captures==1)

##  if remove.train is integer(0), then train_X[,-remove.train] returns 0 columns
# remove.train <- c(const.train)
# remove.test <- c(const.test)
remove.list <- c(const.test,const.train)
if (length(remove.list)>0)
{
  train_X <- train_X[, -remove.list]
  test_X <- test_X[,-remove.list]
}
else {
}
  
  ## PCA
  pca_test =  prcomp(test_X,  scale. = T, center = T) 
  pca_train = prcomp(train_X, scale. = T, center = T) 
  pca_test <-  as.matrix(pca_test$x[ ,1:10])
  pca_train <- as.matrix(pca_train$x[ ,1:10])
  
  test_y <- as.matrix(test_y)
  test_X <- as.matrix(test_X)
  train_y <- as.matrix(train_y)
  train_X <- as.matrix(train_X)
  
  HD.X   <-cv.HDtweedie(x= train_X, y=train_y, p=1.5, 
          alpha=1, eps= 0.0001, nlambda = 50, pred.loss="mse", nfolds=5)
  HD.pca <-cv.HDtweedie(x= pca_train, y=train_y,
                   nlambda = 2,lambda = abs(rnorm(2,0,1e-9)), 
                   ## almost non-penalized
                           eps= 0.0001, pred.loss="mse", nfolds=5, )  
## check sparse model  
lamb.min <- HD.X$lambda.min
lamb.pca <- HD.pca$lambda.min ##(nearly zero)
length(coef(HD.X, s=lamb.min)[coef(HD.X,s=lamb.min)>0])-1 

dim.red[i] <- length(coef(HD.X,s=lamb.min )[coef(HD.X,s=lamb.min)>0])-1

  pred.HD.X   <- predict(HD.X, type="link", newx = test_X, s=lamb.min)
  pred.HD.pca <- predict(HD.pca,type="link",newx= pca_test,s=lamb.pca)
  
  
  error.HD.X[i]    <-  mean(( pred.HD.X - test_y )^2)
  error.HD.pca[i]  <-  mean(( pred.HD.pca - test_y )^2)
}
mean.error.HD.X = mean(error.HD.X)
mean.error.HD.pca = mean(error.HD.pca)

cat("The number of selected variables in each folds are", dim.red)
error.HD.X
error.HD.pca
mean.error.HD.X
mean.error.HD.pca

Train the TDBoost model on 10 Principal components through 20 folds cross validation.

mean.error.TDB.X 
mean.error.TDB.pca 
error.TDB.X
error.TDB.pca

Joonwon_tte4

Joonwon Lee