Summary

Mean squared prediction errors for the HDtweedie and TDboost models on the life insurance data and on its principal components.

HDtweedie CV_20: Mean squared prediction errors

                     cv1        cv5        cv9       cv13      cv17       mean
Original Data   43937268   25708805  101999394   46674848   8871844   32303222
PCA Data        43935947   25708285  102006576   46673437   8871144   32304094

TDboost CV_20: Mean squared prediction errors

                     cv1        cv5        cv9       cv13      cv17       mean
Original Data   41908389   23920831   99462852   44648444   7473933   30432479
PCA Data        41966963   24038593   99814069   44753124   7511136   30568193

Explanation

The data used in the analysis is the “Life Insurance Policy Data” set from the Kaggle website: https://www.kaggle.com/blackclover1/life-insurance-policy-data

The purpose of the analysis is to predict the life insurance premium and to examine whether PCA can be as effective as lasso penalization for sparse modeling. The study fit HDtweedie on eight principal components without any penalization and compared the mean squared prediction errors with those of the lasso-penalized HDtweedie model on the original data. The results suggest that PCA is about as effective as lasso penalization.

The study also examined whether PCA can improve the performance of the gradient-boosted Tweedie compound Poisson model (TDboost). There was not sufficient evidence to support that claim.
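As an illustration of how such a comparison could be formalized, the per-fold prediction errors of the two TDboost fits (the vectors error.TDB.X and error.TDB.pca computed in the Output section below) could be compared with a paired test. This is a hedged sketch added for illustration, not part of the original analysis.

# Hedged sketch (not part of the original study): paired comparison of per-fold MSPEs.
# error.TDB.X and error.TDB.pca are the 20 per-fold errors computed in the Output section.
t.test(error.TDB.X, error.TDB.pca, paired = TRUE)       # difference in mean per-fold MSPE
wilcox.test(error.TDB.X, error.TDB.pca, paired = TRUE)  # rank-based alternative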

Output

Data preparation and feature engineering.

setwd("C:/R directory/Joonwon_tte2")
data <- read.csv("Kaggle.csv", sep=";", header=TRUE, fill=TRUE) 
data <- data[,-c(21,22)]      # Remove seemingly irrelevant variables.
data <- data %>%
        mutate (Issue.Date= substr(Issue.Date,1,3)) # Change date into a factor variable.

# Set variable types: monetary columns become numeric, categorical columns become factors.

data$Premium <- as.numeric(gsub(",","",data$Premium)) # Use gsub to replace "," with "".
data$BENEFIT <- as.numeric(gsub(",","",data$BENEFIT))

factor.list <- c(5,8,9,10,12,16,20)
for( i  in 1:length(factor.list))
{
  data[,factor.list[i]]<- as.factor( data[,factor.list[i]] )
}

Missing values.

The missing values in the categorical variables were filled by sampling a level at random from the empirical category frequencies (a multinomial draw over the observed levels). The missing values in the continuous variables were filled using the k-nearest neighbors algorithm.

categorical.ind  <- c(5,8,9,10,12,16,20)
continuous.ind   <- c(1,2,3,4,6,7,11,13,14,15,17,18,19)

# Fill missing values in categorical variables by sampling a level from the empirical category frequencies.

for (i in 1:length(categorical.ind))
{
  j<- categorical.ind[i]
  probs <- as.vector(table(na.omit(data[,j]))/length( na.omit(data[,j]) ) )
  missing.ind <- which(is.na(data[,j])==TRUE, arr.ind=TRUE)
  if (length(missing.ind)>0)
  { 
    for (k in 1:length(missing.ind))
    {
      # Draw one level index at random according to the empirical category frequencies.
      bin <- which( rmultinom(1,1,probs)==1, arr.ind=TRUE)[1]   
      data[missing.ind[k],j] <- levels(data[,j])[bin]
    }
  }
}
# Flag which continuous variables contain missing values.
mis.con <- rep(0,length(continuous.ind))
for (i in 1:length(continuous.ind))
{ 
  j <- continuous.ind[i]
  missing.ind <- which( is.na(data[,j])==TRUE)
  mis.con[i]<-ifelse( length(missing.ind)>0, 1,0)
  
}
cont.mis <-colnames(data[, c(continuous.ind[which(mis.con==1)])])
cat("The continuous variables containing missing values are",
    cont.mis[1], "and", cont.mis[2] )
# library(RANN)
knn <- preProcess( data, method= "knnImpute", k=10, na.remove = TRUE, knnSummary=mean)
mis <- c("BENEFIT", "Premium")
for (i in mis)
{
  missing.ind <- which( is.na(data[,i])==TRUE)
  pred<- predict(knn , data[missing.ind, ], na.action=na.pass) 
  for (k in 1:length(missing.ind))
  { 
    # knnImpute returns standardized values, so de-normalize back to the original scale.
    data[missing.ind[k], i]<- pred[k,i]*knn$std[i]+knn$mean[i]
    
  }
  
}
## saveRDS(data, file = "data.Rds")
## setwd("C:/R directory/Joonwon_tte2")
## data.copy <- readRDS(file = "data.Rds") 
data <- model.matrix(~ ., data=data)  # Expand factor variables into dummy columns.
data <- data[,-1]                     # Drop the intercept column.
data <- as.data.frame(data)
data <- data[data$Premium>=0,]        # Keep non-negative premiums; the Tweedie response must be non-negative.
data <- data[, -10]                   # data[,10] is a constant vector.
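A quick sanity check (added here as a hedged, illustrative sketch; it is not part of the original script) can confirm that no missing values remain and that no constant columns are left after the cleanup.

# Hedged sketch (illustrative addition): sanity checks after cleaning.
anyNA(data)                              # should be FALSE after imputation
names(data)[apply(data, 2, var) == 0]    # should be empty after dropping the constant column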

Scree plot and the total variance explained by the eight principal components.

Based on the scree plot, we use eight principal components, which explain roughly 40% of the total variance.

par(mfrow=c(1,2))
pca = prcomp(data, scale. = T, center = T)
plot(pca$sdev^2, type="b", xlab="PC Number", ylab="Variance", main="Scree Plot")

paste0("Total variance explained by the first eight PC's is ",
       round( sum(pca$sdev[1:8]^2)/ sum(pca$sdev^2) ,4))
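As a complementary check, the cumulative proportion of variance explained by successive components can be read directly from the prcomp object. This snippet is an illustrative addition, not part of the original script.

# Hedged sketch (illustrative addition): cumulative proportion of variance explained.
prop.var <- pca$sdev^2 / sum(pca$sdev^2)
round(cumsum(prop.var)[1:10], 3)   # cumulative share explained by the first ten PCs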

Define a function that returns the row indices of the kth fold. A quick check of the fold partition follows the function definition.

set.seed(22)
n <- sample(1:nrow(data), nrow(data), replace=FALSE)  # Random permutation of the row indices.
kf <- 20                                              # Number of folds.
k_fold <- function(data, k)  # Returns the row indices of the kth fold; uses the global n and kf.
{
  fold_size <- floor(nrow(data)/kf)
  if (k > 0 & k < kf)
  {
    k_fold <- n[(fold_size*(k-1)+1):(fold_size*k)]
  } else if (k == kf) {
    # The last fold also absorbs any leftover rows.
    k_fold <- n[(fold_size*(k-1)+1):nrow(data)]
  } else {
    k_fold <- NA
  }
  return(k_fold)
}
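A small check (added as an illustrative, hedged sketch; not part of the original script) confirms that the 20 folds partition all rows exactly once.

# Hedged sketch (illustrative addition): verify that the folds partition the rows.
all.folds <- unlist(lapply(1:kf, function(k) k_fold(data, k)))
length(all.folds) == nrow(data)   # TRUE if every row is assigned to a fold
anyDuplicated(all.folds) == 0     # TRUE if no row appears in more than one fold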

Train the HDtweedie model on 8 principal components using 20-fold cross-validation.

Relying on a single train/test split can give an unreliable estimate of the prediction error, so we use 20-fold cross-validation.

reps <- 20
error.HD.X <- numeric(20)
error.HD.pca <- numeric(20)
# data <- data[,-24]  
for (i in 1:reps)
{
  test_set <-  data[k_fold(data,i),]
  train_set <- data[-k_fold(data,i),]
  
  test_y <- test_set %>%
            select(Premium)
  test_X <- test_set %>%
            select(-one_of("Premium"))
  train_y <- train_set %>%
            select(Premium)
  train_X <- train_set %>%
            select(-one_of("Premium"))
  
  ## Add tiny noise to a near-constant column so that scaling inside prcomp does not fail.
  test_X[,24] <- test_X[,24] + rnorm(dim(test_X)[1],0, 1e-7)
  
  ## Note that data need to be scaled.
  pca_test =  prcomp(test_X,  scale. = T, center = T) 
  pca_train = prcomp(train_X, scale. = T, center = T) 
  pca_test <-  as.matrix(pca_test$x[,1:8])
  pca_train <- as.matrix(pca_train$x[,1:8])
  
  test_y <- as.matrix(test_y)
  test_X <- as.matrix(test_X)
  train_y <- as.matrix(train_y)
  train_X <- as.matrix(train_X)
  
  HD.X   <- cv.HDtweedie(x = train_X, y = train_y, p = 1.5,
                         alpha = 1, eps = 0.0001, nlambda = 50,
                         pred.loss = "mse", nfolds = 5)
  HD.pca <- cv.HDtweedie(x = pca_train, y = train_y,
                         nlambda = 2, lambda = abs(rnorm(2, 0, 1e-9)), ## essentially unpenalized
                         eps = 0.0001, pred.loss = "mse", nfolds = 5)
  pred.HD.X   <- predict(HD.X,type="link", newx = test_X)
  pred.HD.pca <- predict(HD.pca,type="link",newx= pca_test ,s=1e-7)
  error.HD.X[i]    <-  mean(( pred.HD.X - test_y )^2)
  error.HD.pca[i]  <-  mean(( pred.HD.pca - test_y )^2)
}
mean.error.HD.X = mean(error.HD.X)
mean.error.HD.pca = mean(error.HD.pca) 
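After the loop, the per-fold errors of the two HDtweedie fits can be compared directly. The snippet below is an illustrative, hedged addition (not part of the original script) reporting the per-fold differences and the relative difference in mean MSPE.

# Hedged sketch (illustrative addition): compare per-fold errors of the two HDtweedie fits.
summary(error.HD.X - error.HD.pca)                        # per-fold differences (original minus PCA)
(mean.error.HD.pca - mean.error.HD.X) / mean.error.HD.X   # relative difference in mean MSPE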

Train the TDboost model on 5 principal components using 20-fold cross-validation.

set.seed(22)
reps <- 20
error.TDB.X <- numeric(20)
error.TDB.pca <- numeric(20)

for (i in 1:reps)
{
  test_set <-  data[k_fold(data,i),]
  train_set <- data[-k_fold(data,i),]
  
  test_y <- test_set %>%
            select(Premium)
  test_X <- test_set %>%
            select(-one_of("Premium"))
  train_y <- train_set %>%
            select(Premium)
  train_X <- train_set %>%
            select(-one_of("Premium"))
  
  ## Add tiny noise to a near-constant column so that scaling inside prcomp does not fail.
  test_X[,24] <- test_X[,24] + rnorm(dim(test_X)[1],0, 1e-7)
  
  ## Note that data need to be scaled.
  pca_test = prcomp(test_X, scale. = T, center = T)
  pca_train = prcomp(train_X, scale. = T, center = T) 
  pca_test  <- as.matrix(pca_test$x[,1:5])
  pca_train <- as.matrix(pca_train$x[,1:5])
  
  test_data  <- cbind(test_y,test_X)
  train_data <- cbind(train_y,train_X)
  pca_train_data <- cbind(train_y,pca_train)
  pca_test_data  <- cbind(test_y,pca_test)

  TDB.X <- TDboost( Premium~., 
                        distribution = list(name="EDM",alpha=1.5),
                        data = train_data,
                        n.trees = 100,
                        n.minobsinnode = 10,
                        shrinkage = 0.001, # learning rate
                        bag.fraction = 0.5,
                        train.fraction = 1.0,
                        cv.folds=5
                  )
                        
  
  
  TDB.pca <- TDboost( Premium~., 
                        distribution = list(name="EDM",alpha=1.5),
                        data =  pca_train_data,
                        n.trees = 100,
                        n.minobsinnode = 10,
                        shrinkage = 0.001, # learning rate
                        bag.fraction = 0.5,
                        train.fraction = 1.0,
                        cv.folds=5
                  )
                         
  pred.TDB.X   <- predict(TDB.X, newdata = test_data,
                          n.trees = 100, type="response")
  pred.TDB.pca <- predict(TDB.pca, newdata = pca_test_data, 
                          n.trees = 100, type="response")
  error.TDB.X[i]    <-  mean(( pred.TDB.X - as.numeric(as.matrix(test_y)) )^2)
  error.TDB.pca[i]  <-  mean( ( pred.TDB.pca - as.numeric(as.matrix(test_y)) )^2 )
}
mean.error.TDB.X = mean(error.TDB.X)
mean.error.TDB.pca = mean(error.TDB.pca) 
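To reproduce the summary table at the top of this report, the per-fold errors and their means can be collected as below. This is an illustrative, hedged sketch (not part of the original script); it assumes the summary reports folds 1, 5, 9, 13, and 17 together with the mean over all 20 folds.

# Hedged sketch (illustrative addition): assemble the summary table shown at the top.
# Assumes the table reports folds 1, 5, 9, 13, 17 and the mean over all 20 folds.
folds <- c(1, 5, 9, 13, 17)
mspe <- rbind(
  "HDtweedie Original Data" = c(error.HD.X[folds],    mean.error.HD.X),
  "HDtweedie PCA Data"      = c(error.HD.pca[folds],  mean.error.HD.pca),
  "TDboost Original Data"   = c(error.TDB.X[folds],   mean.error.TDB.X),
  "TDboost PCA Data"        = c(error.TDB.pca[folds], mean.error.TDB.pca)
)
colnames(mspe) <- c("cv1", "cv5", "cv9", "cv13", "cv17", "mean")
round(mspe)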


## setwd("C:/R directory/Joonwon_tte2")
## save.image("C:/R directory/Joonwon_tte2/workspace.RData")   # Save the workspace.
## data.copy <- readRDS(file = "TDB.X.Rds") 
## rmarkdown::pandoc_convert('Joonwon_tte2.html', output = 'Joonwon_tte2.pdf')