Mean squared prediction errors for the HDtweedie and TDboost models on the Life Insurance data and its principal components.
HDtweedie:

|  | cv1 | cv5 | cv9 | cv13 | cv17 | mean |
|---|---|---|---|---|---|---|
| Original Data | 43937268 | 25708805 | 101999394 | 46674848 | 8871844 | 32303222 |
| PCA Data | 43935947 | 25708285 | 102006576 | 46673437 | 8871144 | 32304094 |

TDboost:

|  | cv1 | cv5 | cv9 | cv13 | cv17 | mean |
|---|---|---|---|---|---|---|
| Original Data | 41908389 | 23920831 | 99462852 | 44648444 | 7473933 | 30432479 |
| PCA Data | 41966963 | 24038593 | 99814069 | 44753124 | 7511136 | 30568193 |
The data used in the analysis is the “Life Insurance Policy Data” set from the Kaggle website: https://www.kaggle.com/blackclover1/life-insurance-policy-data
The purpose of the analysis is to predict the life insurance premium and to examine whether PCA can be as effective as lasso penalization for sparse modeling. The study fit HDtweedie on the first eight principal components with essentially no penalization and compared the mean squared prediction errors with those of the lasso-penalized HDtweedie model. The results suggest that PCA is about as effective as lasso penalization.
The study also examined whether PCA (here using the first five principal components) can improve the performance of the gradient boosted Tweedie compound Poisson model (TDboost). There was not sufficient evidence to support this claim.
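The code below assumes a handful of packages: dplyr for the pipe and column selection, caret (with the RANN package installed) for k-nearest-neighbor imputation, and HDtweedie and TDboost for the two models being compared. A minimal setup sketch:

library(dplyr)      # %>%, mutate(), select()
library(caret)      # preProcess() with method = "knnImpute" (requires RANN)
library(HDtweedie)  # cv.HDtweedie(): lasso-penalized Tweedie regression
library(TDboost)    # TDboost(): gradient boosted Tweedie compound Poisson model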
setwd("C:/R directory/Joonwon_tte2")
data <- read.csv("Kaggle.csv", sep=";", header=TRUE, fill=TRUE)
data <- data[,-c(21,22)] # Remove seemingly irrelevant variables.
data <- data %>%
mutate (Issue.Date= substr(Issue.Date,1,3)) # Keep only the first three characters of Issue.Date so it can later be treated as a factor.
# Split variables into categorical and continuous variables.
data$Premium <- as.numeric(gsub(",","",data$Premium)) # Use gsub to replace "," with "".
data$BENEFIT <- as.numeric(gsub(",","",data$BENEFIT))
factor.list <- c(5,8,9,10,12,16,20)
for( i in 1:length(factor.list))
{
data[,factor.list[i]]<- as.factor( data[,factor.list[i]] )
}
Missing values in the categorical variables were filled by drawing a category at random from the observed category proportions (a multinomial draw for each missing entry). Missing values in the continuous variables were filled using the k-nearest neighbors algorithm.
categorical.ind <- c(5,8,9,10,12,16,20)
continuous.ind <- c(1,2,3,4,6,7,11,13,14,15,17,18,19)
# Fill missing values in categorical variables by sampling from the empirical category proportions.
for (i in 1:length(categorical.ind))
{
j<- categorical.ind[i]
probs <- as.vector(table(na.omit(data[,j]))/length( na.omit(data[,j]) ) )
missing.ind <- which(is.na(data[,j])==TRUE, arr.ind=TRUE)
if (length(missing.ind)>0)
{
for (k in 1:length(missing.ind))
{
# Index of the category chosen by a single multinomial draw from the empirical proportions.
bin <- which( rmultinom(1,1,probs)==1, arr.ind=TRUE)[1]
data[missing.ind[k],j] <- levels(data[,j])[bin]
}
}
}
mis.con <- rep(0,length(continuous.ind))
for (i in 1:length(continuous.ind))
{
j <- continuous.ind[i]
missing.ind <- which( is.na(data[,j])==TRUE)
mis.con[i]<-ifelse( length(missing.ind)>0, 1,0)
}
cont.mis <-colnames(data[, c(continuous.ind[which(mis.con==1)])])
cat("The continuous variables containing missing values are",
cont.mis[1], "and", cont.mis[2] )
# library(RANN)
knn <- preProcess( data, method= "knnImpute", k=10, na.remove = TRUE, knnSummary=mean)
mis <- c("BENEFIT", "Premium")
for (i in mis)
{
missing.ind <- which( is.na(data[,i])==TRUE)
pred<- predict(knn , data[missing.ind, ], na.action=na.pass)
for (k in 1:length(missing.ind))
{
# de-normalize and get original data.
data[missing.ind[k], i]<- pred[k,i]*knn$std[i]+knn$mean[i]
}
}
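As a quick illustrative check (not part of the original script), the remaining missing-value counts per column can be printed; after the two imputation steps above they should all be zero.

# Count remaining NAs per column; all entries should be zero after imputation.
colSums(is.na(data))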
## saveRDS(data, file = "data.Rds")
## setwd("C:/R directory/Joonwon_tte2")
## data.copy <- readRDS(file = "data.Rds")
data<-model.matrix(~ ., data=data)
data <- data[,-1]
data <- as.data.frame(data)
data <- data[data$Premium>=0,]
data <- data[, -10] ## data[,10] is a constant vector
Based on the scree plot, we will use eight principal components, which explain about 40% of the total variance.
par(mfrow=c(1,2))
pca = prcomp(data, scale. = T, center = T)
plot(pca$sdev^2, type="b", xlab="PC Number", ylab="Variance", main="Scree Plot")
paste0("Total variance explained by the first eight PC's is ",
round( sum(pca$sdev[1:8]^2)/ sum(pca$sdev^2) ,4))
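A complementary, illustrative view is the cumulative proportion of variance explained, from which the roughly 40% figure for eight components can be read off directly.

# Cumulative proportion of total variance explained by the leading PCs.
cum_var <- cumsum(pca$sdev^2) / sum(pca$sdev^2)
round(cum_var[1:10], 4)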
set.seed(22)
n <- sample(1:nrow(data), nrow(data), replace=FALSE) # Random permutation of the row indices.
kf<- 20 # Number of cross-validation folds.
k_fold <- function(data,k) # Function returns the indices of rows in the kth fold.
{
fold_size<- floor( nrow(data)/kf )
if (k>0 & k<kf)
{
k_fold <- n[ (fold_size*(k-1)+1):(fold_size*k)]
}else if(k==kf) {
# The last fold absorbs any leftover rows from the integer division.
k_fold <- n[ (fold_size*(k-1)+1):nrow(data)]
}
else {
k_fold <-NA
}
return(k_fold)
}
Note that relying on a single train/test split can give a biased estimate of the prediction error, so we use the k-fold cross-validation method.
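As an illustrative sanity check of the fold construction (not in the original script), the kf folds defined above should together cover every row of the data exactly once:

# The folds partition the shuffled row indices: their sizes must add up to nrow(data).
fold_sizes <- sapply(1:kf, function(k) length(k_fold(data, k)))
stopifnot(sum(fold_sizes) == nrow(data))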
reps <- 20
error.HD.X <- numeric(20)
error.HD.pca <- numeric(20)
# data <- data[,-24]
for (i in 1:reps)
{
test_set <- data[k_fold(data,i),]
train_set <- data[-k_fold(data,i),]
test_y <- test_set %>%
select(Premium)
test_X <- test_set %>%
select(-one_of("Premium"))
train_y <- train_set %>%
select(Premium)
train_X <- train_set %>%
select(-one_of("Premium"))
## Column 24 can be constant in a test split; add tiny noise so that scaling
## (prcomp with scale.=TRUE) does not fail on a zero-variance column.
test_X[,24] <- test_X[,24] + rnorm(dim(test_X)[1],0, 1e-7)
## Note that data need to be scaled.
pca_test = prcomp(test_X, scale. = T, center = T)
pca_train = prcomp(train_X, scale. = T, center = T)
pca_test <- as.matrix(pca_test$x[,1:8])
pca_train <- as.matrix(pca_train$x[,1:8])
test_y <- as.matrix(test_y)
test_X <- as.matrix(test_X)
train_y <- as.matrix(train_y)
train_X <- as.matrix(train_X)
HD.X <-cv.HDtweedie(x= train_X, y=train_y, p=1.5,
alpha=1, eps= 0.0001, nlambda = 50, pred.loss="mse", nfolds=5)
HD.pca <-cv.HDtweedie(x= pca_train, y=train_y,
nlambda = 2, lambda = abs(rnorm(2,0,1e-9)), ## almost non-penalized
eps= 0.0001, pred.loss="mse", nfolds=5)
pred.HD.X <- predict(HD.X,type="link", newx = test_X)
pred.HD.pca <- predict(HD.pca,type="link",newx= pca_test ,s=1e-7)
error.HD.X[i] <- mean(( pred.HD.X - test_y )^2)
error.HD.pca[i] <- mean(( pred.HD.pca - test_y )^2)
}
mean.error.HD.X = mean(error.HD.X)
mean.error.HD.pca = mean(error.HD.pca)
set.seed(22)
reps <- 20
error.TDB.X <- numeric(20)
error.TDB.pca <- numeric(20)
for (i in 1:reps)
{
test_set <- data[k_fold(data,i),]
train_set <- data[-k_fold(data,i),]
test_y <- test_set %>%
select(Premium)
test_X <- test_set %>%
select(-one_of("Premium"))
train_y <- train_set %>%
select(Premium)
train_X <- train_set %>%
select(-one_of("Premium"))
## Column 24 can be constant in a test split; add tiny noise so that scaling
## (prcomp with scale.=TRUE) does not fail on a zero-variance column.
test_X[,24] <- test_X[,24] + rnorm(dim(test_X)[1],0, 1e-7)
## Note that data need to be scaled.
pca_test = prcomp(test_X, scale. = T, center = T)
pca_train = prcomp(train_X, scale. = T, center = T)
pca_test <- as.matrix(pca_test$x[,1:5])
pca_train <- as.matrix(pca_train$x[,1:5])
test_data <- cbind(test_y,test_X)
train_data <- cbind(train_y,train_X)
pca_train_data <- cbind(train_y,pca_train)
pca_test_data <- cbind(test_y,pca_test)
TDB.X <- TDboost( Premium~.,
distribution = list(name="EDM",alpha=1.5),
data = train_data,
n.trees = 100,
n.minobsinnode = 10,
shrinkage = 0.001, # learning rate
bag.fraction = 0.5,
train.fraction = 1.0,
cv.folds=5
)
TDB.pca <- TDboost( Premium~.,
distribution = list(name="EDM",alpha=1.5),
data = pca_train_data,
n.trees = 100,
n.minobsinnode = 10,
shrinkage = 0.001, # learning rate
bag.fraction = 0.5,
train.fraction = 1.0,
cv.folds=5
)
pred.TDB.X <- predict(TDB.X, newdata = test_data,
n.trees = 100, type="response")
pred.TDB.pca <- predict(TDB.pca, newdata = pca_test_data,
n.trees = 100, type="response")
error.TDB.X[i] <- mean(( pred.TDB.X - as.numeric(as.matrix(test_y)) )^2)
error.TDB.pca[i] <- mean( ( pred.TDB.pca - as.numeric(as.matrix(test_y)) )^2 )
}
mean.error.TDB.X = mean(error.TDB.X)
mean.error.TDB.pca = mean(error.TDB.pca)
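Finally, a small sketch (the helper summarize_errors and the vector folds_shown are introduced here for illustration only) that tabulates a few folds and the overall means from the error vectors computed above, in the same layout as the tables at the top of this report:

# Illustrative summary: selected folds and the overall mean MSPE for each model.
folds_shown <- c(1, 5, 9, 13, 17)
summarize_errors <- function(err_orig, err_pca) {
  out <- rbind("Original Data" = c(err_orig[folds_shown], mean(err_orig)),
               "PCA Data"      = c(err_pca[folds_shown],  mean(err_pca)))
  colnames(out) <- c(paste0("cv", folds_shown), "mean")
  round(out)
}
summarize_errors(error.HD.X,  error.HD.pca)   # HDtweedie
summarize_errors(error.TDB.X, error.TDB.pca)  # TDboost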
## setwd("C:/R directory/Joonwon_tte2")
## save.image("C:/R directory/Joonwon_tte2/workspace.RData") # save the workspace
## data.copy <- readRDS(file = "TDB.X.Rds")
## rmarkdown::pandoc_convert('Joonwon_tte2.html', output = 'Joonwon_tte2.pdf')