Summary

Mean squared prediction errors for the HDtweedie and TDBoost models on the Home Insurance data and its principal components.

HDtweedie CV_20: mean squared prediction errors

                 cv1     cv5     cv9     cv13    cv17    mean
Original Data    53279   53380   53119   55310   53698   53609
PCA Data         53327   53395   53132   55367   53757   53652

TDBoost CV_20: mean squared prediction errors

                 cv1     cv5     cv9     cv13    cv17    mean
Original Data    6694    6946    6940    7863    7034    7075
PCA Data         7223    7481    7467    8805    7907    7768

Explanation

The data used in the analysis is the “Home Insurance Policy Data” set from Kaggle: https://www.kaggle.com/ycanario/home-insurance

The purpose of the analysis is to predict the home insurance premium and to examine whether PCA can be as effective as lasso penalization for sparse modeling. The study fit HDtweedie on 25 principal components without any penalization and compared its mean squared prediction errors with those of the lasso-penalized HDtweedie model on the original data. The results suggest that PCA is about as effective as lasso penalization.

The study also examined whether PCA can improve the prediction performance of the gradient-boosted Tweedie compound Poisson model (TDBoost). There was no sufficient evidence that PCA improves TDBoost's predictive power; across the reported folds, the PCA errors were uniformly larger.

Output

Data preparation.

setwd("C:/R directory/Joonwon_tte3")
data <- read.csv("home_insurance.csv",  header=TRUE, fill=TRUE) 

## Convert date variables to year factors.
data[,1] <- as.factor(substr(data[,1],7,10))
## Unify inconsistent year notation ("007" and "2007" both become 7).
data[,1] <- as.factor(as.numeric(as.character(data[,1])) %% 100)

data[,2] <- as.factor(substr(data[,2],7,10)) 
data[,21] <- as.factor(substr(data[,21],7,10))
# Remove variables with too many NA's.
data <- data[,-c(42,44,60,61,62)] 
data <- data[,-c(60,61)] # 61: policy number, 60: row index
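# (Sketch) The "too many NA's" criterion can be checked directly by looking
# at each column's missing fraction; run this before the removals above:
# head(sort(colMeans(is.na(data)), decreasing = TRUE), 10)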

## Split variables into categorical and continuous variables.
categorical.ind   <- c(3:8,10,12,16,17,22:26,30,31,32,33,34,36,38:40,42,43,44:57,59)
continuous.ind   <- c(1,2,9,11,13:15,18:20,21,27:29,35,37,41,58)

for( i  in 1:length(categorical.ind))
{
  data[,categorical.ind[i]]<- as.factor( data[,categorical.ind[i]] )
}

for( i  in 1:length(continuous.ind))
{
  # as.character first, so factor levels convert to their values, not level codes
  data[,continuous.ind[i]] <- as.numeric( as.character(data[,continuous.ind[i]]) )
}
# data.copy <- data
data<-model.matrix(~ ., data=data)
data <- as.data.frame(data)
data$Premium <- data$LAST_ANN_PREM_GROSS
data <- data[, colnames(data) != "LAST_ANN_PREM_GROSS"] # drop the original so the response is not duplicated in X
data <- data[data$Premium >= 0,] # keep non-negative premiums (Tweedie response)
data<- data[,-c(1)]   ## remove intercept
# find constant columns
captures<- rep(0, dim(data)[2])  
for (i in 1:dim(data)[2])
{
  captures[i]<- ifelse( length(table(data[,i]))==1,1,0)
}
const.col <- which(captures==1) # constant columns
data<- data[,-const.col]
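As an aside, the constant-column scan can be written in vectorized form. A minimal sketch, equivalent to the loop above for columns without missing values (unique() keeps NA as a distinct value, whereas table() drops it):

# A column is constant when it takes a single distinct value.
const.col.alt <- which(vapply(data, function(col) length(unique(col)) == 1, logical(1)))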

Missing values.

Missing values in categorical variables were imputed by sampling from each variable's empirical level frequencies (one multinomial draw per missing entry). Missing values in continuous variables were filled with the column means.

# Fill missing values in categorical variables by sampling from empirical level frequencies.

for (i in 1:length(categorical.ind))
{
  j<- categorical.ind[i]
  probs <- as.vector(table(na.omit(data[,j]))/length( na.omit(data[,j]) ) )
  missing.ind <- which(is.na(data[,j])==TRUE, arr.ind=TRUE)
  if (length(missing.ind)>0)
  { 
    for (k in 1:length(missing.ind))
    {
      # Extract the row indices chosen by a random multinomial simulation.
      bin <- which( rmultinom(1,1,probs)==1, arr.ind=TRUE)[1]   
      data[missing.ind[k],j] <- levels(data[,j])[bin]
    }
  }
}
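# A standalone illustration of the multinomial draw used above (p.demo is a
# made-up frequency vector): rmultinom(1, 1, p) returns a 0/1 indicator
# column with exactly one 1, and which(...) recovers the sampled level index.
p.demo <- c(0.2, 0.5, 0.3)
which(rmultinom(1, 1, p.demo) == 1, arr.ind = TRUE)[1]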
# Find continuous variables that contain at least one missing value.
mis.con <- rep(0,length(continuous.ind))
for (i in 1:length(continuous.ind))
{ 
  j <- continuous.ind[i]
  missing.ind <- which( is.na(data[,j])==TRUE)
  mis.con[i]<-ifelse( length(missing.ind)>0, 1,0)
  
}
cont.mis <-colnames(data[, c(continuous.ind[which(mis.con==1)])])
# Fill missing values in continuous variables using the column means.
mis <- cont.mis
for (i in mis)
{
  missing.ind <- which( is.na(data[,i])==TRUE )
  data[missing.ind, i] <- mean( data[,i], na.rm = TRUE )
}
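# Sanity check: every missing value should now be filled.
sum(is.na(data)) # expected to be 0 after the imputations above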

## setwd("C:/R directory/Joonwon_tte3")
## saveRDS(data, file = "home_insurance.Rds")
## data.copy <- readRDS(file = "home_insurance.Rds") 

Scree plot and the total variance explained by the first 25 principal components.

Based on the scree plot, we use 25 principal components; they explain about 52% of the total variance.

par(mfrow=c(1,2))
pca = prcomp(data, scale. = T, center = T)
plot(pca$sdev^2, type="b", xlab="PC Number", ylab="Variance", main="Scree Plot")

paste0("Total variance explained by the first eight PC's is ",
       round( sum(pca$sdev[1:25]^2)/ sum(pca$sdev^2) ,4))
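To see where the 52% figure comes from, the cumulative proportion of variance can be inspected directly. A short sketch using the pca object above:

cum.var <- cumsum(pca$sdev^2) / sum(pca$sdev^2)
round(cum.var[25], 4) # proportion of variance explained by the first 25 PCs
plot(cum.var, type="b", xlab="PC Number", ylab="Cumulative proportion of variance")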

Define a function that returns the row indices of the kth fold.

set.seed(22)
n <- sample(1:nrow(data), nrow(data), replace=FALSE) # shuffled row indices
kf <- 20 # number of folds
k_fold <- function(data, k) # Function returns the indices of rows in the kth fold.
{
  fold_size <- floor( nrow(data)/kf )
  if (k > 0 & k < kf)
  {
    k_fold <- n[ (fold_size*(k-1)+1):(fold_size*k) ]
  } else if (k == kf) {
    # the last fold absorbs the remainder rows
    k_fold <- n[ (fold_size*(k-1)+1):nrow(data) ]
  } else {
    k_fold <- NA
  }
  return(k_fold)
}
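A quick sanity check that the folds cover every row exactly once (a small sketch using the function above):

folds <- lapply(1:kf, function(k) k_fold(data, k))
stopifnot(all(sort(unlist(folds)) == 1:nrow(data))) # each row falls in exactly one fold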

Train the HDtweedie model on the original data and on 25 principal components with 20-fold cross-validation.

Using a single train/test split can give a biased estimate of prediction error, so we use 20-fold cross-validation instead.

reps <- 20
error.HD.X <- numeric(20)
error.HD.pca <- numeric(20)
for (i in 1:reps)
{
  test_set <-  data[k_fold(data,i),]
  train_set <- data[-k_fold(data,i),]
  
  test_y <- test_set %>%
            select(Premium)
  test_X <- test_set %>%
            select(-one_of("Premium"))
  train_y <- train_set %>%
            select(Premium)
  train_X <- train_set %>%
            select(-one_of("Premium"))
  
  ## Constant columns in train_X.
  captures <- rep(0, dim(train_X)[2])
  for (j in 1:dim(train_X)[2])
  {
    captures[j] <- ifelse( length(table(train_X[,j]))==1, 1, 0)
  }
  const.train <- which(captures==1)

  ## Constant columns in test_X.
  captures <- rep(0, dim(test_X)[2])
  for (j in 1:dim(test_X)[2])
  {
    captures[j] <- ifelse( length(table(test_X[,j]))==1, 1, 0)
  }
  const.test <- which(captures==1)

  ## Drop constant columns; guard against an empty index vector,
  ## which would otherwise drop every column.
  remove.list <- unique(c(const.test, const.train))
  if (length(remove.list) > 0)
  {
    train_X <- train_X[, -remove.list]
    test_X  <- test_X[, -remove.list]
  }
  
  
  ## PCA
  pca_test =  prcomp(test_X,  scale. = T, center = T) 
  pca_train = prcomp(train_X, scale. = T, center = T) 
  pca_test <-  as.matrix(pca_test$x[ ,1:25])
  pca_train <- as.matrix(pca_train$x[ ,1:25])
  
  test_y <- as.matrix(test_y)
  test_X <- as.matrix(test_X)
  train_y <- as.matrix(train_y)
  train_X <- as.matrix(train_X)
  
  HD.X   <- cv.HDtweedie(x=train_X, y=train_y, p=1.5,
                         alpha=1, eps=0.0001, nlambda=50,
                         pred.loss="mse", nfolds=5)
  HD.pca <- cv.HDtweedie(x=pca_train, y=train_y,
                         nlambda=2, lambda=abs(rnorm(2,0,1e-9)),
                         ## essentially unpenalized
                         eps=0.0001, pred.loss="mse", nfolds=5)
  pred.HD.X   <- predict(HD.X, type="link", newx = test_X, s=1e-7)
  pred.HD.pca <- predict(HD.pca,type="link",newx= pca_test, s=1e-7)
  error.HD.X[i]    <-  mean(( pred.HD.X - test_y )^2)
  error.HD.pca[i]  <-  mean(( pred.HD.pca - test_y )^2)
}
mean.error.HD.X = mean(error.HD.X)
mean.error.HD.pca = mean(error.HD.pca) 
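The HDtweedie rows of the Summary table can then be assembled from the per-fold errors. A minimal sketch; the folds cv1, cv5, cv9, cv13, cv17 match the columns reported above:

show.folds <- c(1, 5, 9, 13, 17)
HD.table <- rbind("Original Data" = c(error.HD.X[show.folds],   mean.error.HD.X),
                  "PCA Data"      = c(error.HD.pca[show.folds], mean.error.HD.pca))
colnames(HD.table) <- c(paste0("cv", show.folds), "mean")
round(HD.table)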

Train the TDBoost model on the original data and on 25 principal components with 20-fold cross-validation.

set.seed(22)
reps <- 20
error.TDB.X <- numeric(20)
error.TDB.pca <- numeric(20)
for (i in 1:reps)
{
  test_set <-  data[k_fold(data,i),]
  train_set <- data[-k_fold(data,i),]
  
  test_y <- test_set %>%
            select(Premium)
  test_X <- test_set %>%
            select(-one_of("Premium"))
  train_y <- train_set %>%
            select(Premium)
  train_X <- train_set %>%
            select(-one_of("Premium"))
  
  ## Constant columns in train_X.
  captures <- rep(0, dim(train_X)[2])
  for (j in 1:dim(train_X)[2])
  {
    captures[j] <- ifelse( length(table(train_X[,j]))==1, 1, 0)
  }
  const.train <- which(captures==1)

  ## Constant columns in test_X.
  captures <- rep(0, dim(test_X)[2])
  for (j in 1:dim(test_X)[2])
  {
    captures[j] <- ifelse( length(table(test_X[,j]))==1, 1, 0)
  }
  const.test <- which(captures==1)

  ## Drop constant columns; guard against an empty index vector,
  ## which would otherwise drop every column.
  remove.list <- unique(c(const.test, const.train))
  if (length(remove.list) > 0)
  {
    train_X <- train_X[, -remove.list]
    test_X  <- test_X[, -remove.list]
  }
  
  
  ## PCA
  pca_test =  prcomp(test_X,  scale. = T, center = T) 
  pca_train = prcomp(train_X, scale. = T, center = T) 
  pca_test <-  as.matrix(pca_test$x[ ,1:25])
  pca_train <- as.matrix(pca_train$x[ ,1:25])
  
  test_y <- as.matrix(test_y)
  test_X <- as.matrix(test_X)
  train_y <- as.matrix(train_y)
  train_X <- as.matrix(train_X)
  
  test_data  <- as.data.frame(cbind(test_y,test_X))
  train_data <- as.data.frame(cbind(train_y,train_X))
  pca_train_data <- as.data.frame(cbind(train_y,pca_train))
  pca_test_data  <- as.data.frame(cbind(test_y,pca_test))

  TDB.X <- TDboost( Premium~., 
                        distribution = list(name="EDM",alpha=1.5),
                        data = train_data,
                        n.trees = 100,
                        n.minobsinnode = 10,
                        shrinkage = 0.001, # learning rate
                        bag.fraction = 0.5,
                        train.fraction = 1.0,
                        cv.folds=5
                  )
                        
  
  
  TDB.pca <- TDboost( Premium~., 
                        distribution = list(name="EDM",alpha=1.5),
                        data =  pca_train_data,
                        n.trees = 100,
                        n.minobsinnode = 10,
                        shrinkage = 0.001, # learning rate
                        bag.fraction = 0.5,
                        train.fraction = 1.0,
                        cv.folds=5
                  )
                         
  pred.TDB.X   <- predict(TDB.X, newdata = test_data,
                          n.trees = 100, type="response")
  pred.TDB.pca <- predict(TDB.pca, newdata = pca_test_data, 
                          n.trees = 100, type="response")
  error.TDB.X[i] <- mean(( pred.TDB.X - as.numeric(as.matrix(test_y)) )^2)
  error.TDB.pca[i]  <-  mean( ( pred.TDB.pca - as.numeric(as.matrix(test_y)) )^2 )
}
mean.error.TDB.X = mean(error.TDB.X)
mean.error.TDB.pca = mean(error.TDB.pca) 
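To put the "no sufficient evidence" conclusion on a formal footing, one option is a one-sided paired t-test on the per-fold errors, pairing by fold. This is a suggested check, not part of the original analysis:

# alternative = "less" tests whether PCA lowers TDBoost's mean per-fold error,
# i.e. mean(error.TDB.pca - error.TDB.X) < 0.
t.test(error.TDB.pca, error.TDB.X, paired = TRUE, alternative = "less")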


## setwd("C:/R directory/Joonwon_tte3")
## save.image("C:/R directory/Joonwon_tte3/workspace.RData") # save the workspace
## data.copy <- readRDS(file = "TDB.X.Rds") 
## rmarkdown::pandoc_convert('Joonwon_tte3.html', output = 'Joonwon_tte3.pdf')