Mean squared prediction errors for the HDtweedie and TDboost models on the Life Insurance data and its principal components.
HDtweedie:

|  | cv1 | cv5 | cv9 | cv13 | cv17 | mean |
|---|---|---|---|---|---|---|
| Original Data | 43937268 | 25708805 | 101999394 | 46674848 | 8871844 | 32303222 |
| PCA Data | 43935947 | 25708285 | 102006576 | 46673437 | 8871144 | 32304094 |

TDboost:

|  | cv1 | cv5 | cv9 | cv13 | cv17 | mean |
|---|---|---|---|---|---|---|
| Original Data | 41908389 | 23920831 | 99462852 | 44648444 | 7473933 | 30432479 |
| PCA Data | 41966963 | 24038593 | 99814069 | 44753124 | 7511136 | 30568193 |
The data used in the analysis is the “Life Insurance Policy Data” set from the Kaggle website: https://www.kaggle.com/blackclover1/life-insurance-policy-data
The purpose of the analysis is to predict the life insurance premium and to examine whether PCA can be as effective as lasso penalization for sparse modeling. The study fit HDtweedie on the first eight principal components with essentially no penalization and compared the mean squared prediction errors with those of the lasso-penalized HDtweedie model. The results suggest that PCA is about as effective as lasso penalization.
The study also examined whether PCA (here using the first five principal components) can improve the performance of the gradient boosted Tweedie compound Poisson model (TDboost). There was not sufficient evidence to support this claim.
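The code below assumes a handful of packages: dplyr for the pipe and column selection, caret (with the RANN package installed) for k-nearest-neighbor imputation, and HDtweedie and TDboost for the two models being compared. A minimal setup sketch:

library(dplyr)      # %>%, mutate(), select()
library(caret)      # preProcess() with method = "knnImpute" (requires RANN)
library(HDtweedie)  # cv.HDtweedie(): lasso-penalized Tweedie regression
library(TDboost)    # TDboost(): gradient boosted Tweedie compound Poisson model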
setwd("C:/R directory/Joonwon_tte2")
data <- read.csv("Kaggle.csv", sep=";", header=TRUE, fill=TRUE)
data <- data[,-c(21,22)] # Remove seemingly irrelevant variables.
data <- data %>%
mutate (Issue.Date= substr(Issue.Date,1,3)) # Keep only the first three characters of Issue.Date so it can later be treated as a factor.
# Split variables into categorical and continuous variables.
data$Premium <- as.numeric(gsub(",","",data$Premium)) # Use gsub to replace "," with "".
data$BENEFIT <- as.numeric(gsub(",","",data$BENEFIT))
factor.list <- c(5,8,9,10,12,16,20)
for( i in 1:length(factor.list))
{
data[,factor.list[i]]<- as.factor( data[,factor.list[i]] )
}
Missing values in the categorical variables were filled by drawing a category at random from the observed category proportions (a multinomial draw for each missing entry). Missing values in the continuous variables were filled using the k-nearest neighbors algorithm.
categorical.ind <- c(5,8,9,10,12,16,20)
continuous.ind <- c(1,2,3,4,6,7,11,13,14,15,17,18,19)
# Fill missing values in categorical variables by sampling from the empirical category proportions.
for (i in 1:length(categorical.ind))
{
j<- categorical.ind[i]
probs <- as.vector(table(na.omit(data[,j]))/length( na.omit(data[,j]) ) )
missing.ind <- which(is.na(data[,j])==TRUE, arr.ind=TRUE)
if (length(missing.ind)>0)
{
for (k in 1:length(missing.ind))
{
# Index of the category chosen by a single multinomial draw from the empirical proportions.
bin <- which( rmultinom(1,1,probs)==1, arr.ind=TRUE)[1]
data[missing.ind[k],j] <- levels(data[,j])[bin]
}
}
}
mis.con <- rep(0,length(continuous.ind))
for (i in 1:length(continuous.ind))
{
j <- continuous.ind[i]
missing.ind <- which( is.na(data[,j])==TRUE)
mis.con[i]<-ifelse( length(missing.ind)>0, 1,0)
}
cont.mis <-colnames(data[, c(continuous.ind[which(mis.con==1)])])
cat("The continuous variables containing missing values are",
cont.mis[1], "and", cont.mis[2] )
# library(RANN)
knn <- preProcess( data, method= "knnImpute", k=10, na.remove = TRUE, knnSummary=mean)
mis <- c("BENEFIT", "Premium")
for (i in mis)
{
missing.ind <- which( is.na(data[,i])==TRUE)
pred<- predict(knn , data[missing.ind, ], na.action=na.pass)
for (k in 1:length(missing.ind))
{
# de-normalize and get original data.
data[missing.ind[k], i]<- pred[k,i]*knn$std[i]+knn$mean[i]
}
}
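As a quick illustrative check (not part of the original script), the remaining missing-value counts per column can be printed; after the two imputation steps above they should all be zero.

# Count remaining NAs per column; all entries should be zero after imputation.
colSums(is.na(data))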
## saveRDS(data, file = "data.Rds")
## setwd("C:/R directory/Joonwon_tte2")
## data.copy <- readRDS(file = "data.Rds")
data<-model.matrix(~ ., data=data)
data <- data[,-1]
data <- as.data.frame(data)
data <- data[data$Premium>=0,]
data <- data[, -10] ## data[,10] is a constant vector
Based on the scree plot, we will use eight principal components, which explain about 40% of the total variance.
par(mfrow=c(1,2))
pca = prcomp(data, scale. = T, center = T)
plot(pca$sdev^2, type="b", xlab="PC Number", ylab="Variance", main="Scree Plot")
paste0("Total variance explained by the first eight PC's is ",
round( sum(pca$sdev[1:8]^2)/ sum(pca$sdev^2) ,4))
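A complementary, illustrative view is the cumulative proportion of variance explained, from which the roughly 40% figure for eight components can be read off directly.

# Cumulative proportion of total variance explained by the leading PCs.
cum_var <- cumsum(pca$sdev^2) / sum(pca$sdev^2)
round(cum_var[1:10], 4)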
set.seed(22)
n <- sample(1:nrow(data), nrow(data), replace=FALSE) # Random permutation of the row indices.
kf<- 20 # Number of cross-validation folds.
k_fold <- function(data,k) # Function returns the indices of rows in the kth fold.
{
fold_size<- floor( nrow(data)/kf )
if (k>0 & k<kf)
{
k_fold <- n[ (fold_size*(k-1)+1):(fold_size*k)]
}else if(k==kf) {
# The last fold absorbs any leftover rows from the integer division.
k_fold <- n[ (fold_size*(k-1)+1):nrow(data)]
}
else {
k_fold <-NA
}
return(k_fold)
}
Note that relying on a single train/test split can give a biased estimate of the prediction error, so we use the k-fold cross-validation method.
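As an illustrative sanity check of the fold construction (not in the original script), the kf folds defined above should together cover every row of the data exactly once:

# The folds partition the shuffled row indices: their sizes must add up to nrow(data).
fold_sizes <- sapply(1:kf, function(k) length(k_fold(data, k)))
stopifnot(sum(fold_sizes) == nrow(data))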
reps <- 20
error.HD.X <- numeric(20)
error.HD.pca <- numeric(20)
# data <- data[,-24]
for (i in 1:reps)
{
test_set <- data[k_fold(data,i),]
train_set <- data[-k_fold(data,i),]
test_y <- test_set %>%
select(Premium)
test_X <- test_set %>%
select(-one_of("Premium"))
train_y <- train_set %>%
select(Premium)
train_X <- train_set %>%
select(-one_of("Premium"))
## Column 24 can be constant in a test split; add tiny noise so that scaling
## (prcomp with scale.=TRUE) does not fail on a zero-variance column.
test_X[,24] <- test_X[,24] + rnorm(dim(test_X)[1],0, 1e-7)
## Note that data need to be scaled.
pca_test = prcomp(test_X, scale. = T, center = T)
pca_train = prcomp(train_X, scale. = T, center = T)
pca_test <- as.matrix(pca_test$x[,1:8])
pca_train <- as.matrix(pca_train$x[,1:8])
test_y <- as.matrix(test_y)
test_X <- as.matrix(test_X)
train_y <- as.matrix(train_y)
train_X <- as.matrix(train_X)
HD.X <-cv.HDtweedie(x= train_X, y=train_y, p=1.5,
alpha=1, eps= 0.0001, nlambda = 50, pred.loss="mse", nfolds=5)
HD.pca <-cv.HDtweedie(x= pca_train, y=train_y,
nlambda = 2, lambda = abs(rnorm(2,0,1e-9)), ## almost non-penalized
eps= 0.0001, pred.loss="mse", nfolds=5)
pred.HD.X <- predict(HD.X,type="link", newx = test_X)
pred.HD.pca <- predict(HD.pca,type="link",newx= pca_test ,s=1e-7)
error.HD.X[i] <- mean(( pred.HD.X - test_y )^2)
error.HD.pca[i] <- mean(( pred.HD.pca - test_y )^2)
}
mean.error.HD.X = mean(error.HD.X)
mean.error.HD.pca = mean(error.HD.pca)
set.seed(22)
reps <- 20
error.TDB.X <- numeric(20)
error.TDB.pca <- numeric(20)
for (i in 1:reps)
{
test_set <- data[k_fold(data,i),]
train_set <- data[-k_fold(data,i),]
test_y <- test_set %>%
select(Premium)
test_X <- test_set %>%
select(-one_of("Premium"))
train_y <- train_set %>%
select(Premium)
train_X <- train_set %>%
select(-one_of("Premium"))
## Column 24 can be constant in a test split; add tiny noise so that scaling
## (prcomp with scale.=TRUE) does not fail on a zero-variance column.
test_X[,24] <- test_X[,24] + rnorm(dim(test_X)[1],0, 1e-7)
## Note that data need to be scaled.
pca_test = prcomp(test_X, scale. = T, center = T)
pca_train = prcomp(train_X, scale. = T, center = T)
pca_test <- as.matrix(pca_test$x[,1:5])
pca_train <- as.matrix(pca_train$x[,1:5])
test_data <- cbind(test_y,test_X)
train_data <- cbind(train_y,train_X)
pca_train_data <- cbind(train_y,pca_train)
pca_test_data <- cbind(test_y,pca_test)
TDB.X <- TDboost( Premium~.,
distribution = list(name="EDM",alpha=1.5),
data = train_data,
n.trees = 100,
n.minobsinnode = 10,
shrinkage = 0.001, # learning rate
bag.fraction = 0.5,
train.fraction = 1.0,
cv.folds=5
)
TDB.pca <- TDboost( Premium~.,
distribution = list(name="EDM",alpha=1.5),
data = pca_train_data,
n.trees = 100,
n.minobsinnode = 10,
shrinkage = 0.001, # learning rate
bag.fraction = 0.5,
train.fraction = 1.0,
cv.folds=5
)
pred.TDB.X <- predict(TDB.X, newdata = test_data,
n.trees = 100, type="response")
pred.TDB.pca <- predict(TDB.pca, newdata = pca_test_data,
n.trees = 100, type="response")
error.TDB.X[i] <- mean(( pred.TDB.X - as.numeric(as.matrix(test_y)) )^2)
error.TDB.pca[i] <- mean( ( pred.TDB.pca - as.numeric(as.matrix(test_y)) )^2 )
}
mean.error.TDB.X = mean(error.TDB.X)
mean.error.TDB.pca = mean(error.TDB.pca)
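Finally, a small sketch (the helper summarize_errors and the vector folds_shown are introduced here for illustration only) that tabulates a few folds and the overall means from the error vectors computed above, in the same layout as the tables at the top of this report:

# Illustrative summary: selected folds and the overall mean MSPE for each model.
folds_shown <- c(1, 5, 9, 13, 17)
summarize_errors <- function(err_orig, err_pca) {
  out <- rbind("Original Data" = c(err_orig[folds_shown], mean(err_orig)),
               "PCA Data"      = c(err_pca[folds_shown],  mean(err_pca)))
  colnames(out) <- c(paste0("cv", folds_shown), "mean")
  round(out)
}
summarize_errors(error.HD.X,  error.HD.pca)   # HDtweedie
summarize_errors(error.TDB.X, error.TDB.pca)  # TDboost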
## setwd("C:/R directory/Joonwon_tte2")
## save.image("C:/R directory/Joonwon_tte2/workspace.RData") # save the workspace
## data.copy <- readRDS(file = "TDB.X.Rds")
## rmarkdown::pandoc_convert('Joonwon_tte2.html', output = 'Joonwon_tte2.pdf')