Mean squared prediction errors for the HDtweedie model (first table) and the TDboost model (second table) on the Home Insurance data and its principal components.
| | cv1 | cv5 | cv9 | cv13 | cv17 | mean |
|---|---|---|---|---|---|---|
| Original Data | 53279 | 53380 | 53119 | 55310 | 53698 | 53609 |
| PCA Data | 53327 | 53395 | 53132 | 55367 | 53757 | 53652 |
| | cv1 | cv5 | cv9 | cv13 | cv17 | mean |
|---|---|---|---|---|---|---|
| Original Data | 6694 | 6946 | 6940 | 7863 | 7034 | 7075 |
| PCA Data | 7223 | 7481 | 7467 | 8805 | 7907 | 7768 |
The data used in the analysis is the “Home Insurance Policy Data” set from the Kaggle website: https://www.kaggle.com/ycanario/home-insurance
The purpose of the analysis is to predict the home insurance premium and to examine whether PCA can be as effective as lasso penalization for sparse modeling. The study fit HDtweedie on 25 principal components with essentially no penalization and compared the mean squared prediction errors with those of the lasso-penalized HDtweedie model fit on the original variables. The results suggest that PCA is about as effective as lasso penalization.
The study also examined whether PCA can improve the prediction performance of the gradient-boosted Tweedie compound Poisson model (TDboost). There was not sufficient evidence that PCA improves TDboost's predictive performance.
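Both models below use a Tweedie index (power) parameter of 1.5. As a brief illustrative sketch, not part of the original analysis: a Tweedie variable with index between 1 and 2 is a compound Poisson-gamma, i.e. a Poisson number of gamma-distributed amounts, which allows exact zeros plus a right-skewed continuous part and satisfies Var(Y) = phi * mu^p. The simulation below uses the standard Tweedie-to-Poisson-gamma parameter mapping; all names are local to the sketch.
## Illustrative only: simulate a Tweedie (p = 1.5) response as a compound Poisson-gamma sum.
set.seed(1)
mu <- 5; phi <- 2; p <- 1.5                          # mean, dispersion, index parameter
pois_rate <- mu^(2 - p) / (phi * (2 - p))            # Poisson rate of "claims"
gam_shape <- (2 - p) / (p - 1)                       # gamma shape of each claim
gam_scale <- phi * (p - 1) * mu^(p - 1)              # gamma scale of each claim
y <- replicate(10000, sum(rgamma(rpois(1, pois_rate), shape = gam_shape, scale = gam_scale)))
c(mean = mean(y), var = var(y), phi_mu_p = phi * mu^p)  # the variance should be close to phi * mu^p
mean(y == 0)                                         # positive probability of exactly zero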
setwd("C:/R directory/Joonwon_tte3")
library(dplyr)     # %>% and select() used below
library(HDtweedie) # cv.HDtweedie
library(TDboost)   # TDboost
data <- read.csv("home_insurance.csv", header=TRUE, fill=TRUE)
## Extract the year from the Date variables and store it as a factor.
data[,1] <- as.factor(substr(data[,1],7,10))
## unify the notation "007, 2007" -> 07; convert via character so the factor is rebuilt
## from the numeric year values rather than from the factor level codes.
data[,1] <- as.factor(as.numeric(as.character(data[,1])))
data[,2] <- as.factor(substr(data[,2],7,10))
data[,21] <- as.factor(substr(data[,21],7,10))
# remove variables with too many NA's.
data <- data[,-c(42,44,60,61,62)]
data <- data[,-c(60,61)] # remove column 61 (policy number) and column 60 (row index)
## Split variables into categorical and continuous variables.
categorical.ind <- c(3:8,10,12,16,17,22:26,30,31,32,33,34,36,38:40,42,43,44:57,59)
continuous.ind <- c(1,2,9,11,13:15,18:20,21,27:29,35,37,41,58)
for (i in categorical.ind) data[, i] <- as.factor(data[, i])
for (i in continuous.ind) data[, i] <- as.numeric(data[, i])
# data.copy <- data
data<-model.matrix(~ ., data=data)
data <- as.data.frame(data)
data$Premium <- data$LAST_ANN_PREM_GROSS
data <- data[data$Premium>=0,] # the Tweedie response must be non-negative
data <- data[,-c(1)] ## remove the intercept column
# find constant columns
captures<- rep(0, dim(data)[2])
for (i in 1:dim(data)[2])
{
captures[i]<- ifelse( length(table(data[,i]))==1,1,0)
}
const.col <- which(captures==1) # constant columns
if (length(const.col) > 0) data <- data[,-const.col] # guard: indexing with -integer(0) would drop every column
Missing values in categorical variables were imputed by random draws from the empirical category frequencies (a multinomial simulation). Missing values in continuous variables were filled with the column means.
# Fill missing values in categorical variables by sampling from the empirical category frequencies.
for (i in 1:length(categorical.ind))
{
j<- categorical.ind[i]
probs <- as.vector(table(na.omit(data[,j]))/length( na.omit(data[,j]) ) )
missing.ind <- which(is.na(data[,j])==TRUE, arr.ind=TRUE)
if (length(missing.ind)>0)
{
for (k in 1:length(missing.ind))
{
# Extract the row indices chosen by a random multinomial simulation.
bin <- which( rmultinom(1,1,probs)==1, arr.ind=TRUE)[1]
data[missing.ind[k],j] <- levels(data[,j])[bin]
}
}
}
# Find continuous variables that contain at least one missing value.
mis.con <- rep(0,length(continuous.ind))
for (i in 1:length(continuous.ind))
{
j <- continuous.ind[i]
missing.ind <- which( is.na(data[,j])==TRUE)
mis.con[i]<-ifelse( length(missing.ind)>0, 1,0)
}
cont.mis <-colnames(data[, c(continuous.ind[which(mis.con==1)])])
# Fill missing values in continuous variables using the column means.
mis <- cont.mis
for (i in mis)
{
missing.ind <- which( is.na(data[,i])==TRUE)
data[missing.ind, i]<- mean( data[,i] , na.rm = TRUE)
}
## setwd("C:/R directory/Joonwon_tte3")
## saveRDS(data, file = "home_insurance.Rds")
## data.copy <- readRDS(file = "home_insurance.Rds")
Based on the scree plot, we will use 25 principal components, which explain about 52% of the total variance in both cases.
par(mfrow=c(1,2))
pca = prcomp(data, scale. = T, center = T)
plot(pca$sdev^2, type="b", xlab="PC Number", ylab="Variance", main="Scree Plot")
paste0("Total variance explained by the first 25 PCs is ",
round( sum(pca$sdev[1:25]^2)/ sum(pca$sdev^2) ,4))
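As a small supplementary check (not in the original script), the cumulative proportion of variance explained makes the choice of 25 components and the 52% figure easier to read off; it fills the second panel opened by par(mfrow=c(1,2)).
cum_var <- cumsum(pca$sdev^2) / sum(pca$sdev^2)  # cumulative proportion of total variance
plot(cum_var, type="b", xlab="Number of PCs",
ylab="Cumulative proportion of variance", main="Cumulative Variance Explained")
abline(v = 25, lty = 2)                          # the 25 components used in the analysis
cum_var[25]                                      # roughly 0.52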
set.seed(22)
n <- sample(1:nrow(data), nrow(data), replace=FALSE)
kf<- 20
k_fold <- function(data, k) # Returns the row indices belonging to the kth fold.
{
fold_size <- floor( nrow(data)/kf )
if (k > 0 & k < kf)
{
k_fold <- n[ (fold_size*(k-1)+1):(fold_size*k) ]
} else if (k == kf) {
k_fold <- n[ (fold_size*(k-1)+1):nrow(data) ] # the last fold absorbs the remainder rows
} else {
k_fold <- NA
}
return(k_fold)
}
Note that evaluating the models on a single train/test split can give a biased estimate of prediction error, so we use the k-fold cross-validation method.
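As a quick sanity check (illustrative, not part of the original analysis), the folds returned by k_fold should cover every row exactly once, with the last fold absorbing the remainder:
fold_sizes <- sapply(1:kf, function(k) length(k_fold(data, k)))
fold_sizes                                                    # the last fold picks up the remainder rows
sum(fold_sizes) == nrow(data)                                 # expect TRUE: every row assigned
anyDuplicated(unlist(lapply(1:kf, function(k) k_fold(data, k)))) == 0  # expect TRUE: no overlap between folds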
reps <- 20
error.HD.X <- numeric(20)
error.HD.pca <- numeric(20)
# data <- data[,-24]
for (i in 1:reps)
{
test_set <- data[k_fold(data,i),]
train_set <- data[-k_fold(data,i),]
test_y <- test_set %>%
select(Premium)
test_X <- test_set %>%
select(-one_of("Premium"))
train_y <- train_set %>%
select(Premium)
train_X <- train_set %>%
select(-one_of("Premium"))
## constant columns in train.
captures<- rep(0, dim(train_X)[2])
for (j in 1:dim(train_X)[2])
{
captures[j]<- ifelse( length(table(train_X[,j]))==1,1,0)
}
const.train <- which(captures==1)
## constant columns in test_X.
captures<- rep(0, dim(test_X)[2])
for (j in 1:dim(test_X)[2])
{
captures[j]<- ifelse( length(table(test_X[,j]))==1,1,0)
}
const.test <- which(captures==1)
remove.list <- unique(c(const.test,const.train))
if (length(remove.list) > 0) # guard: indexing with -integer(0) would drop every column
{
train_X <- train_X[, -remove.list]
test_X <- test_X[, -remove.list]
}
## PCA
pca_test = prcomp(test_X, scale. = T, center = T)
pca_train = prcomp(train_X, scale. = T, center = T)
pca_test <- as.matrix(pca_test$x[ ,1:25])
pca_train <- as.matrix(pca_train$x[ ,1:25])
test_y <- as.matrix(test_y)
test_X <- as.matrix(test_X)
train_y <- as.matrix(train_y)
train_X <- as.matrix(train_X)
HD.X <- cv.HDtweedie(x= train_X, y=train_y, p=1.5,
alpha=1, eps= 0.0001, nlambda = 50, pred.loss="mse", nfolds=5)
HD.pca <- cv.HDtweedie(x= pca_train, y=train_y,
nlambda = 2, lambda = abs(rnorm(2,0,1e-9)),
## two lambda values that are essentially zero, i.e. almost non-penalized
eps= 0.0001, pred.loss="mse", nfolds=5)
pred.HD.X <- predict(HD.X, type="link", newx = test_X, s=1e-7)
pred.HD.pca <- predict(HD.pca,type="link",newx= pca_test, s=1e-7)
error.HD.X[i] <- mean(( pred.HD.X - test_y )^2)
error.HD.pca[i] <- mean(( pred.HD.pca - test_y )^2)
}
mean.error.HD.X = mean(error.HD.X)
mean.error.HD.pca = mean(error.HD.pca)
set.seed(22)
reps <- 20
error.TDB.X <- numeric(20)
error.TDB.pca <- numeric(20)
# data <- data[,-24]
for (i in 1:reps)
{
test_set <- data[k_fold(data,i),]
train_set <- data[-k_fold(data,i),]
test_y <- test_set %>%
select(Premium)
test_X <- test_set %>%
select(-one_of("Premium"))
train_y <- train_set %>%
select(Premium)
train_X <- train_set %>%
select(-one_of("Premium"))
## constant columns in train.
captures<- rep(0, dim(train_X)[2])
for (j in 1:dim(train_X)[2])
{
captures[j]<- ifelse( length(table(train_X[,j]))==1,1,0)
}
const.train <- which(captures==1)
## constant columns in test_X.
captures<- rep(0, dim(test_X)[2])
for (j in 1:dim(test_X)[2])
{
captures[j]<- ifelse( length(table(test_X[,j]))==1,1,0)
}
const.test <- which(captures==1)
remove.list <- unique(c(const.test,const.train))
if (length(remove.list) > 0) # guard: indexing with -integer(0) would drop every column
{
train_X <- train_X[, -remove.list]
test_X <- test_X[, -remove.list]
}
## PCA
pca_test = prcomp(test_X, scale. = T, center = T)
pca_train = prcomp(train_X, scale. = T, center = T)
pca_test <- as.matrix(pca_test$x[ ,1:25])
pca_train <- as.matrix(pca_train$x[ ,1:25])
test_y <- as.matrix(test_y)
test_X <- as.matrix(test_X)
train_y <- as.matrix(train_y)
train_X <- as.matrix(train_X)
test_data <- as.data.frame(cbind(test_y,test_X))
train_data <- as.data.frame(cbind(train_y,train_X))
pca_train_data <- as.data.frame(cbind(train_y,pca_train))
pca_test_data <- as.data.frame(cbind(test_y,pca_test))
TDB.X <- TDboost( Premium~.,
distribution = list(name="EDM",alpha=1.5),
data = train_data,
n.trees = 100,
n.minobsinnode = 10,
shrinkage = 0.001, # learning rate
bag.fraction = 0.5,
train.fraction = 1.0,
cv.folds=5
)
TDB.pca <- TDboost( Premium~.,
distribution = list(name="EDM",alpha=1.5),
data = pca_train_data,
n.trees = 100,
n.minobsinnode = 10,
shrinkage = 0.001, # learning rate
bag.fraction = 0.5,
train.fraction = 1.0,
cv.folds=5
)
pred.TDB.X <- predict(TDB.X, newdata = test_data,
n.trees = 100, type="response")
pred.TDB.pca <- predict(TDB.pca, newdata = pca_test_data,
n.trees = 100, type="response")
error.TDB.X[i] <- mean(( pred.TDB.X - as.numeric(as.matrix(test_y)) )^2)
error.TDB.pca[i] <- mean( ( pred.TDB.pca - as.numeric(as.matrix(test_y)) )^2 )
}
mean.error.TDB.X = mean(error.TDB.X)
mean.error.TDB.pca = mean(error.TDB.pca)
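The summary tables at the top of this report can be assembled from the per-fold error vectors computed above. A minimal sketch, assuming the tables report folds 1, 5, 9, 13 and 17 together with the 20-fold mean:
## Illustrative only: rebuild the reported MSPE tables from the per-fold errors.
folds_shown <- c(1, 5, 9, 13, 17)
hd_table <- rbind("Original Data" = c(error.HD.X[folds_shown], mean(error.HD.X)),
"PCA Data" = c(error.HD.pca[folds_shown], mean(error.HD.pca)))
tdb_table <- rbind("Original Data" = c(error.TDB.X[folds_shown], mean(error.TDB.X)),
"PCA Data" = c(error.TDB.pca[folds_shown], mean(error.TDB.pca)))
colnames(hd_table) <- colnames(tdb_table) <- c(paste0("cv", folds_shown), "mean")
round(hd_table)   # HDtweedie errors
round(tdb_table)  # TDboost errors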
## setwd("C:/R directory/Joonwon_tte3")
## save.image("C:/R directory/Joonwon_tte3/workspace.RData") # save the workspace
## data.copy <- readRDS(file = "TDB.X.Rds")
## rmarkdown::pandoc_convert('Joonwon_tte3.html', output = 'Joonwon_tte3.pdf')