Cross-validation prediction errors (MSE) for the HDtweedie and TDboost models on the original dataCar data and on its principal components. Selected folds and the mean over all 20 folds are shown.

HDtweedie:

| Data | cv1 | cv5 | cv9 | cv13 | cv17 | mean |
|---|---|---|---|---|---|---|
| Original Data | 2377401 | 1373430 | 2136314 | 1532071 | 1102510 | 1440613 |
| PCA Data | 2378171 | 1374129 | 2137165 | 1532731 | 1103110 | 1441284 |

TDboost:

| Data | cv1 | cv5 | cv9 | cv13 | cv17 | mean |
|---|---|---|---|---|---|---|
| Original Data | 2316359 | 1313726 | 2072233 | 1479964 | 1049088 | 1386597 |
| PCA Data | 2352115 | 1349163 | 2103491 | 1513364 | 1082238 | 1418921 |
The data used in the analysis is “dataCar” from the “insuranceData” package. The purpose of the analysis is to predict the auto insurance premium and to examine whether the results obtained on the PCA-reduced data are robust, that is, comparable to those on the original data. The analysis also compares the performance of the gradient-boosted Tweedie compound Poisson model (TDboost) with the grouped lasso Tweedie model (HDtweedie).
library(insuranceData)   # dataCar dataset
library(dplyr)           # data manipulation
library(corrplot)        # correlation plot
library(HDtweedie)       # grouped lasso Tweedie model
library(TDboost)         # gradient-boosted Tweedie model

data("dataCar")
data <- dataCar[, -11]                         # drop the "X_OBSTAT_" column
data <- model.matrix(~ ., data = data)[, -1]   # expand factors into dummy variables
data <- as.data.frame(data)
data$insu.prem <- data$numclaims * data$claimcst0   # response: insurance premium
data <- data %>%
  select(-one_of("claimcst0", "numclaims"))
Before running PCA, we look at the correlations between the variables. A common rule of thumb is that if most of the pairwise correlations are below 0.3, PCA is unlikely to give a useful reduction.
cor_df <- cor(data, method = "pearson")
corrplot(cor_df, order = "hclust")
Among the explanatory variables, leaving aside the response-related columns “clm”, “numclaims”, and “claimcst0”, only “veh_value” shows a strong correlation, a negative one with “veh_age”.
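To quantify this, a short sketch reusing the cor_df matrix from the chunk above (the 0.3 cutoff is the rule of thumb mentioned earlier) is:

## Sketch: share of variable pairs with a notable correlation, and the one strong pair.
off_diag <- cor_df[upper.tri(cor_df)]        # correlations of distinct variable pairs
mean(abs(off_diag) > 0.3)                    # proportion of pairs with |correlation| > 0.3
cor_df["veh_value", "veh_age"]               # the strong negative correlation noted above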
Based on the scree plot, we retain five principal components; as the output below shows, they explain roughly 32% of the total variance.
par(mfrow = c(1, 2))
pca <- prcomp(data, scale. = TRUE, center = TRUE)
plot(pca$sdev^2, type = "b", xlab = "PC Number", ylab = "Variance", main = "Scree Plot")
paste0("Total variance explained by the first five PCs is ",
       round(sum(pca$sdev[1:5]^2) / sum(pca$sdev^2), 4))
## [1] "Total variance explained by the first five PCs is 0.3245"
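To see how the explained variance accumulates beyond the first five components, a short sketch reusing the pca object above is (summary(pca) reports the same cumulative proportions):

## Sketch: cumulative proportion of variance explained by the principal components.
var_explained <- pca$sdev^2 / sum(pca$sdev^2)
round(cumsum(var_explained), 4)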
set.seed(22)
n  <- sample(1:nrow(data), nrow(data), replace = FALSE)   # shuffled row indices
kf <- 20                                                  # number of folds

# Returns the row indices (taken from the shuffled vector n) belonging to the k-th fold.
k_fold <- function(data, k)
{
  fold_size <- floor(nrow(data) / kf)
  if (k < kf) {
    idx <- n[(fold_size * (k - 1) + 1):(fold_size * k)]
  } else {
    # The last fold takes all remaining rows (no overlap with the previous fold).
    idx <- n[(fold_size * (k - 1) + 1):nrow(data)]
  }
  return(idx)
}
Relying on a single train/test split can give an unrepresentative estimate of out-of-sample error, so we use k-fold cross-validation instead.
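As a sanity check on the fold construction, the sketch below (using the k_fold helper, kf, and data defined above) verifies that the 20 folds are disjoint and together cover every row:

## Sketch: the folds should partition the data set.
all_idx <- unlist(lapply(1:kf, function(k) k_fold(data, k)))
length(all_idx) == nrow(data)      # TRUE if no row is dropped or duplicated
anyDuplicated(all_idx) == 0        # TRUE if the folds are disjoint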
reps <- 20
error.HD.X   <- numeric(reps)
error.HD.pca <- numeric(reps)

for (i in 1:reps)
{
  fold_idx  <- k_fold(data, i)          # rows of the i-th fold
  test_set  <- data[fold_idx, ]
  train_set <- data[-fold_idx, ]

  test_y  <- test_set  %>% select(insu.prem)
  test_X  <- test_set  %>% select(-one_of("insu.prem"))
  train_y <- train_set %>% select(insu.prem)
  train_X <- train_set %>% select(-one_of("insu.prem"))

  ## Add tiny noise so that zero or constant columns do not break scaling in prcomp().
  test_X  <- test_X  + rnorm(5, 0, 1e-7)
  train_X <- train_X + rnorm(5, 0, 1e-7)

  ## Note that the data need to be scaled; PCA is run on each split separately.
  pca_test  <- prcomp(test_X,  scale. = TRUE, center = TRUE)
  pca_train <- prcomp(train_X, scale. = TRUE, center = TRUE)
  pca_test  <- as.matrix(pca_test$x[, 1:5])
  pca_train <- as.matrix(pca_train$x[, 1:5])

  test_y  <- as.matrix(test_y)
  test_X  <- as.matrix(test_X)
  train_y <- as.matrix(train_y)
  train_X <- as.matrix(train_X)

  ## Group index for the grouped lasso: dummy columns from the same factor share a group.
  group1 <- c(1, 2, 3, rep(4, 12), 5, 6, rep(7, 5), 8)

  HD.X   <- cv.HDtweedie(x = train_X, y = train_y, p = 1.5, group = group1,
                         alpha = 1, eps = 0.0001, nlambda = 50,
                         pred.loss = "mse", nfolds = 5)
  HD.pca <- cv.HDtweedie(x = pca_train, y = train_y, p = 1.5, alpha = 1,
                         eps = 0.0001, nlambda = 50,
                         pred.loss = "mse", nfolds = 5)

  pred.HD.X   <- predict(HD.X,   type = "link", newx = test_X)
  pred.HD.pca <- predict(HD.pca, type = "link", newx = pca_test)

  error.HD.X[i]   <- mean((pred.HD.X   - test_y)^2)
  error.HD.pca[i] <- mean((pred.HD.pca - test_y)^2)
}
mean.error.HD.X   <- mean(error.HD.X)
mean.error.HD.pca <- mean(error.HD.pca)
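A summary in the form of the HDtweedie table at the top can be assembled as sketched below; the folds shown (cv1, cv5, cv9, cv13, cv17) are simply the ones reported in that table, and the error vectors come from the loop above:

## Sketch: selected per-fold errors and the overall mean for HDtweedie.
folds_shown <- c(1, 5, 9, 13, 17)    # folds reported in the table
HD_summary  <- rbind("Original Data" = c(error.HD.X[folds_shown],   mean.error.HD.X),
                     "PCA Data"      = c(error.HD.pca[folds_shown], mean.error.HD.pca))
colnames(HD_summary) <- c(paste0("cv", folds_shown), "mean")
round(HD_summary)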
set.seed(22)
reps <- 20
error.TDB.X   <- numeric(reps)
error.TDB.pca <- numeric(reps)

for (i in 1:reps)
{
  fold_idx  <- k_fold(data, i)          # rows of the i-th fold
  test_set  <- data[fold_idx, ]
  train_set <- data[-fold_idx, ]

  test_y  <- test_set  %>% select(insu.prem)
  test_X  <- test_set  %>% select(-one_of("insu.prem"))
  train_y <- train_set %>% select(insu.prem)
  train_X <- train_set %>% select(-one_of("insu.prem"))

  ## Add tiny noise so that zero or constant columns do not break scaling in prcomp().
  test_X  <- test_X  + rnorm(5, 0, 1e-7)
  train_X <- train_X + rnorm(5, 0, 1e-7)

  ## Note that the data need to be scaled; PCA is run on each split separately.
  pca_test  <- prcomp(test_X,  scale. = TRUE, center = TRUE)
  pca_train <- prcomp(train_X, scale. = TRUE, center = TRUE)
  pca_test  <- as.matrix(pca_test$x[, 1:5])
  pca_train <- as.matrix(pca_train$x[, 1:5])

  ## TDboost uses a formula interface, so bind the response and predictors together.
  test_data      <- cbind(test_y, test_X)
  train_data     <- cbind(train_y, train_X)
  pca_train_data <- cbind(train_y, pca_train)
  pca_test_data  <- cbind(test_y, pca_test)

  TDB.X <- TDboost(insu.prem ~ .,
                   distribution = list(name = "EDM", alpha = 1.5),  # Tweedie index 1.5
                   data = train_data,
                   n.trees = 100,
                   n.minobsinnode = 10,
                   shrinkage = 0.001,      # learning rate
                   bag.fraction = 0.5,
                   train.fraction = 1.0,
                   cv.folds = 5)
  TDB.pca <- TDboost(insu.prem ~ .,
                     distribution = list(name = "EDM", alpha = 1.5),
                     data = pca_train_data,
                     n.trees = 100,
                     n.minobsinnode = 10,
                     shrinkage = 0.001,    # learning rate
                     bag.fraction = 0.5,
                     train.fraction = 1.0,
                     cv.folds = 5)

  pred.TDB.X   <- predict(TDB.X,   newdata = test_data,     n.trees = 100, type = "response")
  pred.TDB.pca <- predict(TDB.pca, newdata = pca_test_data, n.trees = 100, type = "response")

  error.TDB.X[i]   <- mean((pred.TDB.X   - as.numeric(as.matrix(test_y)))^2)
  error.TDB.pca[i] <- mean((pred.TDB.pca - as.numeric(as.matrix(test_y)))^2)
}
mean.error.TDB.X   <- mean(error.TDB.X)
mean.error.TDB.pca <- mean(error.TDB.pca)
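A matching sketch for the TDboost errors, plus a side-by-side comparison of the four mean cross-validation errors computed above:

## Sketch: selected per-fold errors for TDboost and a comparison of the mean errors.
folds_shown <- c(1, 5, 9, 13, 17)
TDB_summary <- rbind("Original Data" = c(error.TDB.X[folds_shown],   mean.error.TDB.X),
                     "PCA Data"      = c(error.TDB.pca[folds_shown], mean.error.TDB.pca))
colnames(TDB_summary) <- c(paste0("cv", folds_shown), "mean")
round(TDB_summary)

## Mean 20-fold CV error of each model on the original and the PCA-reduced data.
round(c(HDtweedie.X   = mean.error.HD.X,  HDtweedie.pca = mean.error.HD.pca,
        TDboost.X     = mean.error.TDB.X, TDboost.pca   = mean.error.TDB.pca))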