knitr::include_graphics('/Users/Biljana/R files/PA/anz_image.png')
This task is based on a synthesised transaction dataset containing 3 months' worth of transactions for 100 hypothetical customers. It contains purchases, recurring transactions, and salary transactions. The dataset is designed to simulate the realistic transaction behaviours observed in ANZ's real transaction data.
Our first task is to identify the annual salary for each customer and to explore correlations between annual salary and various customer attributes. These attributes may be readily available in the data (e.g. age) or constructed and derived by us (e.g. attributes relating to purchasing behaviour). Once we have found some correlations, our second task is to predict the annual salary for each customer using the attributes identified above.
In order to achieve this goal, we will spot-check a number of standard linear and non-linear algorithms in R. In general, our analysis can be broken down into 5 steps:

1. Load and summarise the data.
2. Visualise the individual attributes and the interactions between them.
3. Spot-check a suite of algorithms using repeated cross-validation.
4. Improve accuracy through feature selection, data transforms, algorithm tuning, and ensembles.
5. Finalise the best model and evaluate it on a held-back validation dataset.
knitr::opts_chunk$set(fig.width = 6, fig.height = 7,
warning = FALSE, message = FALSE)
# Import the dataset
anz_data <- read.csv("/Users/Biljana/R files/PA/ANZ synthesised transaction dataset.csv",
header = TRUE, na.strings = c("","NA"))
# Change the format of date column
anz_data$date <- as.Date(anz_data$date, format = "%d/%m/%Y")
knitr::kable(head(anz_data[1:6, 1:7]),
caption = "Table 1: Glimse at the first seven columns of the
ANZ synthesised transaction dataset",
align = 'c', format = "markdown")
| status | card_present_flag | bpay_biller_code | account | currency | long_lat | txn_description |
|---|---|---|---|---|---|---|
| authorized | 1 | NA | ACC-1598451071 | AUD | 153.41 -27.95 | POS |
| authorized | 0 | NA | ACC-1598451071 | AUD | 153.41 -27.95 | SALES-POS |
| authorized | 1 | NA | ACC-1222300524 | AUD | 151.23 -33.94 | POS |
| authorized | 1 | NA | ACC-1037050564 | AUD | 153.10 -27.66 | SALES-POS |
| authorized | 1 | NA | ACC-1598451071 | AUD | 153.41 -27.95 | SALES-POS |
| posted | NA | NA | ACC-1608363396 | AUD | 151.22 -33.87 | PAYMENT |
# Create the initial dataframe to store the results
df_customer = data.frame(customer_id = unique(anz_data$customer_id))
# Create a statistical mode function (most frequent value) to determine the
# salary payment frequency; note it masks base R's mode(), which returns a
# storage mode rather than the statistical mode
mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
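# A quick illustration with hypothetical values: for gaps of 7, 7, 14 and 7
# days between salary dates, mode() recovers the typical 7-day pay cycle
mode(c(7, 7, 14, 7))
## [1] 7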
# Create a loop to process through all salary payments for each customer
for (i in seq(nrow(df_customer))) {
transaction_data <- anz_data[anz_data$customer_id == df_customer$customer_id[i]
& anz_data$txn_description ==
"PAY/SALARY",c("amount","date")] %>%
dplyr::group_by(date) %>%
dplyr::summarise(amount = sum(amount))
total_sum <- sum(transaction_data$amount)
count = dim(transaction_data)[1]
if (count == 0) {
df_customer$frequency[i] = NA
df_customer$level[i] = NA
} else {
s = c()
lvl = c()
for (j in seq(count - 1)) {
s = c(s,(transaction_data$date[j + 1] - transaction_data$date[j]))
lvl = c(lvl,transaction_data$amount[j])}
lvl = c(lvl,tail(transaction_data$amount,n = 1))
df_customer$frequency[i] = mode(s)
df_customer$level[i] = mode(lvl)
}
}
df_customer$annual_salary = df_customer$level / df_customer$frequency * 365.25
df_customer_table <- as.data.frame(df_customer)
knitr::kable(head(df_customer_table, n = 10),
caption = "Table 2: Calculated annual salary by customer",
align = 'c', format = "markdown")
| customer_id | frequency | level | annual_salary |
|---|---|---|---|
| CUS-2487424745 | 7 | 1013.67 | 52891.85 |
| CUS-2142601169 | 7 | 1002.13 | 52289.71 |
| CUS-1614226872 | 7 | 892.09 | 46547.98 |
| CUS-2688605418 | 14 | 2320.30 | 60534.97 |
| CUS-4123612273 | 7 | 1068.04 | 55728.80 |
| CUS-3026014945 | 14 | 2840.15 | 74097.48 |
| CUS-2031327464 | 7 | 2280.36 | 118985.93 |
| CUS-2317998716 | 14 | 2639.76 | 68869.45 |
| CUS-1462656821 | 14 | 3903.95 | 101851.27 |
| CUS-3142625864 | 7 | 2588.01 | 135038.66 |
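As a quick sanity check on the formula above, take the first row of Table 2: a salary of $1,013.67 arriving every 7 days annualises to

1013.67 / 7 * 365.25
## [1] 52891.85

which matches the calculated annual_salary for customer CUS-2487424745.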
# Distribution of customers' annual salary
hist(df_customer$annual_salary[!is.na(df_customer$annual_salary)],breaks = c(seq(28000,140000,by = 10000)),
col = rgb(0,0,0,0.5), main = "Histogram of Annual Salary", xlab = 'Income, ($)')
# Create a dataframe to summarize customers' consumption behavior
df_attributes <- anz_data %>%
# use anz_data to summarize customers' consumption behavior
dplyr::select(customer_id, gender, age, amount, date, balance) %>%
group_by(customer_id) %>%
dplyr::mutate(num_trans = n(),
avg_weekly_trans = round(7*n()/length(unique(date)),0),
max_amount = max(amount),
num_large_trans = sum(amount > 100),
# an arbitrary $100 benchmark is selected
use_num_day = length(unique(date)),
avg_trans_amount = mean(amount, na.rm = TRUE),
med_balance = median(balance,na.rm = TRUE)) %>%
dplyr::select(-c("amount","date","balance")) %>%
unique()
# Assign gender as binary numeric variable
df_attributes$gender_num <- ifelse(df_attributes$gender == "M",1,0)
# Assign age by groups as binary numeric variables
df_attributes$age_below20 <- ifelse(df_attributes$age < 20,1,0)
df_attributes$age_btw20n40 <- ifelse(df_attributes$age >= 20 & df_attributes$age < 40,1,0)
df_attributes$age_btw40n60 <- ifelse(df_attributes$age >= 40 & df_attributes$age < 60,1,0)
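# An equivalent, more direct construction would use summarise() instead of
# mutate() + unique() -- a sketch assuming gender and age are constant within
# each customer and dplyr >= 1.0 (df_attributes_alt is a hypothetical name):
df_attributes_alt <- anz_data %>%
  dplyr::group_by(customer_id, gender, age) %>%
  dplyr::summarise(num_trans = dplyr::n(),
                   avg_weekly_trans = round(7 * dplyr::n() / length(unique(date)), 0),
                   max_amount = max(amount),
                   num_large_trans = sum(amount > 100),
                   use_num_day = length(unique(date)),
                   avg_trans_amount = mean(amount, na.rm = TRUE),
                   med_balance = median(balance, na.rm = TRUE),
                   .groups = "drop")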
# Merge all the attributes into single dataframe and select relevant attributes
customer_data <- merge(df_customer, df_attributes)
# Remove columns: customer_id, frequency and level
customer_data <- customer_data[ ,c(4:17)] # 14 columns left
# 1. Scatter plot of annual salary versus age
ggplot(customer_data, aes(x = age, y = annual_salary,
color = gender)) + geom_point(size = 2) +
stat_smooth(aes(group = 1), method = "lm", se = TRUE, position = "identity", col = "grey56") +
theme_ipsum() +
labs(title = "Annual Salary vs Age",
subtitle = "ANZ Customer Database", y = "Annual salary, ($)", x = "Age") +
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.title = element_blank(),
plot.title = element_text(vjust = 2, face = 'bold'),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0)))
# 2. Scatter plot of annual salary versus avg_trans_amount
ggplot(customer_data, aes(x = avg_trans_amount, y = annual_salary,
color = gender)) + geom_point(size = 2) +
stat_smooth(aes(group = 1), method = "lm", se = TRUE, position = "identity", col = "grey56") +
theme_ipsum() +
labs(title = "Annual Salary vs Average Transactions Amount",
subtitle = "ANZ Customer Database", y = "Annual salary, ($)", x = "Avgerage transactions amount, ($)") +
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.title = element_blank(),
plot.title = element_text(vjust = 2, face = 'bold'),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0)))
# 3. Scatter plot of annual salary versus maximum transactions amount
ggplot(customer_data, aes(x = max_amount, y = annual_salary,
color = gender)) + geom_point(size = 2) +
stat_smooth(aes(group = 1), method = "lm", se = TRUE, position = "identity", col = "grey56") +
theme_ipsum() +
labs(title = "Annual Salary vs Maximum Transactions Amount",
subtitle = "ANZ Customer Database", y = "Annual salary, ($)", x = "Maximum transactions amount, ($)") +
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.title = element_blank(),
plot.title = element_text(vjust = 2, face = 'bold'),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0)))
# 4. Scatter plot of annual salary versus number of transactions by customer
ggplot(customer_data, aes(x = num_trans, y = annual_salary,
color = gender)) + geom_point(size = 2) +
stat_smooth(aes(group = 1), method = "lm", se = TRUE, position = "identity", col = "grey56") +
theme_ipsum() +
labs(title = "Annual Salary vs Transactions Count",
subtitle = "ANZ Customer Database", y = "Annual salary, ($)", x = "Number of transactions by customer") +
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.title = element_blank(),
plot.title = element_text(vjust = 2, face = 'bold'),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(),
axis.title.x = element_text(size = 12, margin = margin(10,0,0,0)),
axis.title.y = element_text(size = 14, margin = margin(0,10,0,0)))
We will split the loaded dataset into two parts: 80% will be used to train our models and 20% will be held back as a validation dataset. Later, we will estimate the accuracy of the models we create on this validation dataset.
# First eliminate the categorical gender variable that was kept for the scatter plots above
customer_data <- customer_data[ ,-2] # 13 columns left
# Create train and test datasets
set.seed(7)
validationIndex <- createDataPartition(customer_data$annual_salary, p = 0.80, list = FALSE)
# select 20% of the data for validation
validation <- customer_data[-validationIndex,]
# use the remaining 80% of the data for training and testing the models
dataset <- customer_data[validationIndex,]
dim(dataset)
## [1] 80 13
We have 80 instances to work with and can confirm the data has 13 attributes, including the response variable annual_salary.
options(width = 100)
sapply(dataset, class)
## annual_salary age num_trans avg_weekly_trans max_amount
## "numeric" "integer" "integer" "numeric" "numeric"
## num_large_trans use_num_day avg_trans_amount med_balance gender_num
## "integer" "integer" "numeric" "numeric" "numeric"
## age_below20 age_btw20n40 age_btw40n60
## "numeric" "numeric" "numeric"
We can see that nine of the attributes are numeric while four are integers.
options(width = 100)
sum_m <- sapply(dataset, summary)
df_sum <- as.data.frame(sum_m)
t(df_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## annual_salary 29405.02008 49510.6158 59888.3467 67741.8712 83591.9629 135038.6646
## age 18.00000 22.0000 28.0000 31.3500 38.5000 78.0000
## num_trans 25.00000 78.0000 102.0000 113.0250 137.5000 292.0000
## avg_weekly_trans 8.00000 11.0000 13.0000 13.9125 16.0000 27.0000
## max_amount 761.33000 1406.8100 2160.1200 2458.5293 3068.9650 8835.9800
## num_large_trans 5.00000 15.0000 19.0000 19.0000 24.0000 35.0000
## use_num_day 19.00000 43.7500 56.0000 54.5125 66.0000 84.0000
## avg_trans_amount 74.46502 156.4472 208.0583 233.3170 289.2953 693.6329
## med_balance 1204.40000 4940.1750 7361.7675 16970.3526 13447.7850 199392.6800
## gender_num 0.00000 0.0000 1.0000 0.5875 1.0000 1.0000
## age_below20 0.00000 0.0000 0.0000 0.1250 0.0000 1.0000
## age_btw20n40 0.00000 0.0000 1.0000 0.6250 1.0000 1.0000
## age_btw40n60 0.00000 0.0000 0.0000 0.2250 0.0000 1.0000
# Compute skewness of each attribute (skewness() as provided by, e.g., the e1071 package)
skew <- apply(dataset, 2, skewness)
print(skew)
## annual_salary age num_trans avg_weekly_trans max_amount
## 0.83908429 1.31195247 1.00253616 1.01438773 1.64122927
## num_large_trans use_num_day avg_trans_amount med_balance gender_num
## -0.06872131 -0.25306455 1.43340744 4.25679392 -0.34884122
## age_below20 age_btw20n40 age_btw40n60
## 2.22539899 -0.50674564 1.29248715
The further the skew value is from zero, the more skewed the distribution: a negative value indicates a skew to the left and a positive value a skew to the right.
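For reference, a common moment-based estimator of sample skewness (packages such as e1071 implement small variations on it) is

$$g_1 = \frac{\tfrac{1}{n}\sum_{i=1}^{n}(x_i - \bar{x})^3}{\left(\tfrac{1}{n}\sum_{i=1}^{n}(x_i - \bar{x})^2\right)^{3/2}}.$$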
Let’s look at visualizations of individual attributes, starting with histograms to get a sense of the data distributions. Judging by the skew values above, some of the attributes have right-skewed distributions, such as annual_salary, max_amount, avg_trans_amount and med_balance. We are going to keep the data intact for now.
# Plot histograms for each input (explanatory) variable
par(mfrow = c(4,3), oma = c(4,3,4,3), mar = c(1,4,3,2))
for (i in 2:13) {
graphics::hist(dataset[,i], main = names(dataset[i]), xlab = "", col = "grey50")
}
# Density plots for each variable
mycolors = c('red','darkturquoise','blue',"orange", "magenta1","green", "red",
"darkturquoise", "blue", "orange", "magenta1", "green", "red")
par(mfrow = c(4,3), oma = c(0,1,0,1), mar = c(1,6,3,2))
for (i in 2:13) {
graphics::plot(density(dataset[,i]), main = names(dataset)[i], col = mycolors[i])
}
We can see that the attributes have different ranges and that some variables have extreme outliers.
# Box plots for each variable
par(mfrow = c(4,3), oma = c(0,1,0,1), mar = c(1,6,2,2))
for (i in 2:13) {
graphics::boxplot(dataset[,i], main = names(dataset)[i], col = mycolors[i])
}
Let’s look at some visualizations of the interactions between variables, starting with a scatterplot matrix. We can see that some of the more highly correlated attributes do show good structure in their relationship: there is a linear (diagonal) relationship between annual_salary and max_amount, and between num_trans and avg_weekly_trans.
# Scatterplot matrices for all attributes
dataset_scatter <- sapply(dataset[ , 1:9], jitter)
pairs(dataset_scatter[ , 1:9], col = mycolors)
Let’s take a look at the correlation between each pair of numeric attributes, excluding the response variable. This is interesting. We can see that some of the attributes have a strong correlation (e.g. > 0.70 or < −0.70): for example, num_trans and avg_weekly_trans, where the latter is derived directly from the transaction count.
These attributes are candidates for removal, because correlated attributes reduce the accuracy of linear algorithms. This is collinearity, and we may see better results with regression algorithms if the correlated attributes are removed. Later, we will also transform the data using the powerful data transforms provided by the caret package.
# Correlation between all of the numeric attributes except annual_salary
annual_salary <- dataset$annual_salary # assign annual_salary variable as vector
# Create a dataframe for correlation plot
dataset_cor <- dataset[ ,-1]
# Plot the correlation matrix
correlation_matrix <- cor(dataset_cor)
corrplot.mixed(correlation_matrix, order = "hclust", tl.pos = "lt",
upper = "ellipse")
There is a lot of structure in this dataset. We need to think about transformations that we could apply, such as:

- standardizing the attributes (centering and scaling) to bring them onto a common range,
- a Box-Cox power transform to reduce the skew seen in several attributes,
- removing highly correlated attributes to reduce collinearity.

All three are explored in the modelling steps below.
We will select a number of different algorithms capable of working on this regression problem and use 10-fold cross-validation with 3 repeats, evaluating the algorithms with the RMSE and R2 metrics. RMSE, or Root Mean Squared Error, is the average deviation of the predictions from the observations; it gives a gross idea of how well (or not) an algorithm is performing, in the units of the response variable, with values closest to zero being best. R2 (R Squared), also called the coefficient of determination, provides a goodness-of-fit measure of the predictions to the observations; it ranges between 0 and 1 for no fit and perfect fit respectively.
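For completeness, with observations $y_i$, predictions $\hat{y}_i$, their mean $\bar{y}$, and $n$ held-out instances, these metrics are

$$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i - y_i)^2}, \qquad R^2 = 1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}.$$

(Note that caret's resampling summaries compute Rsquared as the squared correlation between predictions and observations, which agrees with this definition for well-calibrated predictions.)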
The 6 algorithms selected are:

- Linear Regression (LM)
- Generalized Linear Model (GLM)
- Penalized Linear Regression (GLMNET)
- Support Vector Machines with a radial basis kernel (SVM)
- Classification and Regression Trees (CART)
- k-Nearest Neighbors (KNN)
The summary of estimated model accuracy shows that GLMNET and SVM have the lowest average RMSE values, followed closely by the other linear algorithms, GLM and LM. We can also see that GLMNET achieves the best median R2 measure.
# Run algorithms using 10-fold cross validation
trainControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
metric <- "RMSE"
# LM
set.seed(7)
fit.lm <- train(annual_salary~., data = dataset, method = "lm",
metric = metric, preProc = c("center","scale"), trControl = trainControl)
# GLM
set.seed(7)
fit.glm <- train(annual_salary~., data = dataset, method = "glm",
metric = metric, preProc = c("center","scale"), trControl = trainControl)
# GLMNET
set.seed(7)
fit.glmnet <- train(annual_salary~., data = dataset, method = "glmnet", metric = metric,
preProc = c("center", "scale"), trControl = trainControl)
# SVM
set.seed(7)
fit.svm <- train(annual_salary~., data = dataset, method = "svmRadial", metric = metric,
preProc = c("center", "scale"), trControl = trainControl)
# CART
set.seed(7)
grid <- expand.grid(.cp = c(0, 0.05, 0.1))
fit.cart <- train(annual_salary~., data = dataset, method = "rpart", metric = metric, tuneGrid = grid,
preProc = c("center", "scale"), trControl = trainControl)
# KNN
set.seed(7)
fit.knn <- train(annual_salary~., data = dataset, method = "knn", metric = metric, preProc = c("center","scale"), trControl = trainControl)
# Compare algorithms
results <- resamples(list(LM = fit.lm, GLM = fit.glm, GLMNET = fit.glmnet, SVM = fit.svm,
CART = fit.cart, KNN = fit.knn))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: LM, GLM, GLMNET, SVM, CART, KNN
## Number of resamples: 30
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 5013.393 8308.151 11425.22 11422.86 15035.91 21502.90 0
## GLM 5013.393 8308.151 11425.22 11422.86 15035.91 21502.90 0
## GLMNET 4696.183 9023.446 10981.26 11943.78 14743.57 22107.69 0
## SVM 4602.416 9575.353 10914.46 11427.42 13555.50 18439.23 0
## CART 9131.547 14255.133 17228.12 16804.42 18871.45 25307.57 0
## KNN 9616.880 14605.725 16202.39 16371.10 19155.68 26068.12 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 5480.502 10553.82 13911.31 15574.26 20038.16 29546.19 0
## GLM 5480.502 10553.82 13911.31 15574.26 20038.16 29546.19 0
## GLMNET 5343.156 10848.22 14312.71 15207.22 19039.37 28172.47 0
## SVM 6046.713 11857.18 15291.45 15158.98 18298.61 27016.83 0
## CART 12333.291 18028.91 22724.39 22005.19 24238.21 32554.89 0
## KNN 11434.658 17087.63 19426.16 20793.92 24240.70 33927.36 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 0.2360709475 0.6610246 0.7799467 0.7288462 0.8520424 0.9414055 0
## GLM 0.2360709475 0.6610246 0.7799467 0.7288462 0.8520424 0.9414055 0
## GLMNET 0.3149319987 0.5752619 0.7819953 0.7182934 0.8805322 0.9784577 0
## SVM 0.4264874171 0.5986924 0.7578499 0.7295999 0.8626795 0.9595145 0
## CART 0.0005629575 0.1100722 0.3720714 0.3446217 0.5927935 0.7030182 0
## KNN 0.0258492429 0.2652490 0.3923599 0.4197579 0.6317977 0.8655879 0
dotplot(results)
In this step we will remove the highly correlated attributes and see what effect that has on the evaluation metrics. The findCorrelation() function from the caret package identified the num_trans attribute as highly correlated. Now let's try the same 6 algorithms from our baseline experiment. Comparing the results, we can see that the removal has made the RMSE worse, or left it largely unchanged, for both the linear and the non-linear algorithms: the correlated attribute we removed was contributing to the accuracy of the models.
Highly correlated attributes
# Remove correlated attributes
set.seed(7)
cutoff <- 0.70
highly_correlated <- findCorrelation(correlation_matrix, cutoff = cutoff)
for (value in highly_correlated) {
print(names(dataset_cor)[value])
}
## [1] "num_trans"
# Create a new dataset without highly corrected features
dataset_features <- dataset_cor[,-highly_correlated]
# Attach annual_salary to the df
dataset_features <- cbind(dataset_features, annual_salary)
Summary of estimated model accuracy
# Run algorithms using 10-fold cross validation
trainControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
metric <- "RMSE"
# lm
set.seed(7)
fit.lm <- train(annual_salary~., data = dataset_features, method = "lm", metric = metric,
preProc = c("center", "scale"), trControl = trainControl)
# GLM
set.seed(7)
fit.glm <- train(annual_salary~., data = dataset_features, method = "glm", metric = metric,
preProc = c("center", "scale"), trControl = trainControl)
# GLMNET
set.seed(7)
fit.glmnet <- train(annual_salary~., data = dataset_features, method = "glmnet", metric = metric,
preProc = c("center", "scale"), trControl = trainControl)
# SVM
set.seed(7)
fit.svm <- train(annual_salary~., data = dataset_features, method = "svmRadial", metric = metric,
preProc = c("center", "scale"), trControl = trainControl)
# CART
set.seed(7)
grid <- expand.grid(.cp = c(0, 0.05, 0.1))
fit.cart <- train(annual_salary~., data = dataset_features, method = "rpart", metric = metric,
tuneGrid = grid, preProc = c("center", "scale"), trControl = trainControl)
# KNN
set.seed(7)
fit.knn <- train(annual_salary~., data = dataset_features, method = "knn", metric = metric,
preProc = c("center", "scale"), trControl = trainControl)
# Compare algorithms
feature_results <- resamples(list(LM = fit.lm, GLM = fit.glm, GLMNET = fit.glmnet, SVM = fit.svm,
CART = fit.cart, KNN = fit.knn))
summary(feature_results)
##
## Call:
## summary.resamples(object = feature_results)
##
## Models: LM, GLM, GLMNET, SVM, CART, KNN
## Number of resamples: 30
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 4694.446 8275.321 11353.58 12039.05 15295.84 24125.61 0
## GLM 4694.446 8275.321 11353.58 12039.05 15295.84 24125.61 0
## GLMNET 4696.183 9023.446 10981.21 11937.91 14743.57 22107.69 0
## SVM 4217.368 9016.490 11072.03 11341.85 13403.26 18866.25 0
## CART 9131.547 13802.408 17557.47 16742.37 19522.47 25307.57 0
## KNN 7751.733 13624.079 15592.26 15824.24 18676.36 23841.36 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 6133.993 10401.21 15454.56 16196.10 20383.13 31821.85 0
## GLM 6133.993 10401.21 15454.56 16196.10 20383.13 31821.85 0
## GLMNET 5343.156 10848.22 14312.71 15201.20 19016.39 28172.47 0
## SVM 5540.528 11659.49 15353.94 15118.98 18275.07 27233.32 0
## CART 12333.291 18028.91 22724.39 21972.94 24967.15 32554.89 0
## KNN 8930.242 16724.38 19847.35 20250.48 23456.42 30601.62 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 0.2207560902 0.5611020 0.7428550 0.6833884 0.8334635 0.9840862 0
## GLM 0.2207560902 0.5611020 0.7428550 0.6833884 0.8334635 0.9840862 0
## GLMNET 0.3203855514 0.5752619 0.7820678 0.7185651 0.8805322 0.9784577 0
## SVM 0.4358797656 0.5705037 0.7708142 0.7282387 0.8732897 0.9739348 0
## CART 0.0004506748 0.1076372 0.3420854 0.3542048 0.6399029 0.7121509 0
## KNN 0.0018784617 0.2459110 0.4074271 0.4156191 0.6007388 0.9120653 0
dotplot(feature_results)
Given the skewed distributions observed earlier, let's rerun the same 6 algorithms on the full feature set, this time adding a Box-Cox power transform to the pre-processing. We can see that this transformation indeed decreased the RMSE and increased the R2 for all algorithms except CART. The RMSE of GLMNET dropped to an average of 9499.142, the lowest value so far.
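For reference, the Box-Cox transform estimates a power $\lambda$ for each strictly positive attribute (which is why only 8 of the 12 predictors are transformed, as the model output further below shows) and maps

$$y^{(\lambda)} = \begin{cases} \dfrac{y^{\lambda} - 1}{\lambda}, & \lambda \neq 0, \\ \log y, & \lambda = 0. \end{cases}$$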
# Run algorithms using 10-fold cross validation
trainControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
metric <- "RMSE"
# lm
set.seed(7)
fit.lm <- train(annual_salary~., data = dataset, method = "lm", metric = metric,
preProc = c("center", "scale", "BoxCox"), trControl = trainControl)
# GLM
set.seed(7)
fit.glm <- train(annual_salary~., data = dataset, method = "glm", metric = metric,
preProc = c("center", "scale", "BoxCox"), trControl = trainControl)
# GLMNET
set.seed(7)
fit.glmnet <- train(annual_salary~., data = dataset, method = "glmnet", metric = metric,
preProc = c("center", "scale", "BoxCox"), trControl = trainControl)
# SVM
set.seed(7)
fit.svm <- train(annual_salary~., data = dataset, method = "svmRadial", metric = metric,
preProc = c("center", "scale", "BoxCox"), trControl = trainControl)
# CART
set.seed(7)
grid <- expand.grid(.cp = c(0, 0.05, 0.1))
fit.cart <- train(annual_salary~., data = dataset, method = "rpart",
metric = metric, tuneGrid = grid, preProc = c("center", "scale", "BoxCox"), trControl = trainControl)
# KNN
set.seed(7)
fit.knn <- train(annual_salary~., data = dataset, method = "knn", metric = metric,
preProc = c("center", "scale", "BoxCox"), trControl = trainControl)
# Compare algorithms
transformResults <- resamples(list(LM = fit.lm, GLM = fit.glm, GLMNET = fit.glmnet, SVM = fit.svm,
CART = fit.cart, KNN = fit.knn))
summary(transformResults)
##
## Call:
## summary.resamples(object = transformResults)
##
## Models: LM, GLM, GLMNET, SVM, CART, KNN
## Number of resamples: 30
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 4700.177 6369.428 7672.118 8158.791 9987.886 14931.29 0
## GLM 4700.177 6369.428 7672.118 8158.791 9987.886 14931.29 0
## GLMNET 3794.044 5807.775 6761.051 7648.689 8826.644 14989.22 0
## SVM 3216.751 7900.375 9523.904 9619.287 12076.117 14479.66 0
## CART 9131.547 14255.133 17228.121 16804.415 18871.452 25307.57 0
## KNN 8001.910 12283.028 14394.338 14421.760 17937.115 19914.27 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 5055.555 7949.744 9211.659 9973.989 11566.45 16958.22 0
## GLM 5055.555 7949.744 9211.659 9973.989 11566.45 16958.22 0
## GLMNET 3961.752 6886.772 8562.749 9499.142 12574.83 17207.80 0
## SVM 3876.917 10212.262 12556.541 12668.528 15924.99 20910.63 0
## CART 12333.291 18028.914 22724.389 22005.192 24238.21 32554.89 0
## KNN 8751.421 15540.670 17605.033 18042.219 21607.26 25350.09 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 0.6093564525 0.8713789 0.9144807 0.8807190 0.9419504 0.9674493 0
## GLM 0.6093564525 0.8713789 0.9144807 0.8807190 0.9419504 0.9674493 0
## GLMNET 0.6347990698 0.8862601 0.9213778 0.8917791 0.9529729 0.9801058 0
## SVM 0.5503851117 0.7257945 0.8401372 0.8068209 0.8974803 0.9588406 0
## CART 0.0005629575 0.1100722 0.3720714 0.3446217 0.5927935 0.7030182 0
## KNN 0.1309540262 0.3501362 0.5547306 0.5412556 0.7379328 0.9212458 0
dotplot(transformResults)
The accuracy of the well performing algorithms can be improved by tuning their parameters. We will consider tuning the parameters of GLMNET (Penalized Linear Regression) model. The GLMNET model actually fits many models at once. We can exploit this by passing a large number of lambda values, which control the amount of penalization in the model. train() is smart enough to only fit one model per alpha value and pass all of the lambda values at once for simultaneous fitting.
Often it is more useful to simply think of alpha as controlling the mixing between the two penalties and lambda as controlling the amount of penalization. Alpha takes values between 0 and 1: alpha = 1 is pure lasso regression and alpha = 0 is pure ridge regression. We will fit a mixture of the two models (i.e. an elastic net) using an alpha between 0 and 1; for example, alpha = 0.05 would be 95% ridge regression and 5% lasso regression. The final values used for our model are alpha = 0.55 and lambda = 286.5643.
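Concretely, for a Gaussian response glmnet minimizes the penalized least-squares objective

$$\min_{\beta_0,\,\beta}\ \frac{1}{2n}\sum_{i=1}^{n}\bigl(y_i - \beta_0 - x_i^{\top}\beta\bigr)^2 + \lambda\left(\alpha\lVert\beta\rVert_1 + \frac{1-\alpha}{2}\lVert\beta\rVert_2^2\right),$$

so lambda scales the overall penalty and alpha mixes the lasso ($\ell_1$) and ridge ($\ell_2$) terms.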
In the obtained results, we can notice that we have tried three alpha values: 0.10, 0.55, and 1. The best result uses alpha = 0.55 which is somewhere between ridge and lasso.
Now we try a much larger model search with an expanded grid: 21 alpha values between 0 and 1, each crossed with a set of candidate lambda values. We do not need to worry too much about overfitting the tuning process, because this is penalized regression. This gives us a decent RMSE = 9379.896.
Estimated accuracy of GLMNET
print(fit.glmnet)
## glmnet
##
## 80 samples
## 12 predictors
##
## Pre-processing: centered (12), scaled (12), Box-Cox transformation (8)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 72, 72, 72, 72, 72, 72, ...
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.10 28.65643 9867.532 0.8832388 8064.130
## 0.10 286.56429 9660.667 0.8881459 7815.917
## 0.10 2865.64287 11352.496 0.8444213 9026.426
## 0.55 28.65643 9847.609 0.8838320 8050.279
## 0.55 286.56429 9499.142 0.8917791 7648.689
## 0.55 2865.64287 11514.538 0.8469132 8973.647
## 1.00 28.65643 9846.204 0.8840158 8045.478
## 1.00 286.56429 9504.850 0.8910525 7668.987
## 1.00 2865.64287 12623.286 0.8241667 9845.055
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.55 and lambda = 286.5643.
# We first setup our cross-validation strategy
trainControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
metric <- "RMSE"
set.seed(7)
# Note: `length = 250` inside c() is not an argument to c(); it is appended as
# an extra (named) element, so the candidate lambdas are 0.1-1.0 plus 250
tuneGridb <- expand.grid(.alpha = seq(0, 1, 0.05),
                         .lambda = c(1:10/10, length = 250))
anz_elnet_int = train(annual_salary ~., data = dataset, method = "glmnet", metric = metric,
preProc = c("BoxCox"), trControl = trainControl, tuneLength = 10,
tuneGrid = tuneGridb)
plot(anz_elnet_int, main = "Algorithm Tuning Results for GLMNET on the ANZ Dataset")
get_best_result = function(caret_fit) {
best = which(rownames(caret_fit$results) == rownames(caret_fit$bestTune))
best_result = caret_fit$results[best, ]
rownames(best_result) = NULL
best_result
}
get_best_result(anz_elnet_int)
## alpha lambda RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.85 250 9379.896 0.8815389 7501.152 2965.531 0.09541763 2160.812
We can try some ensemble methods based on boosting and bagging techniques for decision trees. Our goal is to see if we can further decrease our RMSE value. Let's take a closer look at the following ensemble methods:

- Random Forest, RF (bagging)
- Stochastic Gradient Boosting, GBM (boosting)
- Cubist, CUBIST (rule-based model with boosting-like committees)
Our results clearly demonstrate that Cubist is the most accurate, with an average RMSE of 9221.86, which is lower than that achieved by tuning GLMNET. Let's dive deeper into Cubist and see if we can tune it further and get more skill out of it. Cubist has two parameters that are tunable with caret: committees, the number of boosting operations, and neighbors, the number of instances used at prediction time to correct the rule-based prediction. Let's first look at the default tuning parameters used by caret that produced this accurate model.
# Try ensembles
trainControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
metric <- "RMSE"
# Random Forest
set.seed(7)
fit.rf <- train(annual_salary~., data = dataset, method = "rf", metric = metric,
preProc = c("BoxCox"), trControl = trainControl)
# Stochastic Gradient Boosting
set.seed(7)
fit.gbm <- train(annual_salary~., data = dataset, method = "gbm", metric = metric,
preProc = c("BoxCox"), trControl = trainControl, verbose = FALSE)
# Cubist
set.seed(7)
fit.cubist <- train(annual_salary~., data = dataset, method = "cubist", metric = metric,
preProc = c("BoxCox"), trControl = trainControl)
# Compare algorithms
ensemble_results <- resamples(list(RF = fit.rf, GBM = fit.gbm, CUBIST = fit.cubist))
summary(ensemble_results)
##
## Call:
## summary.resamples(object = ensemble_results)
##
## Models: RF, GBM, CUBIST
## Number of resamples: 30
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RF 5813.646 9084.092 10548.051 11516.942 14310.876 18501.49 0
## GBM 3574.502 8787.078 11063.645 10730.560 12821.759 15760.63 0
## CUBIST 3658.280 6023.502 7019.405 7400.854 8244.081 12162.72 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RF 6898.140 11011.773 13338.947 14788.95 19026.95 26072.81 0
## GBM 4712.571 11018.086 13175.517 13670.74 17075.19 21755.96 0
## CUBIST 4645.985 7208.399 8133.389 9221.86 11140.72 15821.21 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RF 0.3120527 0.6083242 0.7277457 0.7145160 0.8366432 0.9239531 0
## GBM 0.5130901 0.6500458 0.7620168 0.7481148 0.8343221 0.9522939 0
## CUBIST 0.6325717 0.8837906 0.9131694 0.8879406 0.9469858 0.9661574 0
dotplot(ensemble_results)
# Cubist's parameters
print(fit.cubist)
## Cubist
##
## 80 samples
## 12 predictors
##
## Pre-processing: Box-Cox transformation (8)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 72, 72, 72, 72, 72, 72, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 10556.082 0.8541639 8519.924
## 1 5 10105.795 0.8593155 8026.975
## 1 9 10120.205 0.8654163 8250.717
## 10 0 9487.879 0.8907622 7735.895
## 10 5 9342.679 0.8864379 7468.185
## 10 9 9420.510 0.8912483 7662.553
## 20 0 9368.750 0.8919549 7682.739
## 20 5 9221.860 0.8879406 7400.854
## 20 9 9306.174 0.8926766 7580.644
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
We can see that the best RMSE was achieved with committees = 20 and neighbors = 5. Let's use a grid search to tune around those values: we will try all committees between 5 and 25 and spot-check neighbors values above and below 5. We can see that we have achieved a more accurate model again, with an RMSE of 9136.876 using committees = 24 and neighbors = 7.
# Tune the Cubist algorithm
trainControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
metric <- "RMSE"
set.seed(7)
grid <- expand.grid(.committees = seq(5, 25, by = 1), .neighbors = c(3, 5, 7))
tune.cubist <- train(annual_salary~., data = dataset, method = "cubist", metric = metric,
preProc = c("BoxCox"), tuneGrid = grid, trControl = trainControl)
print(tune.cubist)
## Cubist
##
## 80 samples
## 12 predictors
##
## Pre-processing: Box-Cox transformation (8)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 72, 72, 72, 72, 72, 72, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 5 3 9601.134 0.8737252 7736.086
## 5 5 9367.776 0.8835248 7507.525
## 5 7 9307.381 0.8883369 7539.951
## 6 3 9550.664 0.8758472 7729.873
## 6 5 9269.868 0.8872149 7471.241
## 6 7 9195.297 0.8924788 7508.316
## 7 3 9634.476 0.8739306 7740.890
## 7 5 9388.624 0.8845605 7519.659
## 7 7 9322.539 0.8893933 7555.389
## 8 3 9580.128 0.8757120 7742.598
## 8 5 9292.956 0.8872766 7452.737
## 8 7 9233.247 0.8921275 7488.004
## 9 3 9675.263 0.8738587 7793.435
## 9 5 9405.144 0.8849408 7515.474
## 9 7 9335.576 0.8898538 7557.642
## 10 3 9622.303 0.8753008 7745.126
## 10 5 9342.679 0.8864379 7468.185
## 10 7 9285.231 0.8913310 7492.024
## 11 3 9640.485 0.8740224 7754.795
## 11 5 9373.421 0.8848132 7494.869
## 11 7 9306.717 0.8897718 7543.901
## 12 3 9593.599 0.8749305 7719.005
## 12 5 9310.871 0.8859963 7446.077
## 12 7 9248.999 0.8910498 7474.412
## 13 3 9605.108 0.8751531 7731.220
## 13 5 9331.581 0.8860397 7481.190
## 13 7 9262.807 0.8911609 7510.812
## 14 3 9557.029 0.8754854 7684.711
## 14 5 9265.512 0.8868577 7409.424
## 14 7 9205.992 0.8919293 7441.344
## 15 3 9590.262 0.8755992 7736.234
## 15 5 9314.227 0.8865548 7476.255
## 15 7 9239.911 0.8916977 7504.699
## 16 3 9539.243 0.8759907 7681.529
## 16 5 9243.616 0.8874222 7404.375
## 16 7 9182.958 0.8924552 7428.913
## 17 3 9550.435 0.8760057 7694.123
## 17 5 9260.692 0.8873698 7417.371
## 17 7 9201.515 0.8923136 7450.524
## 18 3 9534.056 0.8761320 7681.827
## 18 5 9233.362 0.8876934 7407.884
## 18 7 9178.574 0.8926358 7432.080
## 19 3 9541.955 0.8761006 7687.106
## 19 5 9250.982 0.8875313 7414.266
## 19 7 9190.123 0.8924571 7440.233
## 20 3 9522.000 0.8763283 7674.462
## 20 5 9221.860 0.8879406 7400.854
## 20 7 9164.651 0.8928322 7417.675
## 21 3 9516.084 0.8766609 7663.840
## 21 5 9220.710 0.8882621 7394.116
## 21 7 9166.552 0.8930865 7420.285
## 22 3 9498.098 0.8770302 7651.156
## 22 5 9197.374 0.8887549 7381.627
## 22 7 9149.267 0.8935121 7406.740
## 23 3 9502.076 0.8768903 7653.688
## 23 5 9203.674 0.8886392 7385.601
## 23 7 9154.857 0.8933522 7416.417
## 24 3 9489.155 0.8772658 7651.982
## 24 5 9182.980 0.8891233 7376.918
## 24 7 9136.876 0.8938084 7404.540
## 25 3 9501.558 0.8771734 7652.485
## 25 5 9200.882 0.8889484 7387.331
## 25 7 9150.514 0.8936473 7418.957
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 24 and neighbors = 7.
It looks like this tuned Cubist model is our most accurate one.
plot(tune.cubist, main = "Tuning Parameters of Cubist on the ANZ Dataset")
In the last step we will finalize this process by creating a new standalone Cubist model with the parameters above, trained on the whole training dataset. Then we are ready to evaluate the model on the validation dataset.
# Prepare the data transform using training data
set.seed(7)
x <- dataset[,2:13]
y <- dataset[,1]
preprocessParams <- preProcess(x, method = c("BoxCox"))
transX <- predict(preprocessParams, x)
# Train the final model
final_model <- cubist(x = transX, y = y, committees = 24)
# Transform the validation dataset
set.seed(7)
valX <- validation[,2:13]
trans_valX <- predict(preprocessParams, valX)
valY <- validation[,1]
# Use final model to make predictions on the validation dataset
predictions <- predict(final_model, newdata = trans_valX, neighbors = 7)
# Calculate RMSE
rmse <- RMSE(predictions, valY)
r2 <- R2(predictions, valY)
# Estimated accuracy on validation dataset
rmse <- round(rmse, 2)
r2 <- round(r2, 4)
values <- c(rmse, r2)
names <- c("RMSE", "Rsquared")
setNames(values, names)
## RMSE Rsquared
## 13023.3900 0.7648
As we can see, the Cubist model delivers a reasonable balance between fitting the data and retaining predictive power, with an RMSE of about 13,023 and an R2 of 0.76 on the held-back validation dataset. Cubist models are also relatively easy to follow, which can be beneficial when convincing business leaders and users of the value the model can bring to the business.
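To illustrate that interpretability, the rules and per-rule linear models of the standalone Cubist model can be printed directly (a minimal sketch; the output is a plain-text listing of conditions and regression equations):

# Inspect the rule-based structure of the final Cubist model
summary(final_model)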