Welcome to the exercises of module 5! Please try to solve the following exercises, and if you do not remember how to write the code for certain things, have another look at the previous scripts. Programming is about learning by doing! So do not hesitate to look for help on the internet, too. Remember that there is almost always more than one way to solve a problem in R!
Import the wine data set and split it into a training (70%) and a testing (30%) data set. Create a random forest model, trained on the training data set, that predicts the parameter assigned to your group, using ntree = 500 and mtry = 4.
library(randomForest)
dir <- "C:/Users/Max/Desktop/IAMO_neu/eLearning/Module_5_Random_Forests/solutions/"
data <- read.csv(paste0(dir, "data/wine_taste.csv"))
data$taste <- as.factor(data$taste)
set.seed(123)
subsample <- sample(nrow(data), 0.7 * nrow(data))
train_set <- data[subsample, ]
test_set <- data[-subsample, ]
model_A <- randomForest(alcohol ~. , data = train_set, ntree = 500, mtry = 4, importance = TRUE)
Apply your trained model to the testing data set and report the R-Square between observed and predicted values. Change your model four times using different values for “ntree” and “mtry” and report the changes in R-Square between observed and predicted values.
## apply model from exercise 1
pred_A <- predict(model_A, newdata = test_set)
rsquare_A <- cor(pred_A, test_set$alcohol)^2
rsquare_A
## [1] 0.7765222
## train new models with different parameters
model_B <- randomForest(alcohol ~. , data = train_set, ntree = 10, mtry = 2)
model_C <- randomForest(alcohol ~. , data = train_set, ntree = 100, mtry = 2)
model_D <- randomForest(alcohol ~. , data = train_set, ntree = 10, mtry = 8)
model_E <- randomForest(alcohol ~. , data = train_set, ntree = 100, mtry = 8)
model_rsquares <- data.frame(model = c("B", "C", "D", "E"), ntree = c(10, 100, 10, 100), mtry = c(2, 2, 8, 8), rsquare = NA)
model_rsquares$rsquare[1] <- cor(predict(model_B, newdata = test_set), test_set$alcohol)^2
model_rsquares$rsquare[2] <- cor(predict(model_C, newdata = test_set), test_set$alcohol)^2
model_rsquares$rsquare[3] <- cor(predict(model_D, newdata = test_set), test_set$alcohol)^2
model_rsquares$rsquare[4] <- cor(predict(model_E, newdata = test_set), test_set$alcohol)^2
model_rsquares
| model | ntree | mtry | rsquare |
|---|---|---|---|
| B | 10 | 2 | 0.6973383 |
| C | 100 | 2 | 0.7459825 |
| D | 10 | 8 | 0.7432929 |
| E | 100 | 8 | 0.7770947 |
## --> the model with 10 trees and mtry = 2 performs worst; the model with 100 trees and mtry = 8 performs best!
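As a more compact alternative to fitting the four models by hand, the combinations of "ntree" and "mtry" can also be looped over; a minimal sketch using the same training and testing split (the object names "param_grid" and "m" are illustrative):
## build a small parameter grid and store the test-set R-Square for each combination
param_grid <- expand.grid(ntree = c(10, 100), mtry = c(2, 8))
param_grid$rsquare <- NA
for (i in 1:nrow(param_grid)) {
  m <- randomForest(alcohol ~ ., data = train_set,
                    ntree = param_grid$ntree[i], mtry = param_grid$mtry[i])
  param_grid$rsquare[i] <- cor(predict(m, newdata = test_set), test_set$alcohol)^2
}
param_grid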
What are the four most important variables in your model, according to the importance measure "%IncMSE"? Create partial dependence plots for these four variables, and export them as PNGs.
varImpPlot(model_A, type = 1, sort = TRUE, n.var = 11, main = "Variable Importance")
## the four most important variables in predicting alcohol content are:
## density, residual sugar, fixed acidity and sulphates
library(pdp)
library(ggplot2)
library(gridExtra)
## With partialPlot() you cannot easily arrange multiple plots in one frame and export them as a single PNG.
## However, you can achieve this with the "pdp" package and the functions partial(), autoplot() and grid.arrange(), as follows:
pd1 <- partial(model_A, pred.var = c("density"), chull = TRUE)
plot1 <- autoplot(pd1, contour = TRUE)
pd2 <- partial(model_A, pred.var = c("residual.sugar"), chull = TRUE)
plot2 <- autoplot(pd2, contour = TRUE)
pd3 <- partial(model_A, pred.var = c("fixed.acidity"), chull = TRUE)
plot3 <- autoplot(pd3, contour = TRUE)
pd4 <- partial(model_A, pred.var = c("sulphates"), chull = TRUE)
plot4 <- autoplot(pd4, contour = TRUE)
grid.arrange(plot1, plot2, plot3, plot4, ncol=2)
png(paste0(dir, "plot_ex3.png"), height=600, width=800)
grid.arrange(plot1, plot2, plot3, plot4, ncol=2)
dev.off()
## png
## 2
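If you prefer separate files over the combined panel, each ggplot object can also be exported individually with ggsave() from ggplot2 (the file names below are only illustrative):
ggsave(paste0(dir, "pdp_density.png"), plot1, width = 6, height = 4)
ggsave(paste0(dir, "pdp_residual_sugar.png"), plot2, width = 6, height = 4)
ggsave(paste0(dir, "pdp_fixed_acidity.png"), plot3, width = 6, height = 4)
ggsave(paste0(dir, "pdp_sulphates.png"), plot4, width = 6, height = 4)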
Use your own data set (see specifications in the Slack channel) for a random forest classification or regression. Briefly explain in text form what your data set is about, what the variables mean, and what your response variable is. Split the data set into training and testing, and apply the trained model to the testing data set. In case your response variable is categorical, please export the confusion matrix between observed and predicted values as CSV; if the response variable is numeric, please plot the observed against the predicted values in a scatter plot and export it as PNG.
What are the most important variables in your model? Does the variable ranking meet your expectations? Please plot the partial dependencies that you find most interesting, and export them as PNGs. Please explain the reasons that could lie behind the patterns in these plots.
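For reference, the two requested evaluation outputs could be produced roughly as follows; this is only a minimal sketch, and "my_model", "my_test", the response column "response" and the file names are placeholders for your own objects:
## categorical response: confusion matrix between observed and predicted classes, exported as CSV
pred_class <- predict(my_model, newdata = my_test)
conf_mat <- table(observed = my_test$response, predicted = pred_class)
write.csv(as.data.frame.matrix(conf_mat), paste0(dir, "confusion_matrix.csv"))
## numeric response: observed vs. predicted values as a scatter plot, exported as PNG
pred_num <- predict(my_model, newdata = my_test)
png(paste0(dir, "obs_vs_pred.png"), height = 600, width = 600)
plot(my_test$response, pred_num, xlab = "Observed", ylab = "Predicted")
abline(0, 1, lty = 2)   ## 1:1 line
dev.off()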
Predicting historical yield
data <- read.csv(paste0(dir, "data/winterwheat_historical.csv"))
data$region <- as.factor(data$region)
data <- data[complete.cases(data),]
#rmarkdown::render(paste0(dir, "/VIF.Rmd"))
## identify columns that contain nothing but zeros
delete <- NULL
for (j in 1:length(data)) {
  if (length(unique(data[, j])) == 1) {
    if (unique(data[, j]) == 0) { delete <- c(delete, j) }
  }
}
delete_vars1 <- names(data)[delete]
## drop the zero-only columns and the "year" column
data <- data[, !(names(data) %in% c(delete_vars1, "year"))]
#final_vars <- vif_func(data[,-c(1:2)])
#delete_vars2 <- names(data)[which(names(data) %in% final_vars==F)][-c(1:2)]
#data <- data[, c(1, 2, which(names(data) %in% final_vars)),]
## The following variables were excluded:
#c(delete_vars1, delete_vars2)
## number of repeated model runs
N <- 20
## data frame to store the variable importance of every predictor for each run
varimp_track <- as.data.frame(matrix(ncol=N+1, nrow=length(data)-1))
varimp_track[,1] <- names(data)[-2]
names(varimp_track) <- c("variable", paste("varimp_run", c(1:N), sep="_"))
predictor_vars <- names(data)[-c(1:2)]
## list to store the partial dependence of yield on each predictor for each run
partialdep_track <- list()
for (y in 1:N) {
set.seed(1000+y)
data_index <- sample(nrow(data), 0.7*nrow(data), replace = FALSE)
data_train <- data[data_index,]
data_test <- data[-data_index,]
model <- randomForest(yield ~ ., data = data_train, ntree = 300, importance = TRUE)
## fill column y+1 in "varimp_track"
varimp_track[,y+1] <- importance(model, type=1)[,1]
## for each predictor, evaluate the partial dependence on a fixed grid of 51 values
## spanning its observed range, so that the N runs can be merged later
for (p in 1:length(predictor_vars))
{ minvar <- min(data[which(names(data)%in%predictor_vars[p])])
maxvar <- max(data[which(names(data)%in%predictor_vars[p])])
df <- as.data.frame(seq(from=minvar,to=maxvar, by=(maxvar-minvar)/50))
names(df) <- predictor_vars[p]
if(y==1) {
partialdep_track[[p]] <- pdp::partial(model, pred.var = predictor_vars[p], pred.grid = df, train = data_train)
} else {
partialdep_track[[p]] <- merge(partialdep_track[[p]],
pdp::partial(model, pred.var = predictor_vars[p], pred.grid = df, train = data_train),
by = predictor_vars[p], all = T)
}
names(partialdep_track)[p] <- predictor_vars[p]
names(partialdep_track[[p]])[y+1] <- paste("iteration", y, sep = "_")
}
}
saveRDS(varimp_track, paste0(dir, "varimp.rds"))
saveRDS(partialdep_track, paste0(dir, "partialdep.rds"))
Plot model results
varimp_mean <- data.frame(variable = varimp_track$variable, mean = rowMeans(varimp_track[, 2:(N+1)]))
for (i in 1:NROW(varimp_mean))
{ varimp_mean$parameter[i] <- strsplit(varimp_mean$variable[i],split="_")[[1]][1]
varimp_mean$phase[i] <- strsplit(varimp_mean$variable[i],split="_")[[1]][2]
}
## drop the first row (the "region" variable), which does not follow the parameter_phase naming scheme
varimp_mean <- varimp_mean[-1,]
varimp_mean$mean_factor <- cut(varimp_mean$mean,
breaks=c(-1,2.5,5,7.5,10),
labels=c("< 2.5", "2.5-5", "5-7.5", "7.5-10"))
library(viridis)
plot1 <- ggplot(varimp_mean,aes(x = phase , y= parameter, fill = mean_factor)) +
geom_tile(colour="white", size=0.2) +
geom_text(aes(label = round(mean, 1)), color="white", fontface="bold", size=8) +
labs(x="",y="") +
scale_fill_manual(values=rev(magma(9)[2:8]), name="%IncMSE") +
theme(text=element_text(size=26),
legend.key.size = unit(2,"line"),
axis.text.x=element_text(angle=45,hjust=1))
plot1
png(paste0(dir, "plot_ex07_A.png"), height = 700, width=600)
plot1
dev.off()
## png
## 2
## the four most important variables are:
topvars <- varimp_mean[rev(order(varimp_mean$mean)),]$variable[c(1:4)]
pd_plots <- list()
for (i in 1:length(topvars))
{ partialdep_sel <- partialdep_track[[which(names(partialdep_track)==topvars[i])]]
partialdep_summary <- data.frame(x_values = partialdep_sel[,1],
partdep_mean = apply(partialdep_sel[, 2:(N+1)], 1, mean),
partdep_sd = apply(partialdep_sel[, 2:(N+1)], 1, sd))
pd_plots[[i]] <- ggplot(data = partialdep_summary, aes(x = x_values, y = partdep_mean)) +
geom_line(aes(), size = 1.3) +
geom_ribbon(aes(ymin = partdep_mean-partdep_sd,
ymax = partdep_mean+partdep_sd), alpha=0.2) +
theme(axis.text.x=element_text(angle=45,hjust=1)) +
ggtitle(paste0("Part. dep. of yield on ", topvars[i])) +
xlab(topvars[i]) +
ylab("Yield (ct/ha)")
}
library(gridExtra)
plot2 <- grid.arrange(pd_plots[[1]], pd_plots[[2]], pd_plots[[3]], pd_plots[[4]], ncol=2)
png(paste0(dir, "plot_ex07_B.png"), height = 800, width=1000)
plot(plot2)
dev.off()
## png
## 2
Predicting future yield
data <- read.csv(paste0(dir, "data/winterwheat_historical.csv"))
data$region <- as.factor(data$region)
data <- data[complete.cases(data),]
data_hist <- data
library(dplyr)
## import future data and select same columns as in the historical data
data_fut <- read.csv(paste0(dir, "data/winterwheat_future_new.csv"))
#data_fut <- select(data_fut, c("rcp", "period", names(data_hist)[-2]))
data_fut <- filter(data_fut, region != "Yerevan")
data_fut$region <- as.factor(data_fut$region)
## separate future data into 4 data frames
data_45_nearfut <- filter(data_fut, rcp == "rcp45", period == "2041_2060")
data_85_nearfut <- filter(data_fut, rcp == "rcp85", period == "2041_2060")
data_45_farfut <- filter(data_fut, rcp == "rcp45", period == "2081_2099")
data_85_farfut <- filter(data_fut, rcp == "rcp85", period == "2081_2099")
## join the 4 data frames in a list
testing_data <- list(data_45_nearfut, data_85_nearfut, data_45_farfut, data_85_farfut)
names(testing_data) <- c("RCP4.5_nearfut", "RCP8.5_nearfut", "RCP4.5_farfut", "RCP8.5_farfut")
## run models
N <- 10
## save predictions for each of the 4 data sets as a separate list object
pred_track <- list()
for (t in 1:length(testing_data))
{ data_train <- data_hist[,-2]
data_test <- testing_data[[t]]
pred_track[[t]] <- as.data.frame(matrix(ncol=1+N, nrow=NROW(data_test)))
pred_track[[t]][,1] <- as.character(data_test$region)
names(pred_track[[t]]) <- c("region", paste("prediction_run", c(1:N), sep="_"))
for (y in 1:N) {
model <- randomForest(yield ~ ., data = data_train, ntree = 300, importance = TRUE)
pred <- predict(model, data_test)
pred_track[[t]][which(rownames(pred_track[[t]])%in%rownames(data_test)), y+1] <- pred
}
write.csv(pred_track[[t]], paste0(dir, "predictions_", names(testing_data)[t], ".csv"), row.names=F)
}
## import historical yield data and calculate the long-term average
yield_raw <- read.csv(paste0(dir, "data/winterwheat.csv"), sep=";")
yield <- data.frame(marz = yield_raw$region, mean_hist_yield = rowMeans(yield_raw[,c(5:20)]))
## name list elements in "pred_track"
names(pred_track) <- names(testing_data)
## for each of the four datasets:
for (t in 1:length(pred_track))
## calculate mean predictions across all N model runs
{ yield_fut <- data.frame(marz = pred_track[[t]]$region,
mean_yield = rowMeans(pred_track[[t]][, 2:(N+1)]))
names(yield_fut)[2] <- names(pred_track)[t]
## append "yield_fut" to "yield"
yield <- merge(yield, yield_fut, by="marz")
## calculate percent change
yield[[paste(names(pred_track)[t], "change", sep="_")]] <- ((yield[[names(pred_track)[t]]] - yield$mean_hist_yield)/yield$mean_hist_yield)*100
}
head(yield)
| marz | mean_hist_yield | RCP4.5_nearfut | RCP4.5_nearfut_change | RCP8.5_nearfut | RCP8.5_nearfut_change | RCP4.5_farfut | RCP4.5_farfut_change | RCP8.5_farfut | RCP8.5_farfut_change |
|---|---|---|---|---|---|---|---|---|---|
| Aragatsotn | 24.26250 | 24.39713 | 0.5548877 | 24.10466 | -0.6505670 | 24.41871 | 0.6438478 | 22.82669 | -5.917835 |
| Ararat | 38.77500 | 29.10108 | -24.9488582 | 29.00362 | -25.2001990 | 29.11261 | -24.9191123 | 28.30540 | -27.000912 |
| Armavir | 35.35000 | 29.20260 | -17.3901095 | 28.83992 | -18.4160564 | 28.96604 | -18.0592994 | 28.25071 | -20.082857 |
| Gegharkunik | 24.78750 | 24.51343 | -1.1056947 | 24.87039 | 0.3344129 | 25.28053 | 1.9890352 | 23.56604 | -4.927733 |
| Kotayk | 19.57500 | 23.34950 | 19.2822572 | 23.54539 | 20.2829604 | 23.82218 | 21.6969617 | 22.17259 | 13.269937 |
| Lori | 25.60625 | 24.72745 | -3.4319674 | 25.17914 | -1.6680100 | 25.70046 | 0.3679223 | 23.64132 | -7.673618 |
## add some NAs for Yerevan (otherwise, the join with the shapefile will not work)
yield[11,] <- c("Yerevan", rep(NA,11))
## re-format data on yield changes
library(reshape2)
yield_melted <- melt(yield[,c(1, 4, 6, 8, 10)], id.vars="marz")
yield_melted$RCP <- substring(yield_melted$variable, 1, 6)
yield_melted$period <- sub("_change$", "", substring(yield_melted$variable, 8))
names(yield_melted)[1] <- "region"
head(yield_melted)
| region | variable | value | RCP | period |
|---|---|---|---|---|
| Aragatsotn | RCP4.5_nearfut_change | 0.554887705840834 | RCP4.5 | nearfut |
| Ararat | RCP4.5_nearfut_change | -24.9488581639882 | RCP4.5 | nearfut |
| Armavir | RCP4.5_nearfut_change | -17.3901094871392 | RCP4.5 | nearfut |
| Gegharkunik | RCP4.5_nearfut_change | -1.10569470125697 | RCP4.5 | nearfut |
| Kotayk | RCP4.5_nearfut_change | 19.2822572252968 | RCP4.5 | nearfut |
| Lori | RCP4.5_nearfut_change | -3.4319673835417 | RCP4.5 | nearfut |
## load shapefile and fortify it
library(rgdal)
arm <- readOGR(paste0(dir, "shapefiles/ARM_marzes_short_simplified_500.shp"), verbose=FALSE)
arm_forti <- fortify(arm)
arm_forti$region <- NA
## add region names to "arm_forti"
arm_forti$region[arm_forti$group==0.1] <- "Syunik"
arm_forti$region[arm_forti$group==1.1] <- "Vayotz Dzor"
arm_forti$region[arm_forti$group==2.1] <- "Ararat"
arm_forti$region[arm_forti$group==3.1] <- "Armavir"
arm_forti$region[arm_forti$group==4.1] <- "Gegharkunik"
arm_forti$region[arm_forti$group==5.1] <- "Yerevan"
arm_forti$region[arm_forti$group==6.1] <- "Aragatsotn"
arm_forti$region[arm_forti$group==7.1] <- "Kotayk"
arm_forti$region[arm_forti$group==8.1] <- "Shirak"
arm_forti$region[arm_forti$group==9.1] <- "Tavush"
arm_forti$region[arm_forti$group==10.1] <- "Lori"
## join yield changes with "arm_forti"
plotdata <- merge(arm_forti, yield_melted[-2], by="region", sort=FALSE, all=TRUE)
plotdata$value <- as.numeric(plotdata$value)
## define maximum absolute value to center color legend
maxabs <- max(abs(na.omit(plotdata$value)))
## plot
library(pals)
plot3 <- ggplot(plotdata, aes(x=long, y=lat, group=group, fill=value)) +
ggtitle("Projected change in Winter Wheat yield") +
geom_polygon(color="black", size=0.25) +
coord_equal() + xlab("") + ylab("") +
facet_grid(period~RCP) +
theme(axis.ticks = element_blank(),
axis.text = element_blank()) +
scale_fill_gradientn("Change in %", colors = rev(as.vector(coolwarm(25))), limits=c(-maxabs, maxabs))
plot3
png(paste0(dir, "plot_ex08.png"), height = 800, width=1000)
plot(plot3 + theme(text=element_text(size=20),
legend.text = element_text(size=26),
plot.title = element_text(size=34),
legend.key.size = unit(1, 'cm'),
legend.title = element_text(size=30)))
dev.off()
## png
## 2
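Note: the "rgdal" package has meanwhile been retired from CRAN. If it is not available, the same map can be drawn with the "sf" package and geom_sf(), without the fortify() and group-matching steps. A minimal sketch, reusing "yield_melted" and "maxabs" from above and assuming the shapefile carries an attribute column with the region names (the column name "region_name" is hypothetical):
library(sf)
library(ggplot2)
library(pals)
## read the shapefile directly as an sf object (no fortify step needed)
arm_sf <- st_read(paste0(dir, "shapefiles/ARM_marzes_short_simplified_500.shp"), quiet = TRUE)
## join the yield changes by region name; "region_name" stands for the actual attribute column
plotdata_sf <- merge(arm_sf, yield_melted[-2], by.x = "region_name", by.y = "region", all = TRUE)
ggplot(plotdata_sf) +
  geom_sf(aes(fill = as.numeric(value)), color = "black") +
  facet_grid(period ~ RCP) +
  scale_fill_gradientn("Change in %", colors = rev(as.vector(coolwarm(25))), limits = c(-maxabs, maxabs)) +
  ggtitle("Projected change in Winter Wheat yield")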