Welcome to the exercises of module 5! Please try to solve the following exercises, and if you do not remember how to write the code for certain things, have another look at the previous scripts. Programming is about learning by doing! So do not hesitate to look for help on the internet, too. Remember that there is almost always more than one way to solve a problem in R!

Part A: The wine data set

Exercise 01 (10 points)

Import the wine data set and split it into a training (70%) and testing (30%) data set. Create a random forest model, trained with the training data set, that predicts the parameter assigned to your group, using ntree = 500 and mtry = 4.

library(randomForest)

dir <- "C:/Users/Max/Desktop/IAMO_neu/eLearning/Module_5_Random_Forests/solutions/"

data <- read.csv(paste0(dir, "data/wine_taste.csv"))
data$taste <- as.factor(data$taste)

set.seed(123)
subsample <- sample(nrow(data), 0.7 * nrow(data))

train_set <- data[subsample, ]
test_set  <- data[-subsample, ]

model_A <- randomForest(alcohol ~ ., data = train_set, ntree = 500, mtry = 4, importance = TRUE)

Exercise 02 (10 points)

Apply your trained model to the testing data set and report the R-Square between observed and predicted values. Change your model four times using different values for “ntree” and “mtry” and report the changes in R-Square between observed and predicted values.

## apply model from exercise 1
pred_A <- predict(model_A, newdata = test_set)
rsquare_A <- cor(pred_A, test_set$alcohol)^2
rsquare_A
## [1] 0.7765222

## train new models with different parameters
model_B <- randomForest(alcohol ~ ., data = train_set, ntree = 10, mtry = 2)
model_C <- randomForest(alcohol ~ ., data = train_set, ntree = 100, mtry = 2)
model_D <- randomForest(alcohol ~ ., data = train_set, ntree = 10, mtry = 8)
model_E <- randomForest(alcohol ~ ., data = train_set, ntree = 100, mtry = 8)

## store each R-Square in a separate "rsquare" column 
## (writing into the "model" column would overwrite the model labels)
model_rsquares <- data.frame(model   = c("B", "C", "D", "E"), 
                             ntree   = c(10, 100, 10, 100), 
                             mtry    = c(2, 2, 8, 8),
                             rsquare = NA)

model_rsquares$rsquare[1] <- cor(predict(model_B, newdata = test_set), test_set$alcohol)^2
model_rsquares$rsquare[2] <- cor(predict(model_C, newdata = test_set), test_set$alcohol)^2
model_rsquares$rsquare[3] <- cor(predict(model_D, newdata = test_set), test_set$alcohol)^2
model_rsquares$rsquare[4] <- cor(predict(model_E, newdata = test_set), test_set$alcohol)^2

model_rsquares
##   model ntree mtry   rsquare
## 1     B    10    2 0.6973383
## 2     C   100    2 0.7459825
## 3     D    10    8 0.7432929
## 4     E   100    8 0.7770947

## --> the model with 10 trees and mtry = 2 performs worst; the model with 100 trees and mtry = 8 performs best!
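
## the same comparison can also be written as a loop over a small parameter grid instead of
## copying the model call four times -- a minimal sketch, using the same objects as above:

param_grid <- expand.grid(ntree = c(10, 100), mtry = c(2, 8))
param_grid$rsquare <- NA

for (i in 1:nrow(param_grid)) {
  m <- randomForest(alcohol ~ ., data = train_set,
                    ntree = param_grid$ntree[i], mtry = param_grid$mtry[i])
  param_grid$rsquare[i] <- cor(predict(m, newdata = test_set), test_set$alcohol)^2
}

param_grid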

Exercise 03 (10 points)

What are the four most important variables in your model, according to the variable “%IncMSE”? Create partial dependence plots for these four variables, and export them as PNGs.

varImpPlot(model_A, type = 1, sort = TRUE, n.var = 11, main = "Variable Importance")   ## type = 1 plots %IncMSE

## the four most important variables in predicting alcohol content are: 
## density, residual sugar, fixed acidity and sulphates

library(pdp)

library(ggplot2)
library(gridExtra)

## if you use "partialPlot()" from randomForest, you cannot arrange multiple plots in one frame and export them as a single PNG. 
## However, you can achieve that with the package "pdp" and the functions "partial()", "autoplot()" and "grid.arrange()" in the following way:

## (the "chull" and "contour" arguments only matter for two-predictor surfaces, so they can be dropped here)
pd1 <- partial(model_A, pred.var = "density")
plot1 <- autoplot(pd1)
pd2 <- partial(model_A, pred.var = "residual.sugar")
plot2 <- autoplot(pd2)
pd3 <- partial(model_A, pred.var = "fixed.acidity")
plot3 <- autoplot(pd3)
pd4 <- partial(model_A, pred.var = "sulphates")
plot4 <- autoplot(pd4)

grid.arrange(plot1, plot2, plot3, plot4, ncol=2)


png(paste0(dir, "plot_ex3.png"), height=600, width=800)
  grid.arrange(plot1, plot2, plot3, plot4, ncol=2)
dev.off()
## png 
##   2
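
## tip: a single ggplot object such as "plot1" above can also be exported directly with
## "ggsave()" from ggplot2, without opening a png() device by hand:
ggsave(paste0(dir, "pdp_density.png"), plot1, width = 6, height = 4)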

Part B: Your own data set

Exercise 04 (20 points)

Use your own data set (see specifications in the Slack channel) for a random forest classification or regression. Briefly explain in text form what your data set is about, what the variables mean, and what your response variable is. Split the data set into training and testing, and apply the trained model to the testing data set. In case your response variable is categorical, please export the confusion matrix between observed and predicted values as CSV; if the response variable is numeric, please plot the observed against the predicted values in a scatter plot and export it as PNG.
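
## Since every group works with a different data set, there is no sample solution here.
## A minimal template for the workflow -- "mydata" and "response" are placeholders for
## your own data frame and response variable:

set.seed(42)
idx   <- sample(nrow(mydata), 0.7 * nrow(mydata))
train <- mydata[idx, ]
test  <- mydata[-idx, ]

model <- randomForest(response ~ ., data = train, ntree = 500, importance = TRUE)
pred  <- predict(model, newdata = test)

## categorical response: export the confusion matrix as CSV
conf_mat <- table(observed = test$response, predicted = pred)
write.csv(as.data.frame.matrix(conf_mat), paste0(dir, "confusion_matrix.csv"))

## numeric response: scatter plot of observed vs. predicted values, exported as PNG
png(paste0(dir, "obs_vs_pred.png"), height = 600, width = 600)
  plot(test$response, pred, xlab = "observed", ylab = "predicted")
  abline(0, 1, lty = 2)
dev.off()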

Exercise 05 (10 points)

What are the most important variables in your model? Does the variable ranking meet your expectations? Please plot the partial dependencies that you find most interesting, and export them as PNGs. Please explain the reasons that could lay behind the patterns in these plots.
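
## again only a template -- "model" and "test" are the objects from Exercise 04, and
## "my_variable" is a placeholder for a predictor of your choice:

varImpPlot(model, type = 1, sort = TRUE, main = "Variable Importance")

pd <- partial(model, pred.var = "my_variable")
png(paste0(dir, "plot_ex05.png"), height = 600, width = 800)
  print(autoplot(pd))
dev.off()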

Part C: Climate data set

Exercise 06 (10 points)

Predicting historical yield

  • Load the historical data of the crop assigned to your group from the folder “historical_climate”. Delete all incomplete observations (i.e. rows where there is any “NA” value).
data <- read.csv(paste0(dir, "data/winterwheat_historical.csv"))
data$region <- as.factor(data$region)
data <- data[complete.cases(data),]
  • Remove all predictor variables that are zero for all observations, and that have a Variance Inflation Factor of above 10. Report the names of all variables that you excluded.
## "vif_func()" (used below) comes from the course file VIF.Rmd:
#rmarkdown::render(paste0(dir, "/VIF.Rmd"))
## find all columns that contain only zeros
delete <- NULL
for (j in 1:ncol(data)) {
  if (length(unique(data[, j])) == 1 && unique(data[, j]) == 0) {
    delete <- c(delete, j)
  }
}
delete_vars1 <- names(data)[delete]

## drop the zero-only columns and the "year" column
data <- data[, !(names(data) %in% c(delete_vars1, "year"))]

#final_vars <- vif_func(data[,-c(1:2)])
#delete_vars2 <- names(data)[which(names(data) %in% final_vars==F)][-c(1:2)]
#data <- data[, c(1, 2, which(names(data) %in% final_vars)),]

## The following variables were excluded:
#c(delete_vars1, delete_vars2)
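
## if "vif_func()" is not at hand, the package "usdm" offers a comparable stepwise VIF
## filter -- a sketch, assuming "usdm" is installed and columns 1-2 hold region and yield:
library(usdm)
vif_result   <- vifstep(data[, -c(1:2)], th = 10)   ## iteratively drop variables with VIF > 10
delete_vars2 <- as.character(vif_result@excluded)
data         <- data[, !(names(data) %in% delete_vars2)]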
  • Train and test a random forest model 20 times, each time drawing different samples for training (70%) and testing (30%). Save the variable importances and the partial dependencies of each model run. Export the variable importances and partial dependencies as RDS-files using the function saveRDS().
N <- 20

varimp_track        <- as.data.frame(matrix(ncol = N + 1, nrow = ncol(data) - 1))
varimp_track[,1]    <- names(data)[-2]   ## all columns except the response "yield"
names(varimp_track) <- c("variable", paste("varimp_run", c(1:N), sep="_"))     

predictor_vars  <- names(data)[-c(1:2)] 
partialdep_track <- list()
            
for (y in 1:N) { 

  set.seed(1000+y)
        
  data_index <- sample(nrow(data), 0.7*nrow(data), replace = FALSE)
  data_train <- data[data_index,]
  data_test  <- data[-data_index,]
  
  model   <- randomForest(yield ~ ., data = data_train, ntree = 300, importance = TRUE)
  
  ## fill column y+1 in "varimp_track"
  varimp_track[,y+1] <- importance(model, type=1)[,1]

  for (p in 1:length(predictor_vars))
  { minvar    <- min(data[which(names(data)%in%predictor_vars[p])])
    maxvar    <- max(data[which(names(data)%in%predictor_vars[p])])
    df        <- as.data.frame(seq(from=minvar,to=maxvar, by=(maxvar-minvar)/50))
    names(df) <- predictor_vars[p]
    
    if(y==1) {
       partialdep_track[[p]]      <- pdp::partial(model, pred.var = predictor_vars[p], pred.grid = df, train = data_train)
    } else {
       partialdep_track[[p]]      <- merge(partialdep_track[[p]], 
                                            pdp::partial(model, pred.var = predictor_vars[p], pred.grid = df, train = data_train), 
                                            by = predictor_vars[p], all = T) 
    }
    names(partialdep_track)[p] <- predictor_vars[p]
    names(partialdep_track[[p]])[y+1] <- paste("iteration", y, sep = "_")
  }
}

saveRDS(varimp_track, paste0(dir, "varimp.rds"))
saveRDS(partialdep_track, paste0(dir, "partialdep.rds"))
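
## both objects can be restored later (e.g. for Exercise 07) with readRDS():
varimp_track     <- readRDS(paste0(dir, "varimp.rds"))
partialdep_track <- readRDS(paste0(dir, "partialdep.rds"))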

Exercise 07 (10 points)

Plot model results

  • Plot the mean variable importances across all 20 model runs with ggplot() and geom_tile(). Save your plot as PNG.
## note the parentheses in 2:(N+1) -- "2:N+1" evaluates to 3:(N+1) and would silently drop run 1
varimp_mean <- data.frame(variable = varimp_track$variable, mean = rowMeans(varimp_track[, 2:(N+1)]))
for (i in 1:NROW(varimp_mean))
{ varimp_mean$parameter[i] <- strsplit(varimp_mean$variable[i],split="_")[[1]][1]
  varimp_mean$phase[i]     <- strsplit(varimp_mean$variable[i],split="_")[[1]][2]
}

varimp_mean <- varimp_mean[-1,]   ## drop the first row ("region"), which has no parameter/phase split

varimp_mean$mean_factor <- cut(varimp_mean$mean,
                               breaks=c(-1,2.5,5,7.5,10),
                               labels=c("< 2.5", "2.5-5", "5-7.5", "7.5-10"))

library(viridis)

plot1 <- ggplot(varimp_mean,aes(x = phase , y= parameter, fill = mean_factor)) +
               geom_tile(colour="white", size=0.2) + 
               geom_text(aes(label = round(mean, 1)), color="white", fontface="bold", size=8) +
               labs(x="",y="") +
               scale_fill_manual(values=rev(magma(9)[2:8]), name="%IncMSE") +
               theme(text=element_text(size=26), 
                     legend.key.size = unit(2,"line"),
                     axis.text.x=element_text(angle=45,hjust=1))

plot1


png(paste0(dir, "plot_ex07_A.png"), height = 700, width=600)
  plot1
dev.off()
## png 
##   2
  • For the 4 most important variables, calculate the mean and standard deviation of their partial dependencies across all 20 model runs. Plot the partial dependencies of these 4 variables, arrange the 4 resulting plots in one window using grid.arrange(), and save your plot as PNG.
## the four most important variables are:
topvars <- varimp_mean$variable[order(varimp_mean$mean, decreasing = TRUE)][1:4]

pd_plots <- list()

for (i in 1:length(topvars))
{ partialdep_sel <- partialdep_track[[which(names(partialdep_track)==topvars[i])]]

  ## columns 2:(N+1) hold the N iterations; "2:N+1" would drop iteration 1
  partialdep_summary <- data.frame(x_values     = partialdep_sel[,1], 
                                   partdep_mean = apply(partialdep_sel[, 2:(N+1)], 1, mean),
                                   partdep_sd   = apply(partialdep_sel[, 2:(N+1)], 1, sd))
  
  pd_plots[[i]] <- ggplot(data = partialdep_summary, aes(x = x_values, y = partdep_mean)) + 
                        geom_line(size = 1.3) +
                        geom_ribbon(aes(ymin = partdep_mean-partdep_sd, 
                                        ymax = partdep_mean+partdep_sd), alpha=0.2) +
                        theme(axis.text.x=element_text(angle=45,hjust=1)) +
                        ggtitle(paste0("Part. dep. of yield on ", topvars[i])) +
                        xlab(topvars[i]) +
                        ylab("Yield (ct/ha)")  
}

library(gridExtra)

plot2 <- grid.arrange(pd_plots[[1]], pd_plots[[2]], pd_plots[[3]], pd_plots[[4]], ncol=2)


png(paste0(dir, "plot_ex07_B.png"), height = 800, width=1000)
  grid::grid.draw(plot2)   ## draw the arranged gtable again inside the png device
dev.off()
## png 
##   2

Exercise 08 (20 points)

Predicting future yield

  • Train a model with the historical data to predict future yields by using the future climatic data from the folder “future_climate” as testing data set. Do this 10 times, and for each combination of RCP and future time period separately. Save the resulting 10 predictions of each combination in 4 separate data frames, and export these data frames as CSV files.
data <- read.csv(paste0(dir, "data/winterwheat_historical.csv"))
data$region <- as.factor(data$region)
data <- data[complete.cases(data),]
data_hist <- data
library(dplyr)

## import future data and select same columns as in the historical data
data_fut <- read.csv(paste0(dir, "data/winterwheat_future_new.csv"))
#data_fut <- select(data_fut, c("rcp", "period", names(data_hist)[-2]))
data_fut <- filter(data_fut, region != "Yerevan")
data_fut$region <- as.factor(data_fut$region)

## separate future data into 4 data frames 
data_45_nearfut <- filter(data_fut, rcp == "rcp45", period == "2041_2060") 
data_85_nearfut <- filter(data_fut, rcp == "rcp85", period == "2041_2060")
data_45_farfut  <- filter(data_fut, rcp == "rcp45", period == "2081_2099")
data_85_farfut  <- filter(data_fut, rcp == "rcp85", period == "2081_2099")

## join the 4 data frames in a list
testing_data <- list(data_45_nearfut, data_85_nearfut, data_45_farfut, data_85_farfut)
names(testing_data) <- c("RCP4.5_nearfut", "RCP8.5_nearfut", "RCP4.5_farfut", "RCP8.5_farfut")

## run models
N <- 10

## save predictions for each of the 4 data sets as a separate list object
pred_track <- list()

for (t in 1:length(testing_data))
{ data_train <- data_hist[,-2]
  data_test  <- testing_data[[t]]
  
  pred_track[[t]]        <- as.data.frame(matrix(ncol=1+N, nrow=NROW(data_test)))
  pred_track[[t]][,1]    <- as.character(data_test$region)
  names(pred_track[[t]]) <- c("region", paste("prediction_run", c(1:N), sep="_"))

  for (y in 1:N) { 
    model   <- randomForest(yield ~ ., data = data_train, ntree = 300, importance = TRUE)
    pred    <- predict(model, data_test)
    pred_track[[t]][, y+1] <- pred   ## rows of pred_track[[t]] align 1:1 with data_test
  }
 write.csv(pred_track[[t]], paste0(dir, "predictions_", names(testing_data)[t], ".csv"), row.names=F)
}
  • For each marz, calculate long-year historical yields based on the yearly yield values in the folder “historical_yields”.
  • From the future yield predictions and the long-year historical yield, calculate for each marz and each combination of RCP and future time period the predicted in- or decrease in yield in percent. Plot the results in a multipart-map using ggplot(), and export it as PNG. This figure should resemble the one from the main report of WP4.
## import historical yield data and calculate long-year average
yield_raw <- read.csv(paste0(dir, "data/winterwheat.csv"), sep=";")
yield <- data.frame(marz = yield_raw$region, mean_hist_yield = rowMeans(yield_raw[,c(5:20)]))

## name list elements in "pred_track"
names(pred_track) <- names(testing_data)

## for each of the four datasets:
for (t in 1:length(pred_track))

## calculate mean predictions across all N model runs
{ yield_fut <- data.frame(marz = pred_track[[t]]$region, 
                          mean_yield = rowMeans(pred_track[[t]][, 2:(N+1)]))   ## 2:(N+1), not 2:N+1
  names(yield_fut)[2] <- names(pred_track)[t]
    
  ## append "yield_fut" to "yield"
  yield <- merge(yield, yield_fut, by="marz")
  
  ## calculate percent change
  yield[[paste(names(pred_track)[t], "change", sep="_")]] <- ((yield[[names(pred_track)[t]]] - yield$mean_hist_yield)/yield$mean_hist_yield)*100
}

head(yield)
##          marz mean_hist_yield RCP4.5_nearfut RCP4.5_nearfut_change RCP8.5_nearfut RCP8.5_nearfut_change RCP4.5_farfut RCP4.5_farfut_change RCP8.5_farfut RCP8.5_farfut_change
## 1  Aragatsotn        24.26250       24.39713             0.5548877       24.10466            -0.6505670      24.41871            0.6438478      22.82669            -5.917835
## 2      Ararat        38.77500       29.10108           -24.9488582       29.00362           -25.2001990      29.11261          -24.9191123      28.30540           -27.000912
## 3     Armavir        35.35000       29.20260           -17.3901095       28.83992           -18.4160564      28.96604          -18.0592994      28.25071           -20.082857
## 4 Gegharkunik        24.78750       24.51343            -1.1056947       24.87039             0.3344129      25.28053            1.9890352      23.56604            -4.927733
## 5      Kotayk        19.57500       23.34950            19.2822572       23.54539            20.2829604      23.82218           21.6969617      22.17259            13.269937
## 6        Lori        25.60625       24.72745            -3.4319674       25.17914            -1.6680100      25.70046            0.3679223      23.64132            -7.673618

## add some NAs for Yerevan (otherwise, the join with the shapefile will not work)
yield[11,] <- c("Yerevan", rep(NA, 9))   ## "yield" has 10 columns, so 9 NAs are needed

## re-format data on yield changes
library(reshape2)
yield_melted <- melt(yield[,c(1, 4, 6, 8, 10)], id.vars="marz")
yield_melted$RCP    <- substring(yield_melted$variable, 1, 6)
## note: a fixed substring(..., 8, 14) would leave a trailing "_" for the "farfut" rows
yield_melted$period <- gsub("_change", "", substring(yield_melted$variable, 8))
names(yield_melted)[1] <- "region"

head(yield_melted)
##        region              variable             value    RCP  period
## 1  Aragatsotn RCP4.5_nearfut_change 0.554887705840834 RCP4.5 nearfut
## 2      Ararat RCP4.5_nearfut_change -24.9488581639882 RCP4.5 nearfut
## 3     Armavir RCP4.5_nearfut_change -17.3901094871392 RCP4.5 nearfut
## 4 Gegharkunik RCP4.5_nearfut_change -1.10569470125697 RCP4.5 nearfut
## 5      Kotayk RCP4.5_nearfut_change  19.2822572252968 RCP4.5 nearfut
## 6        Lori RCP4.5_nearfut_change  -3.4319673835417 RCP4.5 nearfut

## load shapefile and fortify it 
library(rgdal)
arm              <- readOGR(paste0(dir, "shapefiles/ARM_marzes_short_simplified_500.shp"), verbose=FALSE)
arm_forti        <- fortify(arm)
arm_forti$region <- NA

## add region names to "arm_forti"
arm_forti$region[arm_forti$group==0.1] <- "Syunik"
arm_forti$region[arm_forti$group==1.1] <- "Vayotz Dzor"
arm_forti$region[arm_forti$group==2.1] <- "Ararat"
arm_forti$region[arm_forti$group==3.1] <- "Armavir"
arm_forti$region[arm_forti$group==4.1] <- "Gegharkunik"
arm_forti$region[arm_forti$group==5.1] <- "Yerevan"
arm_forti$region[arm_forti$group==6.1] <- "Aragatsotn"
arm_forti$region[arm_forti$group==7.1] <- "Kotayk"
arm_forti$region[arm_forti$group==8.1] <- "Shirak"
arm_forti$region[arm_forti$group==9.1] <- "Tavush"
arm_forti$region[arm_forti$group==10.1] <- "Lori"

## join yield changes with "arm_forti"
plotdata <- merge(arm_forti, yield_melted[-2], by="region", sort=FALSE, all=TRUE)
plotdata$value <- as.numeric(plotdata$value)

## restore the original point order of the polygons, which merge() does not guarantee
plotdata <- plotdata[order(plotdata$group, plotdata$order), ]

## define maximum absolute value to center color legend
maxabs <- max(abs(na.omit(plotdata$value)))

## plot
library(pals)
plot3 <- ggplot(plotdata, aes(x=long, y=lat, group=group, fill=value)) +
             ggtitle("Projected change in Winter Wheat yield") +
             geom_polygon(color="black", size=0.25) +
             coord_equal() + xlab("") + ylab("") +
             facet_grid(period~RCP) +
             theme(axis.ticks = element_blank(),
                   axis.text = element_blank()) +
             scale_fill_gradientn("Change in %",  colors = rev(as.vector(coolwarm(25))), limits=c(-maxabs, maxabs))

plot3


png(paste0(dir, "plot_ex08.png"), height = 800, width=1000)
  plot(plot3 + theme(text=element_text(size=20),
                     legend.text = element_text(size=26),
                     plot.title = element_text(size=34),
                     legend.key.size = unit(1, 'cm'),
                     legend.title = element_text(size=30)))
dev.off()
## png 
##   2