loading packages and data
require(xlsx)
## Loading required package: xlsx
## Loading required package: rJava
## Loading required package: xlsxjars
require(e1071)
## Loading required package: e1071
require(caret)
## Loading required package: caret
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
require(tidyverse)
## Loading required package: tidyverse
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v tibble 1.4.2 v purrr 0.2.4
## v tidyr 0.7.2 v dplyr 0.7.4
## v readr 1.1.1 v stringr 1.2.0
## v tibble 1.4.2 v forcats 0.2.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
require(cluster)
## Loading required package: cluster
require(factoextra)
## Loading required package: factoextra
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
build <- read.xlsx('buildingsDoc.xlsx', 1)
preprocessing data to scale to range [0, 1]
pp <- preProcess(build[,5:11], method=c("range"))
transformed <- predict(pp, build[,5:11])
making variables
train <- transformed[1:2926, ]
test <- transformed[2927:nrow(transformed), ]
functions to test accuracy of models
mse <- function(actual, pred){ #meaning???????
ans <- mean((actual - pred) ^ 2)
return(ans)
}
r2 <- function(actual, pred){ #how well future samples to be predicted by model
ans <- 1 - (sum((actual - pred)^2)/sum((actual - mean(actual))^2))
return(ans)
}
ex_var <- function(actual, pred){ #how much variance of y explained by x, 1 is best
ans <- 1 - var(actual - pred)/var(actual)
return(ans)
}
making multiple regression models for each dependent variable
model <- lm(WBE ~ TEMP + HUMID + SOLAR + WIND, data = train)
pred <- predict(model, test[, 1:4])
print(paste0('MSE = ', mse(test$WBE, pred), #MSE = 0.034
', r^2 = ', r2(test$WBE, pred), #r^2 = 0.30
', explained variance = ', ex_var(test$WBE, pred))) #0.33
## [1] "MSE = 0.0343368911346634, r^2 = 0.298673360530942, explained variance = 0.325951228557521"
plot(pred, type = 'l', col = 'red', main = 'WBE')
lines(test$WBE, col = 'green')
legend(0, 0.9, legend = c("prediction", "actual"), col = c("red", "green"),
cex = 0.7, box.lty = 0, pch = 19)

model <- lm(WBCW ~ TEMP + HUMID + SOLAR + WIND, data = train)
pred <- predict(model, test[, 1:4])
print(paste0('MSE = ', mse(test$WBCW, pred), #MSE = 0.0071
', r^2 = ', r2(test$WBCW, pred), #r^2 = 0.41
', explained variance = ', ex_var(test$WBCW, pred))) #0.56
## [1] "MSE = 0.00708201074854426, r^2 = 0.412336240657542, explained variance = 0.564532721197946"
plot(pred, type = 'l', col = 'red', main = 'WBCW')
lines(test$WBCW, col = 'green')

model <- lm(WBHW ~ TEMP + HUMID + SOLAR, data = train) #don't use WIND, because p value for coefficient was 0.781
pred <- predict(model, test[, 1:3])
print(paste0('MSE = ', mse(test$WBHW, pred), #MSE = 0.025
', r^2 = ', r2(test$WBHW, pred), #r^2 = -0.37
', explained variance = ', ex_var(test$WBHW, pred))) #0.68
## [1] "MSE = 0.025205374555236, r^2 = -0.369134478059708, explained variance = 0.683746184085652"
plot(pred, type = 'l', col = 'red', main = 'WBHW')
lines(test$WBHW, col = 'green')

making support vector regression models for each dependent variable
model <- svm(WBE ~ TEMP + HUMID + SOLAR + WIND, data = train)
pred <- predict(model, test[, 1:4])
print(paste0('MSE = ', mse(test$WBE, pred), #MSE = 0.037
', r^2 = ', r2(test$WBE, pred), #r^2 = 0.24
', explained variance = ', ex_var(test$WBE, pred))) #0.33
## [1] "MSE = 0.0373625420760677, r^2 = 0.236874824413636, explained variance = 0.327974473527583"
plot(pred, type = 'l', col = 'red', main = 'WBE')
lines(test$WBE, col = 'green')

model <- svm(WBCW ~ TEMP + HUMID + SOLAR + WIND, data = train)
pred <- predict(model, test[, 1:4])
print(paste0('MSE = ', mse(test$WBCW, pred), #MSE = 0.0073
', r^2 = ', r2(test$WBCW, pred), #r^2 = 0.39
', explained variance = ', ex_var(test$WBCW, pred))) #0.55
## [1] "MSE = 0.00733582604935426, r^2 = 0.391274700489265, explained variance = 0.550968284719153"
plot(pred, type = 'l', col = 'red', main = 'WBCW')
lines(test$WBCW, col = 'green')

model <- svm(WBHW ~ TEMP + HUMID + SOLAR + WIND, data = train)
pred <- predict(model, test[, 1:4])
print(paste0('MSE = ', mse(test$WBHW, pred), #MSE = 0.030
', r^2 = ', r2(test$WBHW, pred), #r^2 = 0.-0.64
', explained variance = ', ex_var(test$WBHW, pred))) #0.61
## [1] "MSE = 0.0301665504087885, r^2 = -0.638621324919717, explained variance = 0.612454150511288"
plot(pred, type = 'l', col = 'red', main = 'WBHW')
lines(test$WBHW, col = 'green')

k means clustering code adapted from UC Business Analytics R Programming Guide
# function to compute average silhouette for k clusters
avg_sil <- function(k) {
km.res <- kmeans(build[,5:8], centers = k, nstart = 25)
ss <- silhouette(km.res$cluster, dist(build[,5:8]))
mean(ss[, 3])
}
# Compute and plot wss for k = 2 to k = 10
k.values <- 2:10
# extract avg silhouette for 2-10 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Average Silhouettes")

fviz_nbclust(build[,5:8], kmeans, method = "silhouette")

#2 clusters seems to be the optimal number
set.seed(1)
input <- build[,5:8]
pca <- princomp(input, cor=TRUE)
pc.comp <- pca$scores
pc.comp1 <- -1*pc.comp[,1]
pc.comp2 <- -1*pc.comp[,2]
components <- cbind(pc.comp1, pc.comp2)
cl <- kmeans(components, 2)
plot(pc.comp1, pc.comp2,col=cl$cluster)
points(cl$centers, pch=16)

clusters <- cl$cluster #extract the ones and twos as different groups