##Set the work directory
##Invoke required libraries
library("tidyr")
## Warning: package 'tidyr' was built under R version 3.6.2
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 3.6.2
library("psych")
## Warning: package 'psych' was built under R version 3.6.2
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library("DAAG")
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.1
library("lubridate")
## Warning: package 'lubridate' was built under R version 3.6.1
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
##Load Data from the CSV file
halo <- read.csv('HalloweenTableau.csv')
hal <- halo
hal <- separate(hal,"Date",c("Month","Date","Year"),sep = "/")
##Removing the orginal DateTime column and reordering sequence
hal <- hal[,-c(4)]
hal <- hal[,c(1,2,3,5,6,7,8,9,4)]
#Data Encoding for Categorical Variables
hal$Day.of.Week <- factor(hal$Day.of.Week,
levels = c('Sunday','Monday','Tuesday',
'Wednesday','Thursday','Friday','Saturday'),
labels = c(1,2,3,4,5,6,7))
hal$Time <- factor(hal$Time,
levels = c('6:00pm', '6:30pm','7:00pm','7:30pm','8:00pm','8:15pm'),
labels = c(1,2,3,4,5,6))
#EDA
par(mfrow=c(2,3))
plot(hal$Count~hal$Day.of.Week,pch=20,color='blue',xlab = 'Count', ylab = 'Day')
plot(hal$Count~hal$Time,pch=20,color='blue',xlab = 'Count', ylab = 'Time')
plot(hal$Count~hal$Temprature,pch=20,color='blue',xlab = 'Count', ylab = 'Temprature')
## Warning in plot.window(...): "color" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "color" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "color" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "color" is not a
## graphical parameter
## Warning in box(...): "color" is not a graphical parameter
## Warning in title(...): "color" is not a graphical parameter
plot(hal$Count~hal$WindSpeed,pch=20,color='blue',xlab = 'Count', ylab = 'Wind Speed')
## Warning in plot.window(...): "color" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "color" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "color" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "color" is not a
## graphical parameter
## Warning in box(...): "color" is not a graphical parameter
## Warning in title(...): "color" is not a graphical parameter
plot(hal$Count~hal$Precipitation,pch=20,color='blue',xlab = 'Count', ylab = 'Precipitation')
## Warning in plot.window(...): "color" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "color" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "color" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "color" is not a
## graphical parameter
## Warning in box(...): "color" is not a graphical parameter
## Warning in title(...): "color" is not a graphical parameter
par(mfrow=c(1,2))
hist(hal$Temprature, xlab = 'Temprature', col = 'light green')
hist(hal$Count, xlab = 'Count', col = 'light green')
##Model Building
regressor1 <- lm(formula = Count~Day.of.Week+Time+Temprature+WindSpeed+Precipitation, data = hal)
regressor2 <- lm(formula = Count~Time+Temprature+WindSpeed+Precipitation, data = hal)
regressor3 <- lm(formula = Count~Time+WindSpeed+Precipitation, data = hal)
regressor4 <- lm(formula = Count~Time+Day.of.Week+WindSpeed+Precipitation, data = hal)
regressor5 <- lm(formula = Count~Time+Day.of.Week+Precipitation, data = hal)
regressor6 <- lm(formula = Count~Time+Precipitation, data = hal)
##Evaluating the Models comparing Rsquared and Adj R-squared
hal_rsq <- data.frame(Rsq = summary(regressor1)$r.squared,
AdjRsq = summary(regressor1)$adj.r.squared)
hal_rsq <- rbind(hal_rsq, list(summary(regressor2)$r.squared,
summary(regressor2)$adj.r.squared))
hal_rsq <- rbind(hal_rsq, list(summary(regressor3)$r.squared,
summary(regressor3)$adj.r.squared))
hal_rsq <- rbind(hal_rsq, list(summary(regressor4)$r.squared,
summary(regressor4)$adj.r.squared))
hal_rsq <- rbind(hal_rsq, list(summary(regressor5)$r.squared,
summary(regressor3)$adj.r.squared))
hal_rsq <- rbind(hal_rsq, list(summary(regressor6)$r.squared,
summary(regressor4)$adj.r.squared))
hal_rsq
## Rsq AdjRsq
## 1 0.8646598 0.8275076
## 2 0.8178278 0.7922597
## 3 0.8175874 0.7955721
## 4 0.8629601 0.8287001
## 5 0.8604348 0.7955721
## 6 0.8012790 0.8287001
#Based on the output, we can conclude that given variables, Regressor4 is the optimum regression model
##Model Validation
KCV=cv.lm(data=hal, regressor4, m=10, seed=123)
## Analysis of Variance Table
##
## Response: Count
## Df Sum Sq Mean Sq F value Pr(>F)
## Time 5 255678 51136 56.75 < 2e-16 ***
## Day.of.Week 6 37992 6332 7.03 1.6e-05 ***
## WindSpeed 1 1012 1012 1.12 0.29
## Precipitation 1 365 365 0.40 0.53
## Residuals 52 46854 901
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in cv.lm(data = hal, regressor4, m = 10, seed = 123):
##
## As there is >1 explanatory variable, cross-validation
## predicted values for a fold are not a linear function
## of corresponding overall predicted values. Lines that
## are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 6
## 6 12 37 57 59 65
## Predicted 13.5 38.0 -23.6 181.7 159.64 132.0
## cvpred 14.3 42.6 -29.8 186.8 165.98 136.4
## Count 9.0 20.0 0.0 167.0 161.00 107.0
## CV residual -5.3 -22.6 29.8 -19.8 -4.98 -29.4
##
## Sum of squares = 2711 Mean square = 452 n = 6
##
## fold 2
## Observations in test set: 7
## 1 2 35 39 40 50 52
## Predicted -12.72 107.5 87.3 122.1 137.4 159.7 206.5
## cvpred -8.36 117.2 97.5 136.6 146.9 165.7 205.4
## Count 0.00 75.0 68.0 91.0 124.0 138.0 226.0
## CV residual 8.36 -42.2 -29.5 -45.6 -22.9 -27.7 20.6
##
## Sum of squares = 6515 Mean square = 931 n = 7
##
## fold 3
## Observations in test set: 7
## 5 10 25 31 33 44 55
## Predicted 120.1 175.8 8.55 -24.1 117.0 124.41 36
## cvpred 108.9 179.5 6.09 -43.9 114.3 130.41 24
## Count 144.0 150.0 0.00 33.0 81.0 135.00 41
## CV residual 35.1 -29.5 -6.09 76.9 -33.3 4.59 17
##
## Sum of squares = 9477 Mean square = 1354 n = 7
##
## fold 4
## Observations in test set: 7
## 4 8 23 38 41 48 61
## Predicted 154.37 127.5 172.2 92.03 103.1 36.5 11.39
## cvpred 154.36 138.5 165.2 96.31 95.4 35.4 6.25
## Count 147.00 52.0 197.0 106.00 115.0 80.0 18.00
## CV residual -7.36 -86.5 31.8 9.69 19.6 44.6 11.75
##
## Sum of squares = 11162 Mean square = 1595 n = 7
##
## fold 5
## Observations in test set: 7
## 3 7 11 36 54 58 66
## Predicted 142 8.75 144.60 -8.59 65.629 193.95 33.0
## cvpred 150 9.16 148.03 -15.84 63.848 193.19 35.4
## Count 117 0.00 143.00 20.00 63.000 192.00 11.0
## CV residual -33 -9.16 -5.03 35.84 -0.848 -1.19 -24.4
##
## Sum of squares = 3076 Mean square = 439 n = 7
##
## fold 6
## Observations in test set: 7
## 13 16 21 32 45 46 60
## Predicted 27.4 185.36 194.30 87.0 156.0 171.3 53.0
## cvpred 40.8 192.15 191.36 77.9 146.4 164.9 49.3
## Count 0.0 187.00 195.00 119.0 188.0 187.0 66.0
## CV residual -40.8 -5.15 3.64 41.1 41.6 22.1 16.7
##
## Sum of squares = 5892 Mean square = 842 n = 7
##
## fold 7
## Observations in test set: 7
## 14 28 29 43 51 53 64
## Predicted 143.1 174.1 139.8 5.70 194.3 172.2 169.3
## cvpred 134.3 168.8 143.5 3.82 191.9 178.7 169.8
## Count 172.0 232.0 111.0 13.00 226.0 147.0 140.0
## CV residual 37.7 63.2 -32.5 9.18 34.1 -31.7 -29.8
##
## Sum of squares = 9611 Mean square = 1373 n = 7
##
## fold 8
## Observations in test set: 6
## 9 15 17 24 30 63
## Predicted 160.5 174.6 151.1 65.6 31.7 158.6
## cvpred 153.1 157.4 136.4 72.1 38.8 149.6
## Count 177.0 179.0 185.0 53.0 20.0 176.0
## CV residual 23.9 21.6 48.6 -19.1 -18.8 26.4
##
## Sum of squares = 4814 Mean square = 802 n = 6
##
## fold 9
## Observations in test set: 6
## 18 22 26 49 56 62
## Predicted 44.4 206.5 127.3 41.0 151.68 128.6
## cvpred 60.4 199.8 118.5 41.9 145.49 116.4
## Count 3.0 252.0 147.0 22.0 149.00 148.0
## CV residual -57.4 52.2 28.5 -19.9 3.51 31.6
##
## Sum of squares = 8245 Mean square = 1374 n = 6
##
## fold 10
## Observations in test set: 6
## 19 20 27 34 42 47
## Predicted 48.6 164 158.81 132.3 -9.67 140.01
## cvpred 61.3 170 153.32 153.9 -29.55 139.59
## Count 0.0 172 163.00 70.0 18.00 144.00
## CV residual -61.3 2 9.68 -83.9 47.55 4.41
##
## Sum of squares = 13162 Mean square = 2194 n = 6
##
## Overall (Sum over all 6 folds)
## ms
## 1131
#’ @title PRESS #’ @author Thomas Hopper #’ @description Returns the PRESS statistic (predictive residual sum of squares). #’ Useful for evaluating predictive power of regression models. #’ @param linear.model A linear regression model (class ‘lm’). Required.
PRESS <- function(linear.model) {
#' calculate the predictive residuals
pr <- residuals(linear.model)/(1-lm.influence(linear.model)$hat)
#' calculate the PRESS
PRESS <- sum(pr^2)
return(PRESS)
}
#’ @title MSPE #’ @author Yichen Qin #’ @description Returns the MSPE statistic (mean square prediction error). #’ @param linear.model A linear regression model (class ‘lm’). Required.
pred_r_squared <- function(linear.model) {
#' Use anova() to get the sum of squares for the linear model
lm.anova <- anova(linear.model)
#' Calculate the total sum of squares
tss <- sum(lm.anova$'Sum Sq')
# Calculate the predictive R^2
pred.r.squared <- 1-PRESS(linear.model)/(tss)
return(pred.r.squared)
}
pred_r_squared(regressor4)
## [1] 0.778
#Considering the very limited dataset and amount of uncertainity associated, I think predicted R-squared = 78.4% is quite aceeptable
#Prediction
hal$DateTime <- halo$Date.and.Time
hal$YearTime <- paste(hal$Year,hal$Time,sep = '/')
ggplot()+
geom_point(aes(x = hal$YearTime, y = hal$Count),
color = 'red') +
geom_point(aes(x = hal$YearTime, y = predict(regressor4, newdata = hal)),
color = 'blue') +
ggtitle('Trick or Treaster Prediction') +
xlab('Year and Time') +
ylab('Visitors')
#Add prediction to dataset and export it as csv file, which can then be used for Tableau
hal$Prediction <- regressor4$fitted.values
hal$Prediction <- round(hal$Prediction, digits = 0)
hal$Prediction <- ifelse(hal$Prediction < 0, 0, hal$Prediction)
head(hal)
## Month Date Year Day.of.Week Time Temprature WindSpeed Precipitation Count
## 1 10 31 2008 6 1 58 6 0 0
## 2 10 31 2008 6 2 56 3 0 75
## 3 10 31 2008 6 3 54 0 0 117
## 4 10 31 2008 6 4 54 0 0 147
## 5 10 31 2008 6 5 52 0 0 144
## 6 10 31 2008 6 6 50 0 0 9
## DateTime YearTime Prediction
## 1 10/31/2008 18:00 2008/1 0
## 2 10/31/2008 18:30 2008/2 108
## 3 10/31/2008 19:00 2008/3 142
## 4 10/31/2008 19:30 2008/4 154
## 5 10/31/2008 20:00 2008/5 120
## 6 10/31/2008 20:15 2008/6 13