##Set the working directory
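
The working-directory call itself is not shown in this report; a minimal sketch, where the path is only a placeholder for the folder that contains HalloweenTableau.csv:

setwd('path/to/halloween-project')   #placeholder path, adjust to your machine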

##Invoke required libraries

library("tidyr")
## Warning: package 'tidyr' was built under R version 3.6.2
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 3.6.2
library("psych")
## Warning: package 'psych' was built under R version 3.6.2
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library("DAAG")
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.1
library("lubridate")
## Warning: package 'lubridate' was built under R version 3.6.1
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

##Load Data from the CSV file

halo <- read.csv('HalloweenTableau.csv')
hal <- halo
hal <- separate(hal,"Date",c("Month","Date","Year"),sep = "/")

##Removing the original DateTime column and reordering the columns

hal <- hal[,-c(4)]
hal <- hal[,c(1,2,3,5,6,7,8,9,4)]

#Data Encoding for Categorical Variables

hal$Day.of.Week <- factor(hal$Day.of.Week,
                          levels = c('Sunday','Monday','Tuesday',
                                     'Wednesday','Thursday','Friday','Saturday'),
                          labels = c(1,2,3,4,5,6,7))

hal$Time <- factor(hal$Time,
                   levels = c('6:00pm', '6:30pm','7:00pm','7:30pm','8:00pm','8:15pm'),
                   labels = c(1,2,3,4,5,6))

#EDA

par(mfrow=c(2,3))
plot(hal$Count~hal$Day.of.Week, pch = 20, col = 'blue', xlab = 'Day of Week', ylab = 'Count')
plot(hal$Count~hal$Time, pch = 20, col = 'blue', xlab = 'Time', ylab = 'Count')
plot(hal$Count~hal$Temprature, pch = 20, col = 'blue', xlab = 'Temperature', ylab = 'Count')
plot(hal$Count~hal$WindSpeed, pch = 20, col = 'blue', xlab = 'Wind Speed', ylab = 'Count')
plot(hal$Count~hal$Precipitation, pch = 20, col = 'blue', xlab = 'Precipitation', ylab = 'Count')

par(mfrow=c(1,2))
hist(hal$Temprature, xlab = 'Temperature', col = 'lightgreen')
hist(hal$Count, xlab = 'Count', col = 'lightgreen')

##Model Building

regressor1 <- lm(formula = Count~Day.of.Week+Time+Temprature+WindSpeed+Precipitation, data = hal)

regressor2 <- lm(formula = Count~Time+Temprature+WindSpeed+Precipitation, data = hal)

regressor3 <- lm(formula = Count~Time+WindSpeed+Precipitation, data = hal)

regressor4 <- lm(formula = Count~Time+Day.of.Week+WindSpeed+Precipitation, data = hal)

regressor5 <- lm(formula = Count~Time+Day.of.Week+Precipitation, data = hal)

regressor6 <- lm(formula = Count~Time+Precipitation, data = hal)

##Evaluating the models by comparing R-squared and adjusted R-squared

hal_rsq <- data.frame(Rsq = summary(regressor1)$r.squared, 
                      AdjRsq = summary(regressor1)$adj.r.squared)

hal_rsq <- rbind(hal_rsq, list(summary(regressor2)$r.squared,
                               summary(regressor2)$adj.r.squared))

hal_rsq <- rbind(hal_rsq, list(summary(regressor3)$r.squared,
                               summary(regressor3)$adj.r.squared))

hal_rsq <- rbind(hal_rsq, list(summary(regressor4)$r.squared,
                               summary(regressor4)$adj.r.squared))

hal_rsq <- rbind(hal_rsq, list(summary(regressor5)$r.squared,
                               summary(regressor5)$adj.r.squared))

hal_rsq <- rbind(hal_rsq, list(summary(regressor6)$r.squared,
                               summary(regressor6)$adj.r.squared))

hal_rsq
##         Rsq    AdjRsq
## 1 0.8646598 0.8275076
## 2 0.8178278 0.7922597
## 3 0.8175874 0.7955721
## 4 0.8629601 0.8287001
## 5 0.8604348 0.7955721
## 6 0.8012790 0.8287001

#Based on the output, we can conclude that, given these variables, regressor4 is the best regression model
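
#To inspect the chosen model in more detail, its full coefficient summary can be printed (output omitted here):

summary(regressor4)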

##Model Validation

KCV <- cv.lm(data = hal, regressor4, m = 10, seed = 123)
## Analysis of Variance Table
## 
## Response: Count
##               Df Sum Sq Mean Sq F value  Pr(>F)    
## Time           5 255678   51136   56.75 < 2e-16 ***
## Day.of.Week    6  37992    6332    7.03 1.6e-05 ***
## WindSpeed      1   1012    1012    1.12    0.29    
## Precipitation  1    365     365    0.40    0.53    
## Residuals     52  46854     901                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in cv.lm(data = hal, regressor4, m = 10, seed = 123): 
## 
##  As there is >1 explanatory variable, cross-validation
##  predicted values for a fold are not a linear function
##  of corresponding overall predicted values.  Lines that
##  are shown for the different folds are approximate

## 
## fold 1 
## Observations in test set: 6 
##                6    12    37    57     59    65
## Predicted   13.5  38.0 -23.6 181.7 159.64 132.0
## cvpred      14.3  42.6 -29.8 186.8 165.98 136.4
## Count        9.0  20.0   0.0 167.0 161.00 107.0
## CV residual -5.3 -22.6  29.8 -19.8  -4.98 -29.4
## 
## Sum of squares = 2711    Mean square = 452    n = 6 
## 
## fold 2 
## Observations in test set: 7 
##                  1     2    35    39    40    50    52
## Predicted   -12.72 107.5  87.3 122.1 137.4 159.7 206.5
## cvpred       -8.36 117.2  97.5 136.6 146.9 165.7 205.4
## Count         0.00  75.0  68.0  91.0 124.0 138.0 226.0
## CV residual   8.36 -42.2 -29.5 -45.6 -22.9 -27.7  20.6
## 
## Sum of squares = 6515    Mean square = 931    n = 7 
## 
## fold 3 
## Observations in test set: 7 
##                 5    10    25    31    33     44 55
## Predicted   120.1 175.8  8.55 -24.1 117.0 124.41 36
## cvpred      108.9 179.5  6.09 -43.9 114.3 130.41 24
## Count       144.0 150.0  0.00  33.0  81.0 135.00 41
## CV residual  35.1 -29.5 -6.09  76.9 -33.3   4.59 17
## 
## Sum of squares = 9477    Mean square = 1354    n = 7 
## 
## fold 4 
## Observations in test set: 7 
##                  4     8    23     38    41   48    61
## Predicted   154.37 127.5 172.2  92.03 103.1 36.5 11.39
## cvpred      154.36 138.5 165.2  96.31  95.4 35.4  6.25
## Count       147.00  52.0 197.0 106.00 115.0 80.0 18.00
## CV residual  -7.36 -86.5  31.8   9.69  19.6 44.6 11.75
## 
## Sum of squares = 11162    Mean square = 1595    n = 7 
## 
## fold 5 
## Observations in test set: 7 
##               3     7     11     36     54     58    66
## Predicted   142  8.75 144.60  -8.59 65.629 193.95  33.0
## cvpred      150  9.16 148.03 -15.84 63.848 193.19  35.4
## Count       117  0.00 143.00  20.00 63.000 192.00  11.0
## CV residual -33 -9.16  -5.03  35.84 -0.848  -1.19 -24.4
## 
## Sum of squares = 3076    Mean square = 439    n = 7 
## 
## fold 6 
## Observations in test set: 7 
##                13     16     21    32    45    46   60
## Predicted    27.4 185.36 194.30  87.0 156.0 171.3 53.0
## cvpred       40.8 192.15 191.36  77.9 146.4 164.9 49.3
## Count         0.0 187.00 195.00 119.0 188.0 187.0 66.0
## CV residual -40.8  -5.15   3.64  41.1  41.6  22.1 16.7
## 
## Sum of squares = 5892    Mean square = 842    n = 7 
## 
## fold 7 
## Observations in test set: 7 
##                14    28    29    43    51    53    64
## Predicted   143.1 174.1 139.8  5.70 194.3 172.2 169.3
## cvpred      134.3 168.8 143.5  3.82 191.9 178.7 169.8
## Count       172.0 232.0 111.0 13.00 226.0 147.0 140.0
## CV residual  37.7  63.2 -32.5  9.18  34.1 -31.7 -29.8
## 
## Sum of squares = 9611    Mean square = 1373    n = 7 
## 
## fold 8 
## Observations in test set: 6 
##                 9    15    17    24    30    63
## Predicted   160.5 174.6 151.1  65.6  31.7 158.6
## cvpred      153.1 157.4 136.4  72.1  38.8 149.6
## Count       177.0 179.0 185.0  53.0  20.0 176.0
## CV residual  23.9  21.6  48.6 -19.1 -18.8  26.4
## 
## Sum of squares = 4814    Mean square = 802    n = 6 
## 
## fold 9 
## Observations in test set: 6 
##                18    22    26    49     56    62
## Predicted    44.4 206.5 127.3  41.0 151.68 128.6
## cvpred       60.4 199.8 118.5  41.9 145.49 116.4
## Count         3.0 252.0 147.0  22.0 149.00 148.0
## CV residual -57.4  52.2  28.5 -19.9   3.51  31.6
## 
## Sum of squares = 8245    Mean square = 1374    n = 6 
## 
## fold 10 
## Observations in test set: 6 
##                19  20     27    34     42     47
## Predicted    48.6 164 158.81 132.3  -9.67 140.01
## cvpred       61.3 170 153.32 153.9 -29.55 139.59
## Count         0.0 172 163.00  70.0  18.00 144.00
## CV residual -61.3   2   9.68 -83.9  47.55   4.41
## 
## Sum of squares = 13162    Mean square = 2194    n = 6 
## 
## Overall (Sum over all 6 folds) 
##   ms 
## 1131
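
The data frame returned by cv.lm can be used to recompute the overall cross-validated mean square reported above; a quick check, assuming the returned object keeps the original Count column alongside its cvpred column:

mean((KCV$cvpred - KCV$Count)^2)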

This calculates the PRESS statistic (predictive residual sum of squares); the lower, the better.

#' @title PRESS
#' @author Thomas Hopper
#' @description Returns the PRESS statistic (predictive residual sum of squares).
#'   Useful for evaluating the predictive power of regression models.
#' @param linear.model A linear regression model (class 'lm'). Required.

PRESS <- function(linear.model) {
  #' calculate the predictive residuals
  pr <- residuals(linear.model)/(1-lm.influence(linear.model)$hat)
  #' calculate the PRESS
  PRESS <- sum(pr^2)
  return(PRESS)
}
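
For example, the helper can be applied directly to the chosen model (output not shown here):

PRESS(regressor4)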

This calculates the predictive R-squared from the PRESS statistic and the total sum of squares; the higher, the better.

#' @title pred_r_squared
#' @author Yichen Qin
#' @description Returns the predictive R-squared, computed as 1 minus PRESS divided by
#'   the total sum of squares. Useful for judging how well the model predicts new data.
#' @param linear.model A linear regression model (class 'lm'). Required.

pred_r_squared <- function(linear.model) {
  #' Use anova() to get the sum of squares for the linear model
  lm.anova <- anova(linear.model)
  #' Calculate the total sum of squares
  tss <- sum(lm.anova$'Sum Sq')
  # Calculate the predictive R^2
  pred.r.squared <- 1-PRESS(linear.model)/(tss)
  
  return(pred.r.squared)
}

pred_r_squared(regressor4)
## [1] 0.778

#Considering the very limited dataset and the amount of uncertainty involved, I think a predicted R-squared of 77.8% is quite acceptable

#Prediction

hal$DateTime <- halo$Date.and.Time
hal$YearTime <- paste(hal$Year,hal$Time,sep = '/')

ggplot(hal, aes(x = YearTime)) +
  geom_point(aes(y = Count), color = 'red') +
  geom_point(aes(y = predict(regressor4, newdata = hal)), color = 'blue') +
  ggtitle('Trick or Treater Prediction') +
  xlab('Year and Time') +
  ylab('Visitors')

#Add predictions to the dataset and export it as a CSV file for use in Tableau (the export call is sketched after the preview below)

hal$Prediction <- regressor4$fitted.values
hal$Prediction <- round(hal$Prediction, digits = 0)

hal$Prediction <-  ifelse(hal$Prediction < 0, 0, hal$Prediction)
head(hal)
##   Month Date Year Day.of.Week Time Temprature WindSpeed Precipitation Count
## 1    10   31 2008           6    1         58         6             0     0
## 2    10   31 2008           6    2         56         3             0    75
## 3    10   31 2008           6    3         54         0             0   117
## 4    10   31 2008           6    4         54         0             0   147
## 5    10   31 2008           6    5         52         0             0   144
## 6    10   31 2008           6    6         50         0             0     9
##           DateTime YearTime Prediction
## 1 10/31/2008 18:00   2008/1          0
## 2 10/31/2008 18:30   2008/2        108
## 3 10/31/2008 19:00   2008/3        142
## 4 10/31/2008 19:30   2008/4        154
## 5 10/31/2008 20:00   2008/5        120
## 6 10/31/2008 20:15   2008/6         13
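
#The export call itself is not shown above; a minimal sketch, with the file name only a placeholder:

write.csv(hal, 'HalloweenPrediction.csv', row.names = FALSE)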