#reading in raw data
df1 <- read.csv('Data2.csv')
head(df1)
## date_time Dependent var1 var2 var3 var4 var5 var6
## 1 2015-10-11T07:00:00Z 414.9572 2 16 5.680167 0.7020470 7.591875 3.460000
## 2 2015-10-11T07:10:00Z 494.1149 2 30 5.814046 0.8561147 8.331474 2.673900
## 3 2015-10-11T07:20:00Z 541.9777 2 16 6.073006 0.7561278 8.426399 3.109900
## 4 2015-10-11T07:30:00Z 352.6545 2 16 5.118218 0.7872996 8.143275 2.726475
## 5 2015-10-11T07:40:00Z 333.5432 2 16 5.083811 0.9556038 7.773475 2.469100
## 6 2015-10-11T07:50:00Z 547.4465 2 16 6.243423 0.8838568 8.811600 3.766875
## var7 var8
## 1 1.4306272 10.62770
## 2 1.1773447 10.24751
## 3 0.9336286 11.29218
## 4 0.9562747 10.82755
## 5 1.0673733 10.77063
## 6 0.9356092 11.04803
#checking original data sets
glimpse(df1)
## Rows: 144
## Columns: 10
## $ date_time <chr> "2015-10-11T07:00:00Z", "2015-10-11T07:10:00Z", "2015-10-11T~
## $ Dependent <dbl> 414.9572, 494.1149, 541.9777, 352.6545, 333.5432, 547.4465, ~
## $ var1 <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ var2 <int> 16, 30, 16, 16, 16, 16, 16, 16, 16, 16, 30, 30, 30, 30, 16, ~
## $ var3 <dbl> 5.680167, 5.814046, 6.073006, 5.118218, 5.083811, 6.243423, ~
## $ var4 <dbl> 0.7020470, 0.8561147, 0.7561278, 0.7872996, 0.9556038, 0.883~
## $ var5 <dbl> 7.591875, 8.331474, 8.426399, 8.143275, 7.773475, 8.811600, ~
## $ var6 <dbl> 3.460000, 2.673900, 3.109900, 2.726475, 2.469100, 3.766875, ~
## $ var7 <dbl> 1.4306272, 1.1773447, 0.9336286, 0.9562747, 1.0673733, 0.935~
## $ var8 <dbl> 10.627695, 10.247507, 11.292180, 10.827546, 10.770628, 11.04~
skim(df1)
Name                   | df1
Number of rows         | 144
Number of columns      | 10
Column type frequency: |
  character            | 1
  numeric              | 9
Group variables        | None
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
date_time | 0 | 1 | 20 | 20 | 0 | 144 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Dependent | 0 | 1 | 651.17 | 547.47 | -10.56 | 191.61 | 546.04 | 1031.44 | 1690.44 | ▇▇▃▂▃ |
var1 | 0 | 1 | 2.23 | 1.14 | 1.00 | 2.00 | 2.00 | 2.00 | 7.00 | ▇▁▁▁▁ |
var2 | 0 | 1 | 101.76 | 179.86 | 16.00 | 16.00 | 30.00 | 30.00 | 701.00 | ▇▁▁▁▁ |
var3 | 0 | 1 | 5.94 | 2.40 | 1.87 | 4.31 | 5.81 | 7.52 | 12.09 | ▆▇▇▃▂ |
var4 | 0 | 1 | 1.04 | 0.40 | 0.26 | 0.79 | 0.98 | 1.39 | 2.12 | ▂▇▅▅▁ |
var5 | 0 | 1 | 9.41 | 3.52 | 3.28 | 7.30 | 9.01 | 11.90 | 18.51 | ▅▇▅▃▁ |
var6 | 0 | 1 | 3.12 | 1.37 | 1.50 | 1.70 | 3.08 | 3.78 | 7.50 | ▇▇▃▁▁ |
var7 | 0 | 1 | 11.14 | 21.90 | 0.88 | 1.30 | 2.54 | 6.01 | 82.00 | ▇▁▁▁▁ |
var8 | 0 | 1 | 13.90 | 3.43 | 4.96 | 10.92 | 13.69 | 17.22 | 19.54 | ▁▅▇▅▇ |
nrow(df1)
## [1] 144
head(df1$date_time)
## [1] "2015-10-11T07:00:00Z" "2015-10-11T07:10:00Z" "2015-10-11T07:20:00Z"
## [4] "2015-10-11T07:30:00Z" "2015-10-11T07:40:00Z" "2015-10-11T07:50:00Z"
class(df1$date_time)
## [1] "character"
df1$date_time1 <- AsDateTime(df1$date_time)
class(df1$date_time1)
## [1] "POSIXct" "POSIXt"
head(df1)
## date_time Dependent var1 var2 var3 var4 var5 var6
## 1 2015-10-11T07:00:00Z 414.9572 2 16 5.680167 0.7020470 7.591875 3.460000
## 2 2015-10-11T07:10:00Z 494.1149 2 30 5.814046 0.8561147 8.331474 2.673900
## 3 2015-10-11T07:20:00Z 541.9777 2 16 6.073006 0.7561278 8.426399 3.109900
## 4 2015-10-11T07:30:00Z 352.6545 2 16 5.118218 0.7872996 8.143275 2.726475
## 5 2015-10-11T07:40:00Z 333.5432 2 16 5.083811 0.9556038 7.773475 2.469100
## 6 2015-10-11T07:50:00Z 547.4465 2 16 6.243423 0.8838568 8.811600 3.766875
## var7 var8 date_time1
## 1 1.4306272 10.62770 2015-10-11 07:00:00
## 2 1.1773447 10.24751 2015-10-11 07:10:00
## 3 0.9336286 11.29218 2015-10-11 07:20:00
## 4 0.9562747 10.82755 2015-10-11 07:30:00
## 5 1.0673733 10.77063 2015-10-11 07:40:00
## 6 0.9356092 11.04803 2015-10-11 07:50:00
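AsDateTime() comes from one of the packages loaded earlier (not shown here). If it is unavailable, the ISO-8601 UTC timestamps can be parsed with base R instead; a minimal sketch, where date_time2 is a hypothetical comparison column and the format string is assumed from head(df1$date_time):
#base-R alternative: parse "YYYY-MM-DDTHH:MM:SSZ" strings as POSIXct in UTC
df1$date_time2 <- as.POSIXct(df1$date_time, format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
#should be TRUE if both parsers treat the timestamps as UTC
all.equal(df1$date_time1, df1$date_time2)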
#Check for NULLs or NAs
#is.null() only confirms the data frame itself is not NULL; the NA check is sketched below
is.null(df1)
## [1] FALSE
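A minimal base-R sketch of checking for missing values inside the data frame (the is.null() call above does not test this):
anyNA(df1)            #TRUE if any NA appears anywhere in the data frame
colSums(is.na(df1))   #count of NAs per column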
#Statistical Summary of original data
# Visualizations via Box plots
meltData <- melt(df1)
## Using date_time as id variables
## Warning: attributes are not identical across measure variables; they will be
## dropped
p <- ggplot(meltData, aes(factor(variable), value))
p + geom_boxplot() + facet_wrap(~variable, scales = "free") +
  theme_economist() + scale_colour_economist()
#Selecting all columns except the first column: date_time which is a string
df2 <- df1 %>% dplyr::select(!c(date_time))
head(df2)
## Dependent var1 var2 var3 var4 var5 var6 var7 var8
## 1 414.9572 2 16 5.680167 0.7020470 7.591875 3.460000 1.4306272 10.62770
## 2 494.1149 2 30 5.814046 0.8561147 8.331474 2.673900 1.1773447 10.24751
## 3 541.9777 2 16 6.073006 0.7561278 8.426399 3.109900 0.9336286 11.29218
## 4 352.6545 2 16 5.118218 0.7872996 8.143275 2.726475 0.9562747 10.82755
## 5 333.5432 2 16 5.083811 0.9556038 7.773475 2.469100 1.0673733 10.77063
## 6 547.4465 2 16 6.243423 0.8838568 8.811600 3.766875 0.9356092 11.04803
## date_time1
## 1 2015-10-11 07:00:00
## 2 2015-10-11 07:10:00
## 3 2015-10-11 07:20:00
## 4 2015-10-11 07:30:00
## 5 2015-10-11 07:40:00
## 6 2015-10-11 07:50:00
#plot the dependent variable over time to get a feel for the underlying pattern
df3 <- df2 %>% dplyr::select(c(date_time1, Dependent))
#display time plot
ggplot(data = df3, aes(x = date_time1, y = Dependent, group = 1)) +
  geom_line(color = "red") +
  geom_point(color = "blue") +
  ggtitle("Fig1: Time plot of Dependent Variable") +
  theme_economist()
From the boxplots, var1, var2, and var7 appeared to be highly skewed.
The dependent variable did not exhibit any strong trend or seasonality.
#scaled data
df.scaled <- scale(df2[,1:9])
df2.scaled<- data.frame(df.scaled)
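Because the models below are fit on standardized data, their fitted values and predictions are in standardized units. A minimal sketch of mapping such values back to the original scale of the dependent variable (unscale_dep is a hypothetical helper; the constants come from the unscaled df2):
dep_mean <- mean(df2$Dependent)
dep_sd <- sd(df2$Dependent)
#convert a vector of scaled predictions back to the original units
unscale_dep <- function(pred_scaled) pred_scaled * dep_sd + dep_mean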
#fit a regression model
#base.model <- lm(Dependent ~. ,data = df2[,1:9]) unscaled data produced same results
base.model <- lm(Dependent~., data=df2.scaled)
#view the output of the base regression model
summary(base.model)
##
## Call:
## lm(formula = Dependent ~ ., data = df2.scaled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71292 -0.11044 -0.02562 0.10843 0.76764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.562e-16 1.852e-02 0.000 1.000
## var1 1.734e-02 2.231e-02 0.777 0.438
## var2 1.827e-02 2.612e-02 0.699 0.485
## var3 6.830e-01 1.376e-01 4.962 2.06e-06 ***
## var4 6.091e-02 6.562e-02 0.928 0.355
## var5 2.111e-01 1.315e-01 1.606 0.111
## var6 1.046e-01 6.968e-02 1.501 0.136
## var7 1.118e-01 2.693e-02 4.152 5.82e-05 ***
## var8 -1.245e-03 2.098e-02 -0.059 0.953
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2222 on 135 degrees of freedom
## Multiple R-squared: 0.9534, Adjusted R-squared: 0.9506
## F-statistic: 345.1 on 8 and 135 DF, p-value: < 2.2e-16
# Compute the analysis of variance
res.aov <- aov(base.model, data = df2.scaled)
# Summary of the analysis
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## var1 1 8.29 8.29 167.840 < 2e-16 ***
## var2 1 31.27 31.27 633.124 < 2e-16 ***
## var3 1 95.37 95.37 1931.260 < 2e-16 ***
## var4 1 0.07 0.07 1.374 0.2432
## var5 1 0.23 0.23 4.579 0.0342 *
## var6 1 0.26 0.26 5.317 0.0226 *
## var7 1 0.85 0.85 17.244 5.79e-05 ***
## var8 1 0.00 0.00 0.004 0.9528
## Residuals 135 6.67 0.05
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances check
plot(res.aov, 1)
# 2. Normality of residual check
plot(res.aov, 2)
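The residual and QQ plots are visual checks; formal tests can complement them. A minimal sketch, assuming the lmtest package is installed (it is not loaded above):
#Breusch-Pagan test for heteroskedasticity of the regression residuals
lmtest::bptest(base.model)
#Shapiro-Wilk test for normality of the residuals
shapiro.test(residuals(base.model))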
headers <- c("var1","var2","var3","var4","var5","var6","var7","var8")
newdata <- df2.scaled[headers]
correlation <- cor(newdata)
corrplot(correlation, method = 'number', type = 'upper', bg = "lightblue")
mtext("Fig2: Correlation Matrix", at = .95, line = -15, cex = 1.2)
#create horizontal bar chart to display each VIF value
barplot(car::vif(base.model), main = "Fig3: VIF Values on the Base Linear Model",
        horiz = TRUE, col = "gold", las = 2, cex.names = .53, xlim = c(0, 5))
#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)
Note: both scaled and unscaled data exhibited similar results, so only the scaled-data results are published.
The ANOVA residual plot showed some pattern, which indicated some heteroskedasticity.
The QQ plot had some fraying at the edges but was otherwise quite normal in the middle. This is commonly seen in residuals; normality can usually be improved with a larger sample size.
Var3, var4, var5, and var6 appeared to be correlated per Fig 2 (Correlation Matrix).
In addition, the Fig 3 VIF chart confirmed multicollinearity among these 4 predictor variables: their VIF values surpassed 5.
The significant predictors were var3 and var7. However, var7 is highly skewed, so only var3 was selected for prediction purposes (all other predictors were dropped).
Dropping the remaining predictors also removed the ones causing multicollinearity issues (an automated cross-check of this manual selection is sketched below).
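The manual trimming above is analogous to stepwise regression; as a cross-check, an automated stepwise search could be run on the base model. A minimal sketch, assuming the MASS package is installed:
#stepwise selection by AIC in both directions, starting from the full base model
step.model <- MASS::stepAIC(base.model, direction = "both", trace = FALSE)
summary(step.model)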
# Split the data into training and test set: 80/20 ratio
set.seed(123)
training.samples <- df2.scaled$Dependent %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- df2.scaled[training.samples, ]
test.data <- df2.scaled[-training.samples, ]
ggplot(train.data, aes(var3, Dependent) ) +
geom_point() +
stat_smooth()+theme_economist()+ ggtitle("Fig4a: Scatter Plot of Dependent vs. Var3")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Build the model
linear_model <- lm(Dependent ~ var3, data = train.data)
# Make predictions
predictions <- linear_model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$Dependent),
R2 = R2(predictions, test.data$Dependent)
)
## RMSE R2
## 1 0.256342 0.9417951
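For reference, the two caret metrics above reduce to simple formulas; a minimal sketch of computing them by hand on the same test-set predictions (caret's R2() defaults to the squared correlation):
obs <- test.data$Dependent
rmse_manual <- sqrt(mean((predictions - obs)^2))   #root-mean-square error
r2_manual <- cor(predictions, obs)^2               #squared correlation
c(RMSE = rmse_manual, R2 = r2_manual)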
ggplot(train.data, aes(var3, Dependent) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ x)+ggtitle("Fig4b: Plot of linear model")+theme_economist()
lm(Dependent ~ poly(var3, 2, raw = TRUE), data = train.data)
##
## Call:
## lm(formula = Dependent ~ poly(var3, 2, raw = TRUE), data = train.data)
##
## Coefficients:
## (Intercept) poly(var3, 2, raw = TRUE)1
## -0.06353 0.97281
## poly(var3, 2, raw = TRUE)2
## 0.07622
lm(Dependent ~ poly(var3, 6, raw = TRUE), data = train.data) %>%
summary()
##
## Call:
## lm(formula = Dependent ~ poly(var3, 6, raw = TRUE), data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54114 -0.06936 -0.00251 0.05777 0.43354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.17129 0.02326 -7.364 3.59e-11 ***
## poly(var3, 6, raw = TRUE)1 1.19064 0.06152 19.352 < 2e-16 ***
## poly(var3, 6, raw = TRUE)2 0.49661 0.08047 6.172 1.18e-08 ***
## poly(var3, 6, raw = TRUE)3 0.03437 0.07618 0.451 0.652771
## poly(var3, 6, raw = TRUE)4 -0.22612 0.05905 -3.830 0.000215 ***
## poly(var3, 6, raw = TRUE)5 -0.05059 0.02133 -2.371 0.019477 *
## poly(var3, 6, raw = TRUE)6 0.04057 0.01248 3.250 0.001534 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1494 on 109 degrees of freedom
## Multiple R-squared: 0.9794, Adjusted R-squared: 0.9783
## F-statistic: 865 on 6 and 109 DF, p-value: < 2.2e-16
Polynomial terms beyond the third are less significant.
Let’s keep it simple by using only the first three terms.
lm(Dependent ~ poly(var3, 3, raw = TRUE), data = train.data) %>%
summary()
##
## Call:
## lm(formula = Dependent ~ poly(var3, 3, raw = TRUE), data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43116 -0.07877 0.00143 0.06172 0.49601
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.11306 0.02002 -5.648 1.25e-07 ***
## poly(var3, 3, raw = TRUE)1 1.33672 0.03452 38.726 < 2e-16 ***
## poly(var3, 3, raw = TRUE)2 0.16499 0.01504 10.971 < 2e-16 ***
## poly(var3, 3, raw = TRUE)3 -0.16226 0.01380 -11.761 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.16 on 112 degrees of freedom
## Multiple R-squared: 0.9758, Adjusted R-squared: 0.9751
## F-statistic: 1503 on 3 and 112 DF, p-value: < 2.2e-16
# Build the model
poly_model <- lm(Dependent ~ poly(var3, 3, raw = TRUE), data = train.data)
# Make predictions
predictions_poly <- poly_model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions_poly, test.data$Dependent),
R2 = R2(predictions_poly, test.data$Dependent)
)
## RMSE R2
## 1 0.1494827 0.9750068
ggplot(train.data, aes(var3, Dependent) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ poly(x, 3, raw = TRUE))+ggtitle("Fig4c: Plot of Polynomial model of order 3")+theme_economist()
# Build the model
logmodel <- lm(Dependent ~ log(var3), data = train.data)
## Warning in log(var3): NaNs produced
# Make predictions
predictions_log <- logmodel %>% predict(test.data)
## Warning in log(var3): NaNs produced
predictions_log
## 3 6 11 13 14 25
## -0.84593182 -0.24365814 NaN 1.07475499 1.11763687 1.94959701
## 29 37 42 45 49 59
## NaN NaN NaN NaN NaN NaN
## 60 72 80 81 91 94
## NaN NaN NaN NaN 1.14012265 0.92375856
## 102 104 106 118 125 135
## -1.74546082 NaN 1.54357989 1.87778880 1.36151403 NaN
## 136 137 139 144
## -0.04661499 0.54244418 NaN NaN
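The NaNs occur because the standardized var3 is centered at zero and takes negative values, which log() cannot handle. A minimal sketch of one common workaround, shifting the predictor to be strictly positive before taking the log (logmodel2 is a hypothetical name, the offset of 1 is arbitrary, and test values below the training minimum would still produce NaNs):
#shift var3 so the smallest training value maps to 1, then fit on the log scale
shift <- 1 - min(train.data$var3)
logmodel2 <- lm(Dependent ~ log(var3 + shift), data = train.data)
predictions_log2 <- predict(logmodel2, newdata = test.data)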
library(splines)
knots <- quantile(train.data$var3, p = c(0.25, 0.5, 0.75))
# Build the model
spline_model <- lm (Dependent~ bs(var3, knots = knots), data = train.data)
# Make predictions
predictions_spline <- spline_model %>% predict(test.data)
## Warning in bs(var3, degree = 3L, knots = c(`25%` = -0.701194304800093, `50%`
## = -0.0954071289926184, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases
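The warning appears because some test-set var3 values fall outside the training-data range, which bs() uses for its default boundary knots. A minimal sketch of one way to avoid it, widening the boundary knots to span the full data set (spline_model2 and predictions_spline2 are hypothetical names):
#set boundary knots from the full range of var3 so test predictions stay inside the basis support
spline_model2 <- lm(Dependent ~ bs(var3, knots = knots,
                                   Boundary.knots = range(df2.scaled$var3)),
                    data = train.data)
predictions_spline2 <- predict(spline_model2, test.data)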
# Model performance
data.frame(
RMSE = RMSE(predictions_spline, test.data$Dependent),
R2 = R2(predictions_spline, test.data$Dependent)
)
## RMSE R2
## 1 0.1732723 0.9417951
ggplot(train.data, aes(var3, Dependent) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ splines::bs(x, df = 3))+ggtitle("Fig4d: Plot of Spline Model")+theme_economist()
The 3-knot spline model is a slight improvement over the 3rd-order polynomial model.
Let’s try an automated method: a generalized additive model (GAM) fits a spline regression automatically. Can it do any better?
# Build the model
GAMmodel <- gam(Dependent ~ s(var3), data = train.data)
# Make predictions
predictions_GAM <- GAMmodel %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions_GAM, test.data$Dependent),
R2 = R2(predictions_GAM, test.data$Dependent)
)
## RMSE R2
## 1 0.2443233 0.944182
ggplot(train.data, aes(var3, Dependent) ) +
geom_point() +
stat_smooth(method = gam, formula = y ~ s(x))+ggtitle("Fig4e: Plot of GAM Model")+theme_economist()
We started out with 8 predictor variables and 1 dependent variable.
Using a linear model as our baseline, we were able to reduce the 8 predictors to only one, var3. The methodology I used is analogous to a stepwise regression technique: methodically trimming off predictors that were statistically insignificant or that exhibited high covariance with other predictors. This reduced the complexity of dealing with multiple predictors (high dimensionality).
Furthermore, var3 has a high correlation with the dependent variable (0.9621).
Once the linear model was built, all subsequent non-linear models used var3 as their predictor variable.
The 3 non-linear models built were:
Polynomial of order 3
3-knot spline
GAM model
Note: a log model was also tried, but it produced NaNs and was discarded from further analysis.
Linear regression, RMSE = 0.2823
Polynomial, RMSE = 0.1595
Spline, RMSE = 0.1465
GAM, RMSE = 0.2238
The manually built 3-knot spline model had the lowest RMSE, followed by the polynomial model. The automated GAM model did not fare as well, and the linear model had the worst performance. (The test-set RMSEs of all four models are collected in the sketch below.)
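For completeness, a minimal sketch of gathering the four test-set RMSEs into one comparison table, reusing the prediction vectors created in the chunks above:
#side-by-side test-set RMSE for each candidate model
obs <- test.data$Dependent
data.frame(
  model = c("linear", "poly(3)", "spline(3 knots)", "GAM"),
  RMSE  = c(RMSE(predictions, obs),
            RMSE(predictions_poly, obs),
            RMSE(predictions_spline, obs),
            RMSE(predictions_GAM, obs))
)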