library(devtools)
library(rio)
library(stringr)
library(tseries)
library(lubridate)
library(tidyverse)
library(timetk)
library(readxl)
library(tidyquant)
library(scales)
library(forecast)   #  forecasting pkg
library(sweep)# Broom tidiers for forecast pk
library(timekit)
library(plotly)
library(broom)
library(tibble)
library(ggfortify)  #similar to broom,also helps autoplot of lm objects
library(gam) #Fit general additive model
library(highcharter)
library(dygraphs)
library(DT)
require(RColorBrewer)
setwd("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/time series")
#load excel file with rio
UptimeData<- rio::import("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/Time series/Manpower/Uptimebycategories08-29-2017b.xlsx")

DT::datatable(UptimeData) 
# replace white spaces in column names with _

names(UptimeData)<-stringr::str_replace_all(names(UptimeData),"\\s", "_")

UptimeData%>%head()
str(UptimeData)
## 'data.frame':    979 obs. of  8 variables:
##  $ Month-Yr            : chr  "Jan-2017" "Jan-2017" "Jan-2017" "Jan-2017" ...
##  $ Dur/EBRG            : chr  "Durability" "Durability" "Durability" "Durability" ...
##  $ priority_category_cd: chr  "P70" "P70" "P70" "P70" ...
##  $ build_phase         : chr  "Comp Veh" "Comp Veh" "Comp Veh" "Comp Veh" ...
##  $ Build_Category      : chr  "Comp Veh" "Comp Veh" "Comp Veh" "Comp Veh" ...
##  $ Day_of_Week         : chr  "Sun" "Mon" "Tue" "Wed" ...
##  $ vehicle_family_ref  : chr  "JT" "JT" "JT" "JT" ...
##  $ APG_Uptime          : num  0 0 0 0 0 0 0 1 1 1 ...
UptimeData=UptimeData%>%mutate(Month=stringr:: str_sub(`Month-Yr`, 1, 3))%>%mutate_if(is.character,factor)


UptimeData%>%head()
Durability=UptimeData%>%filter(`Dur/EBRG`=="Durability")%>%rename(Dur=`Dur/EBRG`)

DT::datatable(Durability)
str(Durability)
## 'data.frame':    838 obs. of  9 variables:
##  $ Month-Yr            : Factor w/ 8 levels "Apr-2017","Aug-2017",..: 4 4 4 4 4 4 4 3 3 3 ...
##  $ Dur                 : Factor w/ 2 levels "Durability","EBRG-Trans": 1 1 1 1 1 1 1 1 1 1 ...
##  $ priority_category_cd: Factor w/ 4 levels "P100","P50","P70",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ build_phase         : Factor w/ 10 levels "Comp Veh","J1",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Build_Category      : Factor w/ 5 levels "Comp Veh","J1",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Day_of_Week         : Factor w/ 7 levels "Fri","Mon","Sat",..: 4 2 6 7 5 1 3 4 2 6 ...
##  $ vehicle_family_ref  : Factor w/ 18 levels "BU","D2","DS",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ APG_Uptime          : num  0 0 0 0 0 0 0 1 1 1 ...
##  $ Month               : Factor w/ 8 levels "Apr","Aug","Feb",..: 4 4 4 4 4 4 4 3 3 3 ...
sum(is.na(Durability))
## [1] 7
dim(Durability)
## [1] 838   9
library(Amelia)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.4.1
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(Durability, main = "Missing values vs observed",col=c("blue","green"),y.cex=0.5)

apply(Durability,2,function(x) sum(is.na(x)))
##             Month-Yr                  Dur priority_category_cd 
##                    0                    0                    0 
##          build_phase       Build_Category          Day_of_Week 
##                    0                    7                    0 
##   vehicle_family_ref           APG_Uptime                Month 
##                    0                    0                    0
summary(Durability)
##      Month-Yr           Dur      priority_category_cd   build_phase 
##  May-2017:138   Durability:838   P100:269             VP      :372  
##  Jun-2017:121   EBRG-Trans:  0   P50 :  0             PS      :113  
##  Feb-2017:112                    P70 :487             Proto   :110  
##  Jul-2017:101                    P85 : 82             Comp Veh: 66  
##  Apr-2017:100                                         J1      : 53  
##  Jan-2017: 94                                         VP(A)   : 52  
##  (Other) :172                                         (Other) : 72  
##     Build_Category Day_of_Week vehicle_family_ref   APG_Uptime    
##  Comp Veh  : 66    Fri:125     KL     :103        Min.   :0.0000  
##  J1        : 53    Mon:121     JL     : 99        1st Qu.:0.4963  
##  Mule_Proto:161    Sat:115     LA-SRT : 83        Median :0.7599  
##  PS        :113    Sun:110     JT     : 66        Mean   :0.6764  
##  VP        :438    Thu:125     MP     : 60        3rd Qu.:0.9727  
##  NA's      :  7    Tue:121     RU     : 49        Max.   :1.0000  
##                    Wed:121     (Other):378                        
##      Month    
##  May    :138  
##  Jun    :121  
##  Feb    :112  
##  Jul    :101  
##  Apr    :100  
##  Jan    : 94  
##  (Other):172
smy=summary(Durability)%>%as.data.frame.array()

#smy=smy%>%replace_na(list(Dur = "",priority_category_cd="",Build_Category="",APG_Uptime="")) 
#replace NA variables in table
#smy=smy%>% replace_na(list(Dur = "",priority_category_cd= ""))

smy
Durability=na.omit(Durability)
ggplot(Durability, aes(APG_Uptime, Month, colour = Build_Category)) + geom_point()

ggplot(Durability, aes(APG_Uptime, Month, colour = Build_Category)) + geom_boxplot()+coord_flip()

str(Durability)
## 'data.frame':    831 obs. of  9 variables:
##  $ Month-Yr            : Factor w/ 8 levels "Apr-2017","Aug-2017",..: 4 4 4 4 4 4 4 3 3 3 ...
##  $ Dur                 : Factor w/ 2 levels "Durability","EBRG-Trans": 1 1 1 1 1 1 1 1 1 1 ...
##  $ priority_category_cd: Factor w/ 4 levels "P100","P50","P70",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ build_phase         : Factor w/ 10 levels "Comp Veh","J1",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Build_Category      : Factor w/ 5 levels "Comp Veh","J1",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Day_of_Week         : Factor w/ 7 levels "Fri","Mon","Sat",..: 4 2 6 7 5 1 3 4 2 6 ...
##  $ vehicle_family_ref  : Factor w/ 18 levels "BU","D2","DS",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ APG_Uptime          : num  0 0 0 0 0 0 0 1 1 1 ...
##  $ Month               : Factor w/ 8 levels "Apr","Aug","Feb",..: 4 4 4 4 4 4 4 3 3 3 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:7] 394 395 396 397 398 399 400
##   .. ..- attr(*, "names")= chr [1:7] "394" "395" "396" "397" ...
ggplot(Durability, aes(APG_Uptime, Day_of_Week, colour = Build_Category)) + geom_point()

 # ggplot(Durability, aes(x=Day_of_Week, y=APG_Uptime, group=Build_Category, color=Build_Category)) + 
 #  geom_line() +
 #  geom_point()+
 #  geom_errorbar(aes(ymin=APG_Uptime-sd, ymax=APG_Uptime+sd), width=.2,
 #                 position=position_dodge(0.05))
ggplot(Durability, aes(APG_Uptime, Month, colour =priority_category_cd)) + geom_point()

ggplot(Durability, aes(APG_Uptime,  Day_of_Week, colour =priority_category_cd)) + geom_point()

ggplot(Durability, aes(APG_Uptime,  Day_of_Week, colour = build_phase)) + geom_point()

ggplot(Durability, aes(APG_Uptime,  Month, colour = build_phase)) + geom_point()

ggplot(Durability, aes(APG_Uptime,  Day_of_Week, colour = vehicle_family_ref)) + geom_point()

ggplot(Durability, aes(APG_Uptime,  Month, colour = vehicle_family_ref)) + geom_point()

Two-way ANOVA with interaction effect
These two calls are equivalent
mod1 = aov(APG_Uptime ~ Month*vehicle_family_ref, data = Durability)

summary(mod1)
##                           Df Sum Sq Mean Sq F value   Pr(>F)    
## Month                      7   1.89  0.2703   5.085 1.18e-05 ***
## vehicle_family_ref        17  15.06  0.8861  16.668  < 2e-16 ***
## Month:vehicle_family_ref  68  24.09  0.3543   6.664  < 2e-16 ***
## Residuals                738  39.24  0.0532                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fitt=lm(APG_Uptime~Month+vehicle_family_ref+Day_of_Week+Build_Category+build_phase+priority_category_cd+Month*vehicle_family_ref+Day_of_Week*Build_Category+build_phase+priority_category_cd*Month+Month*Day_of_Week+Month*Build_Category+Month:build_phase   ,Durability)
summary(aov(fitt))%>%as.data.frame.list()
#options(dplyr.tibble.print =1e9)
#dplyr_tibble_print_original(50)
options(dplyr.print_max=100)


fitt=lm(APG_Uptime~Month+vehicle_family_ref+Day_of_Week+Build_Category+build_phase+priority_category_cd+Month*vehicle_family_ref+Day_of_Week*Build_Category+build_phase+priority_category_cd*Month+Month*Day_of_Week+Month*Build_Category+Month:build_phase   ,Durability)

summary(aov(fitt))
##                             Df Sum Sq Mean Sq F value   Pr(>F)    
## Month                        7  1.892  0.2703   6.183 5.13e-07 ***
## vehicle_family_ref          17 15.064  0.8861  20.267  < 2e-16 ***
## Day_of_Week                  6  1.430  0.2384   5.452 1.63e-05 ***
## Build_Category               2  5.851  2.9257  66.914  < 2e-16 ***
## build_phase                  4  0.818  0.2045   4.677 0.001000 ***
## priority_category_cd         2  0.461  0.2306   5.274 0.005350 ** 
## Month:vehicle_family_ref    67 20.528  0.3064   7.008  < 2e-16 ***
## Day_of_Week:Build_Category  24  1.033  0.0431   0.985 0.484256    
## Month:priority_category_cd  12  1.522  0.1268   2.900 0.000639 ***
## Month:Day_of_Week           42  1.498  0.0357   0.816 0.791087    
## Month:Build_Category         9  1.617  0.1797   4.110 3.72e-05 ***
## Month:build_phase            3  0.803  0.2676   6.120 0.000417 ***
## Residuals                  635 27.764  0.0437                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
DT::datatable( summary((fitt))%>%tidy()%>%filter(p.value<0.05))

7-way ANOVA

fitt=aov(APG_Uptime~Month+vehicle_family_ref+Day_of_Week+Build_Category+build_phase+priority_category_cd+Month*vehicle_family_ref+Day_of_Week*Build_Category+build_phase+priority_category_cd*Month+Month*Day_of_Week   ,Durability)
summary(fitt)
##                             Df Sum Sq Mean Sq F value   Pr(>F)    
## Month                        7  1.892  0.2703   5.794 1.58e-06 ***
## vehicle_family_ref          17 15.064  0.8861  18.994  < 2e-16 ***
## Day_of_Week                  6  1.430  0.2384   5.110 3.85e-05 ***
## Build_Category               2  5.851  2.9257  62.712  < 2e-16 ***
## build_phase                  4  0.818  0.2045   4.383  0.00167 ** 
## priority_category_cd         2  0.461  0.2306   4.943  0.00741 ** 
## Month:vehicle_family_ref    67 20.528  0.3064   6.568  < 2e-16 ***
## Day_of_Week:Build_Category  24  1.033  0.0431   0.923  0.57034    
## Month:priority_category_cd  12  1.522  0.1268   2.718  0.00135 ** 
## Month:Day_of_Week           42  1.498  0.0357   0.765  0.85970    
## Residuals                  647 30.184  0.0467                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glance(fitt)
tidy(fitt)
aovdata=broom::augment(fitt)

aovdata

There is clear trend in the residuals, the Fitted values increase as the residuals decrease. This is a violation of the ANOVA assumptions of constant residual variance. The normality assumption of the data in each group is clearly violated here too from the normality plot below.

ggplot(aovdata, aes(.fitted, .resid, colour = Month)) + geom_point() +
  xlab("Fitted Values") + ylab("Residuals")

autoplot(fitt,which = 1:2)

ggplot(aovdata, aes(.fitted, .resid, colour = Month)) + geom_point() +
  xlab("Fitted Values") + ylab("Residuals")+facet_wrap(~vehicle_family_ref )

ggplot(aovdata, aes(.fitted, .resid, colour = Month)) + geom_point() +
  xlab("Fitted Values") + ylab("Residuals")+facet_wrap(~build_phase  )

TukeyHSD(fitt, which = "build_phase")%>%as.data.frame.list()%>%rename(Adjusted_P_value=build_phase.p.adj)
TukeyHSD(fitt, which = "Month")%>%as.data.frame.list()%>%rename(Adjusted_P_value=Month.p.adj)
TukeyHSD(fitt, which = "vehicle_family_ref")%>%as.data.frame.list()%>%rename(Adjusted_P_value=vehicle_family_ref.p.adj)
TukeyHSD(fitt, which = "Day_of_Week")%>%as.data.frame.list()%>%rename(Adjusted_P_value=Day_of_Week.p.adj)
#TukeyHSD(fitt, which = "Build_Category ")

#TukeyHSD(fitt, which = "priority_category_cd ")
fitt.hsd = data.frame(TukeyHSD(fitt, which = "Month")$Month)

fitt.hsd$Comparison = row.names(fitt.hsd)

fitt.hsd
ggplot(fitt.hsd, aes(Comparison, y = diff, ymin = lwr, ymax = upr)) +
  geom_pointrange() + ylab("Difference in Mean APG Uptime by Month") +
  coord_flip()

fitt.hsd = data.frame(TukeyHSD(fitt, which = "build_phase")$build_phase)

fitt.hsd$Comparison = row.names(fitt.hsd)

fitt.hsd
ggplot(fitt.hsd, aes(Comparison, y = diff, ymin = lwr, ymax = upr)) +
  geom_pointrange() + ylab("Difference in Mean APG Uptime by build_phase") +
  coord_flip()

fitt.hsd = data.frame(TukeyHSD(fitt, which = "Day_of_Week")$Day_of_Week)

fitt.hsd$Comparison = row.names(fitt.hsd)

fitt.hsd
ggplot(fitt.hsd, aes(Comparison, y = diff, ymin = lwr, ymax = upr)) +
  geom_pointrange() + ylab("Difference in Mean APG Uptime by Day_of_Week") +
  coord_flip()

fitt.hsd = data.frame(TukeyHSD(fitt, which = "vehicle_family_ref")$vehicle_family_ref)

fitt.hsd$Comparison = row.names(fitt.hsd)

fitt.hsd
ggplot(fitt.hsd, aes(Comparison, y = diff, ymin = lwr, ymax = upr)) +
  geom_pointrange() + ylab("Difference in Mean APG Uptime by vehicle_family_ref") +
  coord_flip()+ theme(axis.text.y=element_text(angle=50, size=5, vjust=0.5))

broom::tidy(fitt)
broom::augment(fitt)
broom::confint_tidy(fitt)
broom::glance(fitt)
aug_anova=broom::augment_columns(fitt,Durability)
aug_anova
ggplot2::autoplot(fitt, which = 1:2, ncol = 2, label.size = F)+ theme_bw()

forecast::ggtsdisplay(fitt$residuals, lag.max=40,main="Residuals of Linear Model")

forecast::checkresiduals(fitt$residuals)

aug_anova1=aug_anova%>%mutate(index=1:dim(aug_anova)[1])


aug_anova1%>%ggplot(aes(x=index,y=APG_Uptime))+geom_point(aes(color="Observed"),show.legend=TRUE)+geom_point(aes(x=index,y=.fitted,color="Predicted"),data=aug_anova1, show.legend=TRUE)+
 labs(title = "Predicted Values vs Observed values", x = "Time",y="APG_Uptime")+
 
  theme_bw() +
# define a custom  background theme
#theme(panel.background = element_rect(fill = 'grey75'))  +
  

scale_colour_manual(name='Linear Model',values=c("Predicted"="red","Observed"='black'))+

#Make title bold and add a little space at the baseline (face, vjust)
  
theme(plot.title = element_text(size=14, face="bold", vjust=2))+
  
#change the position of the legend
  
theme(legend.position="top") + 

# Change the color of the  title of the legend (name)  
  
theme(legend.title = element_text(colour="chocolate", size=14, face="bold"))

aug_anova1%>%ggplot(aes(x=index,y=APG_Uptime))+geom_line(aes(color="Observed"),show.legend=TRUE)+geom_line(aes(x=index,y=.fitted,color="Predicted"),data=aug_anova1, show.legend=TRUE)+
 labs(title = "Predicted Values vs Observed values", x = "Time",y="APG_Uptime")+
 
  theme_bw() +
# define a custom  background theme
#theme(panel.background = element_rect(fill = 'grey75'))  +
  

scale_colour_manual(name='Linear Model',values=c("Predicted"="red","Observed"='black'))+

#Make title bold and add a little space at the baseline (face, vjust)
  
theme(plot.title = element_text(size=14, face="bold", vjust=2))+
  
#change the position of the legend
  
theme(legend.position="top") + 

# Change the color of the  title of the legend (name)  
  
theme(legend.title = element_text(colour="chocolate", size=14, face="bold"))