library(devtools)
library(rio)
library(stringr)
library(tseries)
library(lubridate)
library(tidyverse)
library(timetk)
library(readxl)
library(tidyquant)
library(scales)
library(forecast) # forecasting pkg
library(sweep)# Broom tidiers for forecast pk
library(timekit)
library(plotly)
library(broom)
library(tibble)
library(ggfortify) #similar to broom,also helps autoplot of lm objects
library(gam) #Fit general additive model
library(highcharter)
library(dygraphs)
library(DT)
require(RColorBrewer)
setwd("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/time series")
#load excel file with rio
UptimeData<- rio::import("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/Time series/Manpower/Uptimebycategories08-29-2017b.xlsx")
DT::datatable(UptimeData)
# replace white spaces in column names with _
names(UptimeData)<-stringr::str_replace_all(names(UptimeData),"\\s", "_")
UptimeData%>%head()
str(UptimeData)
## 'data.frame': 979 obs. of 8 variables:
## $ Month-Yr : chr "Jan-2017" "Jan-2017" "Jan-2017" "Jan-2017" ...
## $ Dur/EBRG : chr "Durability" "Durability" "Durability" "Durability" ...
## $ priority_category_cd: chr "P70" "P70" "P70" "P70" ...
## $ build_phase : chr "Comp Veh" "Comp Veh" "Comp Veh" "Comp Veh" ...
## $ Build_Category : chr "Comp Veh" "Comp Veh" "Comp Veh" "Comp Veh" ...
## $ Day_of_Week : chr "Sun" "Mon" "Tue" "Wed" ...
## $ vehicle_family_ref : chr "JT" "JT" "JT" "JT" ...
## $ APG_Uptime : num 0 0 0 0 0 0 0 1 1 1 ...
UptimeData=UptimeData%>%mutate(Month=stringr:: str_sub(`Month-Yr`, 1, 3))%>%mutate_if(is.character,factor)
UptimeData%>%head()
Durability=UptimeData%>%filter(`Dur/EBRG`=="Durability")%>%rename(Dur=`Dur/EBRG`)
DT::datatable(Durability)
str(Durability)
## 'data.frame': 838 obs. of 9 variables:
## $ Month-Yr : Factor w/ 8 levels "Apr-2017","Aug-2017",..: 4 4 4 4 4 4 4 3 3 3 ...
## $ Dur : Factor w/ 2 levels "Durability","EBRG-Trans": 1 1 1 1 1 1 1 1 1 1 ...
## $ priority_category_cd: Factor w/ 4 levels "P100","P50","P70",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ build_phase : Factor w/ 10 levels "Comp Veh","J1",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Build_Category : Factor w/ 5 levels "Comp Veh","J1",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Day_of_Week : Factor w/ 7 levels "Fri","Mon","Sat",..: 4 2 6 7 5 1 3 4 2 6 ...
## $ vehicle_family_ref : Factor w/ 18 levels "BU","D2","DS",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ APG_Uptime : num 0 0 0 0 0 0 0 1 1 1 ...
## $ Month : Factor w/ 8 levels "Apr","Aug","Feb",..: 4 4 4 4 4 4 4 3 3 3 ...
sum(is.na(Durability))
## [1] 7
dim(Durability)
## [1] 838 9
library(Amelia)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.4.1
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(Durability, main = "Missing values vs observed",col=c("blue","green"),y.cex=0.5)
apply(Durability,2,function(x) sum(is.na(x)))
## Month-Yr Dur priority_category_cd
## 0 0 0
## build_phase Build_Category Day_of_Week
## 0 7 0
## vehicle_family_ref APG_Uptime Month
## 0 0 0
summary(Durability)
## Month-Yr Dur priority_category_cd build_phase
## May-2017:138 Durability:838 P100:269 VP :372
## Jun-2017:121 EBRG-Trans: 0 P50 : 0 PS :113
## Feb-2017:112 P70 :487 Proto :110
## Jul-2017:101 P85 : 82 Comp Veh: 66
## Apr-2017:100 J1 : 53
## Jan-2017: 94 VP(A) : 52
## (Other) :172 (Other) : 72
## Build_Category Day_of_Week vehicle_family_ref APG_Uptime
## Comp Veh : 66 Fri:125 KL :103 Min. :0.0000
## J1 : 53 Mon:121 JL : 99 1st Qu.:0.4963
## Mule_Proto:161 Sat:115 LA-SRT : 83 Median :0.7599
## PS :113 Sun:110 JT : 66 Mean :0.6764
## VP :438 Thu:125 MP : 60 3rd Qu.:0.9727
## NA's : 7 Tue:121 RU : 49 Max. :1.0000
## Wed:121 (Other):378
## Month
## May :138
## Jun :121
## Feb :112
## Jul :101
## Apr :100
## Jan : 94
## (Other):172
smy=summary(Durability)%>%as.data.frame.array()
#smy=smy%>%replace_na(list(Dur = "",priority_category_cd="",Build_Category="",APG_Uptime=""))
#replace NA variables in table
#smy=smy%>% replace_na(list(Dur = "",priority_category_cd= ""))
smy
Durability=na.omit(Durability)
ggplot(Durability, aes(APG_Uptime, Month, colour = Build_Category)) + geom_point()
ggplot(Durability, aes(APG_Uptime, Month, colour = Build_Category)) + geom_boxplot()+coord_flip()
str(Durability)
## 'data.frame': 831 obs. of 9 variables:
## $ Month-Yr : Factor w/ 8 levels "Apr-2017","Aug-2017",..: 4 4 4 4 4 4 4 3 3 3 ...
## $ Dur : Factor w/ 2 levels "Durability","EBRG-Trans": 1 1 1 1 1 1 1 1 1 1 ...
## $ priority_category_cd: Factor w/ 4 levels "P100","P50","P70",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ build_phase : Factor w/ 10 levels "Comp Veh","J1",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Build_Category : Factor w/ 5 levels "Comp Veh","J1",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Day_of_Week : Factor w/ 7 levels "Fri","Mon","Sat",..: 4 2 6 7 5 1 3 4 2 6 ...
## $ vehicle_family_ref : Factor w/ 18 levels "BU","D2","DS",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ APG_Uptime : num 0 0 0 0 0 0 0 1 1 1 ...
## $ Month : Factor w/ 8 levels "Apr","Aug","Feb",..: 4 4 4 4 4 4 4 3 3 3 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:7] 394 395 396 397 398 399 400
## .. ..- attr(*, "names")= chr [1:7] "394" "395" "396" "397" ...
ggplot(Durability, aes(APG_Uptime, Day_of_Week, colour = Build_Category)) + geom_point()
# ggplot(Durability, aes(x=Day_of_Week, y=APG_Uptime, group=Build_Category, color=Build_Category)) +
# geom_line() +
# geom_point()+
# geom_errorbar(aes(ymin=APG_Uptime-sd, ymax=APG_Uptime+sd), width=.2,
# position=position_dodge(0.05))
ggplot(Durability, aes(APG_Uptime, Month, colour =priority_category_cd)) + geom_point()
ggplot(Durability, aes(APG_Uptime, Day_of_Week, colour =priority_category_cd)) + geom_point()
ggplot(Durability, aes(APG_Uptime, Day_of_Week, colour = build_phase)) + geom_point()
ggplot(Durability, aes(APG_Uptime, Month, colour = build_phase)) + geom_point()
ggplot(Durability, aes(APG_Uptime, Day_of_Week, colour = vehicle_family_ref)) + geom_point()
ggplot(Durability, aes(APG_Uptime, Month, colour = vehicle_family_ref)) + geom_point()
mod1 = aov(APG_Uptime ~ Month*vehicle_family_ref, data = Durability)
summary(mod1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Month 7 1.89 0.2703 5.085 1.18e-05 ***
## vehicle_family_ref 17 15.06 0.8861 16.668 < 2e-16 ***
## Month:vehicle_family_ref 68 24.09 0.3543 6.664 < 2e-16 ***
## Residuals 738 39.24 0.0532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fitt=lm(APG_Uptime~Month+vehicle_family_ref+Day_of_Week+Build_Category+build_phase+priority_category_cd+Month*vehicle_family_ref+Day_of_Week*Build_Category+build_phase+priority_category_cd*Month+Month*Day_of_Week+Month*Build_Category+Month:build_phase ,Durability)
summary(aov(fitt))%>%as.data.frame.list()
#options(dplyr.tibble.print =1e9)
#dplyr_tibble_print_original(50)
options(dplyr.print_max=100)
fitt=lm(APG_Uptime~Month+vehicle_family_ref+Day_of_Week+Build_Category+build_phase+priority_category_cd+Month*vehicle_family_ref+Day_of_Week*Build_Category+build_phase+priority_category_cd*Month+Month*Day_of_Week+Month*Build_Category+Month:build_phase ,Durability)
summary(aov(fitt))
## Df Sum Sq Mean Sq F value Pr(>F)
## Month 7 1.892 0.2703 6.183 5.13e-07 ***
## vehicle_family_ref 17 15.064 0.8861 20.267 < 2e-16 ***
## Day_of_Week 6 1.430 0.2384 5.452 1.63e-05 ***
## Build_Category 2 5.851 2.9257 66.914 < 2e-16 ***
## build_phase 4 0.818 0.2045 4.677 0.001000 ***
## priority_category_cd 2 0.461 0.2306 5.274 0.005350 **
## Month:vehicle_family_ref 67 20.528 0.3064 7.008 < 2e-16 ***
## Day_of_Week:Build_Category 24 1.033 0.0431 0.985 0.484256
## Month:priority_category_cd 12 1.522 0.1268 2.900 0.000639 ***
## Month:Day_of_Week 42 1.498 0.0357 0.816 0.791087
## Month:Build_Category 9 1.617 0.1797 4.110 3.72e-05 ***
## Month:build_phase 3 0.803 0.2676 6.120 0.000417 ***
## Residuals 635 27.764 0.0437
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
DT::datatable( summary((fitt))%>%tidy()%>%filter(p.value<0.05))
fitt=aov(APG_Uptime~Month+vehicle_family_ref+Day_of_Week+Build_Category+build_phase+priority_category_cd+Month*vehicle_family_ref+Day_of_Week*Build_Category+build_phase+priority_category_cd*Month+Month*Day_of_Week ,Durability)
summary(fitt)
## Df Sum Sq Mean Sq F value Pr(>F)
## Month 7 1.892 0.2703 5.794 1.58e-06 ***
## vehicle_family_ref 17 15.064 0.8861 18.994 < 2e-16 ***
## Day_of_Week 6 1.430 0.2384 5.110 3.85e-05 ***
## Build_Category 2 5.851 2.9257 62.712 < 2e-16 ***
## build_phase 4 0.818 0.2045 4.383 0.00167 **
## priority_category_cd 2 0.461 0.2306 4.943 0.00741 **
## Month:vehicle_family_ref 67 20.528 0.3064 6.568 < 2e-16 ***
## Day_of_Week:Build_Category 24 1.033 0.0431 0.923 0.57034
## Month:priority_category_cd 12 1.522 0.1268 2.718 0.00135 **
## Month:Day_of_Week 42 1.498 0.0357 0.765 0.85970
## Residuals 647 30.184 0.0467
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glance(fitt)
tidy(fitt)
aovdata=broom::augment(fitt)
aovdata
There is clear trend in the residuals, the Fitted values increase as the residuals decrease. This is a violation of the ANOVA assumptions of constant residual variance. The normality assumption of the data in each group is clearly violated here too from the normality plot below.
ggplot(aovdata, aes(.fitted, .resid, colour = Month)) + geom_point() +
xlab("Fitted Values") + ylab("Residuals")
autoplot(fitt,which = 1:2)
ggplot(aovdata, aes(.fitted, .resid, colour = Month)) + geom_point() +
xlab("Fitted Values") + ylab("Residuals")+facet_wrap(~vehicle_family_ref )
ggplot(aovdata, aes(.fitted, .resid, colour = Month)) + geom_point() +
xlab("Fitted Values") + ylab("Residuals")+facet_wrap(~build_phase )
TukeyHSD(fitt, which = "build_phase")%>%as.data.frame.list()%>%rename(Adjusted_P_value=build_phase.p.adj)
TukeyHSD(fitt, which = "Month")%>%as.data.frame.list()%>%rename(Adjusted_P_value=Month.p.adj)
TukeyHSD(fitt, which = "vehicle_family_ref")%>%as.data.frame.list()%>%rename(Adjusted_P_value=vehicle_family_ref.p.adj)
TukeyHSD(fitt, which = "Day_of_Week")%>%as.data.frame.list()%>%rename(Adjusted_P_value=Day_of_Week.p.adj)
#TukeyHSD(fitt, which = "Build_Category ")
#TukeyHSD(fitt, which = "priority_category_cd ")
fitt.hsd = data.frame(TukeyHSD(fitt, which = "Month")$Month)
fitt.hsd$Comparison = row.names(fitt.hsd)
fitt.hsd
ggplot(fitt.hsd, aes(Comparison, y = diff, ymin = lwr, ymax = upr)) +
geom_pointrange() + ylab("Difference in Mean APG Uptime by Month") +
coord_flip()
fitt.hsd = data.frame(TukeyHSD(fitt, which = "build_phase")$build_phase)
fitt.hsd$Comparison = row.names(fitt.hsd)
fitt.hsd
ggplot(fitt.hsd, aes(Comparison, y = diff, ymin = lwr, ymax = upr)) +
geom_pointrange() + ylab("Difference in Mean APG Uptime by build_phase") +
coord_flip()
fitt.hsd = data.frame(TukeyHSD(fitt, which = "Day_of_Week")$Day_of_Week)
fitt.hsd$Comparison = row.names(fitt.hsd)
fitt.hsd
ggplot(fitt.hsd, aes(Comparison, y = diff, ymin = lwr, ymax = upr)) +
geom_pointrange() + ylab("Difference in Mean APG Uptime by Day_of_Week") +
coord_flip()
fitt.hsd = data.frame(TukeyHSD(fitt, which = "vehicle_family_ref")$vehicle_family_ref)
fitt.hsd$Comparison = row.names(fitt.hsd)
fitt.hsd
ggplot(fitt.hsd, aes(Comparison, y = diff, ymin = lwr, ymax = upr)) +
geom_pointrange() + ylab("Difference in Mean APG Uptime by vehicle_family_ref") +
coord_flip()+ theme(axis.text.y=element_text(angle=50, size=5, vjust=0.5))
broom::tidy(fitt)
broom::augment(fitt)
broom::confint_tidy(fitt)
broom::glance(fitt)
aug_anova=broom::augment_columns(fitt,Durability)
aug_anova
ggplot2::autoplot(fitt, which = 1:2, ncol = 2, label.size = F)+ theme_bw()
forecast::ggtsdisplay(fitt$residuals, lag.max=40,main="Residuals of Linear Model")
forecast::checkresiduals(fitt$residuals)
aug_anova1=aug_anova%>%mutate(index=1:dim(aug_anova)[1])
aug_anova1%>%ggplot(aes(x=index,y=APG_Uptime))+geom_point(aes(color="Observed"),show.legend=TRUE)+geom_point(aes(x=index,y=.fitted,color="Predicted"),data=aug_anova1, show.legend=TRUE)+
labs(title = "Predicted Values vs Observed values", x = "Time",y="APG_Uptime")+
theme_bw() +
# define a custom background theme
#theme(panel.background = element_rect(fill = 'grey75')) +
scale_colour_manual(name='Linear Model',values=c("Predicted"="red","Observed"='black'))+
#Make title bold and add a little space at the baseline (face, vjust)
theme(plot.title = element_text(size=14, face="bold", vjust=2))+
#change the position of the legend
theme(legend.position="top") +
# Change the color of the title of the legend (name)
theme(legend.title = element_text(colour="chocolate", size=14, face="bold"))
aug_anova1%>%ggplot(aes(x=index,y=APG_Uptime))+geom_line(aes(color="Observed"),show.legend=TRUE)+geom_line(aes(x=index,y=.fitted,color="Predicted"),data=aug_anova1, show.legend=TRUE)+
labs(title = "Predicted Values vs Observed values", x = "Time",y="APG_Uptime")+
theme_bw() +
# define a custom background theme
#theme(panel.background = element_rect(fill = 'grey75')) +
scale_colour_manual(name='Linear Model',values=c("Predicted"="red","Observed"='black'))+
#Make title bold and add a little space at the baseline (face, vjust)
theme(plot.title = element_text(size=14, face="bold", vjust=2))+
#change the position of the legend
theme(legend.position="top") +
# Change the color of the title of the legend (name)
theme(legend.title = element_text(colour="chocolate", size=14, face="bold"))