library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Load the project data set directly from GitHub.
# stringsAsFactors is set explicitly: R >= 4.0.0 reads text columns as
# character by default, while the per-level counts shown for `group` in the
# summary output require `group` to be a factor.
df <- read.csv(
  "https://raw.githubusercontent.com/davidblumenstiel/data/master/Data614Project1Dataset.csv",
  stringsAsFactors = TRUE
)
# Give the first column a fixed name, whatever header text was read in.
colnames(df)[1] <- "SeriesIndex"
# Per-column overview: ranges, quartiles, group sizes and NA counts.
summary(df)
## SeriesIndex group Var01 Var02
## Min. :40669 S01:1762 Min. : 9.03 Min. : 1339900
## 1st Qu.:41303 S02:1762 1st Qu.: 23.10 1st Qu.: 12520675
## Median :41946 S03:1762 Median : 38.44 Median : 21086550
## Mean :41945 S04:1762 Mean : 46.98 Mean : 37035741
## 3rd Qu.:42587 S05:1762 3rd Qu.: 66.78 3rd Qu.: 42486700
## Max. :43221 S06:1762 Max. :195.18 Max. :480879500
## NA's :854 NA's :842
## Var03 Var05 Var07
## Min. : 8.82 Min. : 8.99 Min. : 8.92
## 1st Qu.: 22.59 1st Qu.: 22.91 1st Qu.: 22.88
## Median : 37.66 Median : 38.05 Median : 38.05
## Mean : 46.12 Mean : 46.55 Mean : 46.56
## 3rd Qu.: 65.88 3rd Qu.: 66.38 3rd Qu.: 66.31
## Max. :189.36 Max. :195.00 Max. :189.72
## NA's :866 NA's :866 NA's :866
# Count the missing values of each variable separately for every group.
count_na <- function(x) sum(is.na(x))
na_by_group <- summarise(
  group_by(df, group),
  var1_NAsum = count_na(Var01),
  var2_NAsum = count_na(Var02),
  var3_NAsum = count_na(Var03),
  var5_NAsum = count_na(Var05),
  var7_NAsum = count_na(Var07)
)
# Display the resulting one-row-per-group table.
na_by_group
## # A tibble: 6 x 6
## group var1_NAsum var2_NAsum var3_NAsum var5_NAsum var7_NAsum
## <fct> <int> <int> <int> <int> <int>
## 1 S01 142 140 144 144 144
## 2 S02 142 140 144 144 144
## 3 S03 142 140 144 144 144
## 4 S04 142 140 144 144 144
## 5 S05 143 141 145 145 145
## 6 S06 143 141 145 145 145
Missing data is, for the most part, concentrated in the same observations across variables. Group has a negligible effect on the number of missing values. Coincidentally, there are roughly 140 missing observations per variable in each group, which is approximately the number of periods we are supposed to forecast.
Separating the data by group and creating time series
# Build one two-column time series per group.
# make_group_ts() keeps the rows of `data` whose `group` equals `group_id`
# (rows where the comparison is NA are dropped), keeps only the columns named
# in `vars`, and converts the result to a ts object.
make_group_ts <- function(data, group_id, vars) {
  in_group <- which(data[["group"]] == group_id)
  ts(data[in_group, vars])
}

# Each group is paired with the two variables that are kept for it.
s01 <- make_group_ts(df, "S01", c("Var01", "Var02"))
s02 <- make_group_ts(df, "S02", c("Var02", "Var03"))
s03 <- make_group_ts(df, "S03", c("Var05", "Var07"))
s04 <- make_group_ts(df, "S04", c("Var01", "Var02"))
s05 <- make_group_ts(df, "S05", c("Var02", "Var03"))
s06 <- make_group_ts(df, "S06", c("Var05", "Var07"))

# Positional list of the six series, in group order S01..S06.
series <- list(s01, s02, s03, s04, s05, s06)
# Plot every series in `series`: one plot per group and variable, twelve in
# total. series[[k]] is the k-th group (S01..S06); column 1 and column 2 are
# the two variables kept for that group, in the order they were selected when
# the series was built. Each title names the group and the variable plotted.
# NOTE(review): these calls are deliberately left written out. autoplot()
# appears to take its default y-axis label from the expression passed to it,
# and top-level calls are printed automatically while calls inside a loop are
# not -- confirm both before collapsing this into a loop.
autoplot(series[[1]][,1], main = "s01, var 1")
autoplot(series[[1]][,2], main = "s01, var 2")
autoplot(series[[2]][,1], main = "s02, var 2")
autoplot(series[[2]][,2], main = "s02, var 3")
autoplot(series[[3]][,1], main = "s03, var 5")
autoplot(series[[3]][,2], main = "s03, var 7")
autoplot(series[[4]][,1], main = "s04, var 1")
autoplot(series[[4]][,2], main = "s04, var 2")
autoplot(series[[5]][,1], main = "s05, var 2")
autoplot(series[[5]][,2], main = "s05, var 3")
autoplot(series[[6]][,1], main = "s06, var 5")
autoplot(series[[6]][,2], main = "s06, var 7")