# Drop every object currently defined in the workspace before the analysis.
# NOTE(review): this only removes objects; attached packages and options()
# are left untouched, so it is not equivalent to starting a fresh R session.
rm(list = ls())
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.2 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##dat <- read.table("/Users/drlakeshialegettejones/Downloads/American_Airline_passengers1.csv", sep=",", header = TRUE)
# Load the passenger counts (one row per year: the year followed by twelve
# monthly values stored as comma-formatted strings such as "50,075").
#
# The file has no header line - its first record is the 2003 row. read.csv()
# defaults to header = TRUE, which turns that record into column names
# (X2003, X55.317, ...) and silently drops the 2003 observations, so the
# header must be disabled explicitly. Columns are then named V1..V13.
#
# NOTE: any printed output that follows was captured before this fix and
# therefore starts at 2004; re-run the script to refresh it.
dat <- read.csv(
  "/Users/drlakeshialegettejones/Library/Mobile Documents/com~apple~CloudDocs/Desktop/ALL FILES!/Documents/All Class Info/Data Mining with R/American_Airline_passengers1.csv",
  header = FALSE
)
dat
## X2003 X55.317 X53.216 X62.656 X60.998 X66.503 X71.537 X72.092 X64.000
## 1 2004 50,075 47,066 57,356 57,307 59,912 59,962 58,393 51,419
## 2 2005 46,815 47,022 58,225 56,697 60,327 69,660 71,360 61,817
## 3 2006 57,655 56,441 69,593 74,414 78,972 79,307 76,982 65,043
## 4 2007 60,206 57,392 71,184 69,967 73,728 76,072 77,005 74,170
## 5 2008 57,092 59,175 69,906 63,995 68,133 68,780 69,416 61,618
## 6 2009 44,906 44,844 57,072 61,197 61,885 67,300 64,983 55,009
## 7 2010 47,152 43,117 56,901 45,132 45,812 42,620 46,368 40,127
## 8 2011 35,995 34,060 39,901 42,252 44,746 49,199 50,551 41,232
## 9 2012 39,279 34,833 45,290 36,496 41,541 40,313 39,204 34,557
## 10 2013 34,681 33,428 40,483 36,335 41,227 42,449 41,420 35,129
## 11 2014 36,528 37,977 45,699 45,052 51,104 50,156 49,595 43,880
## 12 2015 40,162 34,902 50,790 52,245 54,770 59,954 142,974 128,972
## 13 2016 105,835 87,860 98,310 104,517 144,985 127,941 127,795 112,763
## 14 2017 106,483 105,617 131,823 117,326 118,259 109,929 106,172 104,600
## 15 2018 93,438 93,955 107,938 108,555 115,985 107,033 111,286 104,715
## 16 2019 101,152 96,751 115,240 101,428 131,641 122,356 125,278 120,583
## 17 2020 91,762 90,663 53,051 7,639 15,491 22,323 49,336 56,352
## 18 2021 35,739 31,849 52,470 72,796 74,684 83,885 93,492 75,855
## 19 2022 65,738 71,531 97,970 77,766 79,095 72,210 71,413 69,378
## 20 2023 69,554 79,434 92,409 87,497 103,963 93,188 92,640 80,719
## X55.447 X62.865 X50.931 X56.330
## 1 47,245 56,010 53,956 60,175
## 2 62,012 66,786 67,391 70,801
## 3 58,299 65,844 64,960 68,272
## 4 64,897 71,648 67,568 66,547
## 5 52,321 59,440 47,032 54,271
## 6 48,954 55,022 47,385 51,512
## 7 38,854 46,181 40,992 45,964
## 8 40,082 46,420 43,833 47,597
## 9 29,218 37,039 36,367 41,561
## 10 31,665 42,063 36,082 40,889
## 11 39,408 41,631 39,628 42,697
## 12 121,215 129,563 122,524 124,347
## 13 118,802 131,622 115,485 119,060
## 14 95,397 108,102 99,820 98,632
## 15 94,300 109,809 110,490 112,362
## 16 115,706 128,987 115,988 123,632
## 17 52,696 51,624 42,552 43,657
## 18 73,688 84,803 97,655 96,764
## 19 70,797 78,478 66,241 70,375
## 20 79,846 93,968 94,529 81,037
# Remove the thousands separators and convert every monthly column to numeric.
#
# The columns read on the right-hand side must be the same twelve columns
# (2:13) that are assigned on the left. Reading only columns 2:3 makes R
# recycle those two converted columns across all twelve targets, so every
# later month is overwritten with a copy of the first or second month.
#
# NOTE: the table printed after this statement was captured before the fix
# and still shows the recycled values; re-run the script to refresh it.
dat[, 2:13] <- lapply(dat[, 2:13], function(x) {
  as.numeric(gsub(",", "", x, fixed = TRUE))
})
dat
## X2003 X55.317 X53.216 X62.656 X60.998 X66.503 X71.537 X72.092 X64.000
## 1 2004 50075 47066 50075 47066 50075 47066 50075 47066
## 2 2005 46815 47022 46815 47022 46815 47022 46815 47022
## 3 2006 57655 56441 57655 56441 57655 56441 57655 56441
## 4 2007 60206 57392 60206 57392 60206 57392 60206 57392
## 5 2008 57092 59175 57092 59175 57092 59175 57092 59175
## 6 2009 44906 44844 44906 44844 44906 44844 44906 44844
## 7 2010 47152 43117 47152 43117 47152 43117 47152 43117
## 8 2011 35995 34060 35995 34060 35995 34060 35995 34060
## 9 2012 39279 34833 39279 34833 39279 34833 39279 34833
## 10 2013 34681 33428 34681 33428 34681 33428 34681 33428
## 11 2014 36528 37977 36528 37977 36528 37977 36528 37977
## 12 2015 40162 34902 40162 34902 40162 34902 40162 34902
## 13 2016 105835 87860 105835 87860 105835 87860 105835 87860
## 14 2017 106483 105617 106483 105617 106483 105617 106483 105617
## 15 2018 93438 93955 93438 93955 93438 93955 93438 93955
## 16 2019 101152 96751 101152 96751 101152 96751 101152 96751
## 17 2020 91762 90663 91762 90663 91762 90663 91762 90663
## 18 2021 35739 31849 35739 31849 35739 31849 35739 31849
## 19 2022 65738 71531 65738 71531 65738 71531 65738 71531
## 20 2023 69554 79434 69554 79434 69554 79434 69554 79434
## X55.447 X62.865 X50.931 X56.330
## 1 50075 47066 50075 47066
## 2 46815 47022 46815 47022
## 3 57655 56441 57655 56441
## 4 60206 57392 60206 57392
## 5 57092 59175 57092 59175
## 6 44906 44844 44906 44844
## 7 47152 43117 47152 43117
## 8 35995 34060 35995 34060
## 9 39279 34833 39279 34833
## 10 34681 33428 34681 33428
## 11 36528 37977 36528 37977
## 12 40162 34902 40162 34902
## 13 105835 87860 105835 87860
## 14 106483 105617 106483 105617
## 15 93438 93955 93438 93955
## 16 101152 96751 101152 96751
## 17 91762 90663 91762 90663
## 18 35739 31849 35739 31849
## 19 65738 71531 65738 71531
## 20 69554 79434 69554 79434
#
# R example on creating a time series object (kept for reference):
# gnp <- ts(cumsum(1 + round(rnorm(100), 2)),
#           start = c(1954, 7), frequency = 12)
# Open the documentation page for ts().
help("ts")
# Transpose the table: afterwards each column is one year, row 1 holds the
# year label and the remaining rows hold that year's monthly values.
temp_dat <- t(as.matrix(dat))
print(temp_dat)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## X2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
## X55.317 50075 46815 57655 60206 57092 44906 47152 35995 39279 34681 36528 40162
## X53.216 47066 47022 56441 57392 59175 44844 43117 34060 34833 33428 37977 34902
## X62.656 50075 46815 57655 60206 57092 44906 47152 35995 39279 34681 36528 40162
## X60.998 47066 47022 56441 57392 59175 44844 43117 34060 34833 33428 37977 34902
## X66.503 50075 46815 57655 60206 57092 44906 47152 35995 39279 34681 36528 40162
## X71.537 47066 47022 56441 57392 59175 44844 43117 34060 34833 33428 37977 34902
## X72.092 50075 46815 57655 60206 57092 44906 47152 35995 39279 34681 36528 40162
## X64.000 47066 47022 56441 57392 59175 44844 43117 34060 34833 33428 37977 34902
## X55.447 50075 46815 57655 60206 57092 44906 47152 35995 39279 34681 36528 40162
## X62.865 47066 47022 56441 57392 59175 44844 43117 34060 34833 33428 37977 34902
## X50.931 50075 46815 57655 60206 57092 44906 47152 35995 39279 34681 36528 40162
## X56.330 47066 47022 56441 57392 59175 44844 43117 34060 34833 33428 37977 34902
## [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## X2003 2016 2017 2018 2019 2020 2021 2022 2023
## X55.317 105835 106483 93438 101152 91762 35739 65738 69554
## X53.216 87860 105617 93955 96751 90663 31849 71531 79434
## X62.656 105835 106483 93438 101152 91762 35739 65738 69554
## X60.998 87860 105617 93955 96751 90663 31849 71531 79434
## X66.503 105835 106483 93438 101152 91762 35739 65738 69554
## X71.537 87860 105617 93955 96751 90663 31849 71531 79434
## X72.092 105835 106483 93438 101152 91762 35739 65738 69554
## X64.000 87860 105617 93955 96751 90663 31849 71531 79434
## X55.447 105835 106483 93438 101152 91762 35739 65738 69554
## X62.865 87860 105617 93955 96751 90663 31849 71531 79434
## X50.931 105835 106483 93438 101152 91762 35739 65738 69554
## X56.330 87860 105617 93955 96751 90663 31849 71531 79434
# Flatten the monthly values into a single chronological vector.
# In temp_dat each column is one year (row 1 = year label, other rows = the
# monthly values). as.vector() walks a matrix column by column, so dropping
# row 1 yields the months of the first year, then the second year, and so on.
# Every year column is used instead of a hard-coded 1:20, so no year is
# dropped if the table has a different number of rows.
dat_val <- as.vector(temp_dat[-1, , drop = FALSE])

# Create the monthly time series object.
# The start year is taken from the data (first year label) rather than being
# hard-coded: a fixed start of 2003 mislabels the series by one year whenever
# the first row of `dat` is not 2003.
#
# NOTE: the series printed after this statement was captured before the fix;
# re-run the script to refresh it.
first_year <- as.numeric(temp_dat[1, 1])
dat_ts <- ts(dat_val, start = c(first_year, 1), frequency = 12)
dat_ts
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct
## 2003 50075 47066 50075 47066 50075 47066 50075 47066 50075 47066
## 2004 46815 47022 46815 47022 46815 47022 46815 47022 46815 47022
## 2005 57655 56441 57655 56441 57655 56441 57655 56441 57655 56441
## 2006 60206 57392 60206 57392 60206 57392 60206 57392 60206 57392
## 2007 57092 59175 57092 59175 57092 59175 57092 59175 57092 59175
## 2008 44906 44844 44906 44844 44906 44844 44906 44844 44906 44844
## 2009 47152 43117 47152 43117 47152 43117 47152 43117 47152 43117
## 2010 35995 34060 35995 34060 35995 34060 35995 34060 35995 34060
## 2011 39279 34833 39279 34833 39279 34833 39279 34833 39279 34833
## 2012 34681 33428 34681 33428 34681 33428 34681 33428 34681 33428
## 2013 36528 37977 36528 37977 36528 37977 36528 37977 36528 37977
## 2014 40162 34902 40162 34902 40162 34902 40162 34902 40162 34902
## 2015 105835 87860 105835 87860 105835 87860 105835 87860 105835 87860
## 2016 106483 105617 106483 105617 106483 105617 106483 105617 106483 105617
## 2017 93438 93955 93438 93955 93438 93955 93438 93955 93438 93955
## 2018 101152 96751 101152 96751 101152 96751 101152 96751 101152 96751
## 2019 91762 90663 91762 90663 91762 90663 91762 90663 91762 90663
## 2020 35739 31849 35739 31849 35739 31849 35739 31849 35739 31849
## 2021 65738 71531 65738 71531 65738 71531 65738 71531 65738 71531
## 2022 69554 79434 69554 79434 69554 79434 69554 79434 69554 79434
## Nov Dec
## 2003 50075 47066
## 2004 46815 47022
## 2005 57655 56441
## 2006 60206 57392
## 2007 57092 59175
## 2008 44906 44844
## 2009 47152 43117
## 2010 35995 34060
## 2011 39279 34833
## 2012 34681 33428
## 2013 36528 37977
## 2014 40162 34902
## 2015 105835 87860
## 2016 106483 105617
## 2017 93438 93955
## 2018 101152 96751
## 2019 91762 90663
## 2020 35739 31849
## 2021 65738 71531
## 2022 69554 79434
# Show the class of the series before conversion.
print(class(dat_ts))
## [1] "ts"
# Convert the ts object to an xts object, then show its class and contents.
ap <- xts::as.xts(dat_ts)
print(class(ap))
## [1] "xts" "zoo"
print(ap)
## m.c.seq.row..seq.n...seq.col..drop...FALSE.
## Jan 2003 50075
## Feb 2003 47066
## Mar 2003 50075
## Apr 2003 47066
## May 2003 50075
## Jun 2003 47066
## Jul 2003 50075
## Aug 2003 47066
## Sep 2003 50075
## Oct 2003 47066
## ...
## Mar 2022 69554
## Apr 2022 79434
## May 2022 69554
## Jun 2022 79434
## Jul 2022 69554
## Aug 2022 79434
## Sep 2022 69554
## Oct 2022 79434
## Nov 2022 69554
## Dec 2022 79434
# Month-over-month relative change: (x_t - x_{t-1}) / x_{t-1}.
#
# Arithmetic on xts objects aligns both operands on their time index, so
# dividing by ap[-length(ap)] does not shift anything: each difference ends
# up divided by the value of the *same* month, i.e. (x_t - x_{t-1}) / x_t.
# Lag the series explicitly so the denominator is the previous observation.
# stats::lag() is spelled out because dplyr::lag() masks lag() in this
# session (see the xts start-up warning); it dispatches to the xts method.
#
# NOTE: the values printed after this statement were captured before the
# fix; re-run the script to refresh them.
apRel <- diff(ap) / stats::lag(ap, k = 1)
head(apRel)
## e1
## Jan 2003 NA
## Feb 2003 -0.06393150
## Mar 2003 0.06008987
## Apr 2003 -0.06393150
## May 2003 0.06008987
## Jun 2003 -0.06393150
# Preview the 4-column embedding of the series: each row holds a value
# followed by its three predecessors.
head(embed(ap, dimension = 4))
## [,1] [,2] [,3] [,4]
## [1,] 47066 50075 47066 50075
## [2,] 50075 47066 50075 47066
## [3,] 47066 50075 47066 50075
## [4,] 50075 47066 50075 47066
## [5,] 47066 50075 47066 50075
## [6,] 50075 47066 50075 47066
# Build a lagged ("embedded") data set from a uni-variate time series.
#
# Arguments:
#   s    the series; a plain vector, a one-column matrix or an xts object.
#   emb  embedding dimension (number of columns of the result); must be at
#        least 2 and no larger than length(s).
#
# Value:
#   A matrix with columns "T", "T_1", ..., "T_<emb-1>" as produced by
#   embed(): column "T" is the current value and "T_k" the value k steps
#   earlier. When `s` is an xts object the result is an xts object indexed
#   by the time stamps of `s` from position `emb` onwards.
#
# Errors:
#   Stops when `s` has more than one column or when `emb` is out of range.
createEmbedDS <- function(s, emb = 4) {
  shape <- dim(s)
  if (!is.null(shape) && shape[2] > 1) {
    stop("Only applicable to uni-variate time series")
  }
  if (emb < 2 || emb > length(s)) {
    stop("Invalid embed size")
  }

  lagged <- embed(s, emb)
  colnames(lagged) <- c("T", paste0("T_", 1:(emb - 1)))

  # Plain input: hand back the bare matrix.
  if (!is.xts(s)) {
    return(lagged)
  }

  # xts input: the first emb - 1 observations have no complete history, so
  # the rows are stamped with the index starting at position `emb`.
  stamps <- index(s)[emb:length(s)]
  return(xts(lagged, stamps))
}
# Build the modelling data set: the current value plus its four predecessors.
dataSet <- createEmbedDS(s = ap, emb = 5)
print(head(dataSet))
## T T_1 T_2 T_3 T_4
## May 2003 50075 47066 50075 47066 50075
## Jun 2003 47066 50075 47066 50075 47066
## Jul 2003 50075 47066 50075 47066 50075
## Aug 2003 47066 50075 47066 50075 47066
## Sep 2003 50075 47066 50075 47066 50075
## Oct 2003 47066 50075 47066 50075 47066