library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.5.3

## -- Attaching packages -------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.5.3

## Warning: package 'tibble' was built under R version 3.5.3

## Warning: package 'tidyr' was built under R version 3.5.3

## Warning: package 'readr' was built under R version 3.5.2

## Warning: package 'purrr' was built under R version 3.5.3

## Warning: package 'dplyr' was built under R version 3.5.3

## Warning: package 'stringr' was built under R version 3.5.3

## Warning: package 'forcats' was built under R version 3.5.3

## -- Conflicts ----------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(kableExtra)

## Warning: package 'kableExtra' was built under R version 3.5.3

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

library(forecast)

## Warning: package 'forecast' was built under R version 3.5.3

# Preview the first ten rows of the raw price table
data_pro %>% head(n = 10)

## # A tibble: 10 x 10
##       X1 begins_at           open_price close_price high_price low_price volume
##    <dbl> <dttm>                   <dbl>       <dbl>      <dbl>     <dbl>  <dbl>
##  1     0 2015-04-28 00:00:00       24.9        24.9       24.9      24.9      0
##  2     1 2015-04-29 00:00:00       24.9        24.9       24.9      24.9      0
##  3     2 2015-04-30 00:00:00       24.9        24.9       24.9      24.9      0
##  4     3 2015-05-01 00:00:00       24.9        24.9       24.9      24.9      0
##  5     4 2015-05-04 00:00:00       24.9        24.9       24.9      24.9      0
##  6     5 2015-05-05 00:00:00       24.9        24.9       24.9      24.9      0
##  7     6 2015-05-06 00:00:00       24.9        24.9       24.9      24.9      0
##  8     7 2015-05-07 00:00:00       24.9        24.9       24.9      24.9      0
##  9     8 2015-05-08 00:00:00       24.9        24.9       24.9      24.9      0
## 10     9 2015-05-11 00:00:00       24.9        24.9       24.9      24.9      0
## # ... with 3 more variables: session <chr>, interpolated <lgl>, sname <chr>

# Three-letter English month labels, indexed by month number (1 = "Jan").
# Base R already ships this exact vector as the constant `month.abb`, so it is
# reused here instead of being retyped by hand.
months.abb <- month.abb

# Preview the sector / ticker code / company name lookup table
data_indus %>% head()

## # A tibble: 6 x 3
##   Sector                 CODE  NAME                       
##   <chr>                  <chr> <chr>                      
## 1 Basic Materials Sector GPRE  Green Plains Inc           
## 2 Basic Materials Sector BCPC  Balchem Corporation        
## 3 Basic Materials Sector STLD  Steel Dynamics, Inc        
## 4 Basic Materials Sector METC  Ramaco Resources, Inc      
## 5 Basic Materials Sector ASH   Ashland Global Holdings Inc
## 6 Basic Materials Sector TRX   Tanzanian Gold Corporation

# Attach each company's mean opening price to the sector table.
# The earlier call, mean(data_pro[]), took the mean of an entire data frame,
# which is not numeric, so every row came back as NA with a warning.
# Tickers in data_indus that have no rows in data_pro get NA.
# NOTE: the printed output below predates this fix; re-knit to refresh it.
data_indus %>%
  left_join(
    data_pro %>%
      group_by(sname) %>%
      summarise(avg_price = mean(open_price)),
    by = c("CODE" = "sname")
  )

## Warning in mean.default(data_pro[]): argument is not numeric or logical:
## returning NA

## # A tibble: 5,195 x 4
##    Sector                 CODE  NAME                         avg_price
##    <chr>                  <chr> <chr>                            <dbl>
##  1 Basic Materials Sector GPRE  Green Plains Inc                    NA
##  2 Basic Materials Sector BCPC  Balchem Corporation                 NA
##  3 Basic Materials Sector STLD  Steel Dynamics, Inc                 NA
##  4 Basic Materials Sector METC  Ramaco Resources, Inc               NA
##  5 Basic Materials Sector ASH   Ashland Global Holdings Inc         NA
##  6 Basic Materials Sector TRX   Tanzanian Gold Corporation          NA
##  7 Basic Materials Sector HCC   Warrior Met Coal, Inc               NA
##  8 Basic Materials Sector AUG   Auryn Resources Inc                 NA
##  9 Basic Materials Sector RFP   Resolute Forest Products Inc        NA
## 10 Basic Materials Sector EGO   Eldorado Gold Corporation           NA
## # ... with 5,185 more rows

# Mean opening price per ticker over the whole sample.
# Grouping straight on data_pro gives the same two columns (sname, Avg_price)
# as selecting them first.
data_price <- data_pro %>%
  group_by(sname) %>%
  summarise(Avg_price = mean(open_price))

# Mean opening price per ticker within each calendar year
data_price_year <- data_pro %>%
  group_by(Year = year(begins_at), sname) %>%
  summarise(Avg_price = mean(open_price))
# NOTE(review): this previews data_price, not the yearly table built just
# above -- confirm whether head(data_price_year) was intended.
head(data_price)

## # A tibble: 6 x 2
##   sname Avg_price
##   <chr>     <dbl>
## 1 AA         30.5
## 2 AAN        40.2
## 3 AAP       146. 
## 4 AAT        40.6
## 5 AB         26.2
## 6 ABB        21.5

# Mean opening price per ticker for every year / month combination.
# Month is stored as a three-letter label looked up in months.abb.
data_price_month <- data_pro %>%
  group_by(
    Year = year(begins_at),
    Month = months.abb[month(begins_at)],
    sname
  ) %>%
  summarise(Avg_price = mean(open_price))
# NOTE(review): this previews data_price, not the monthly table built just
# above -- confirm whether head(data_price_month) was intended.
head(data_price)

## # A tibble: 6 x 2
##   sname Avg_price
##   <chr>     <dbl>
## 1 AA         30.5
## 2 AAN        40.2
## 3 AAP       146. 
## 4 AAT        40.6
## 5 AB         26.2
## 6 ABB        21.5

# Yearly average opening price for the ticker AA only
data_price_year %>% filter(sname == "AA")

## # A tibble: 6 x 3
## # Groups:   Year [6]
##    Year sname Avg_price
##   <dbl> <chr>     <dbl>
## 1  2015 AA         24.9
## 2  2016 AA         25.4
## 3  2017 AA         38.3
## 4  2018 AA         44.3
## 5  2019 AA         23.7
## 6  2020 AA         12.4

# Column names, types and leading values of the raw price table
data_pro %>% glimpse()

## Observations: 370,440
## Variables: 10
## $ X1           <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ begins_at    <dttm> 2015-04-28, 2015-04-29, 2015-04-30, 2015-05-01, 2015-...
## $ open_price   <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, ...
## $ close_price  <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, ...
## $ high_price   <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, ...
## $ low_price    <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, ...
## $ volume       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ session      <chr> "reg", "reg", "reg", "reg", "reg", "reg", "reg", "reg"...
## $ interpolated <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, ...
## $ sname        <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", ...

# Per-column summary statistics of the raw price table
data_pro %>% summary()

##        X1           begins_at                     open_price     
##  Min.   :   0.0   Min.   :2015-04-28 00:00:00   Min.   :   0.00  
##  1st Qu.: 314.8   1st Qu.:2016-07-26 18:00:00   1st Qu.:  13.61  
##  Median : 629.5   Median :2017-10-24 12:00:00   Median :  26.33  
##  Mean   : 629.5   Mean   :2017-10-25 14:51:25   Mean   :  48.92  
##  3rd Qu.: 944.2   3rd Qu.:2019-01-25 18:00:00   3rd Qu.:  56.09  
##  Max.   :1259.0   Max.   :2020-04-27 00:00:00   Max.   :1266.56  
##   close_price          high_price        low_price          volume         
##  Min.   :   0.0052   Min.   :   0.00   Min.   :   0.0   Min.   :        0  
##  1st Qu.:  13.6100   1st Qu.:  13.72   1st Qu.:  13.5   1st Qu.:   103282  
##  Median :  26.3300   Median :  26.67   Median :  26.0   Median :   421838  
##  Mean   :  48.9221   Mean   :  49.43   Mean   :  48.4   Mean   :  1650980  
##  3rd Qu.:  56.1200   3rd Qu.:  56.68   3rd Qu.:  55.5   3rd Qu.:  1332573  
##  Max.   :1250.0000   Max.   :1274.41   Max.   :1232.0   Max.   :375088650  
##    session          interpolated       sname          
##  Length:370440      Mode :logical   Length:370440     
##  Class :character   FALSE:362193    Class :character  
##  Mode  :character   TRUE :8247      Mode  :character  
##                                                       
##                                                       
##

# Same per-column summary, rendered as a styled kable table
kable_styling(kable(summary(data_pro)))

X1	begins_at	open_price	close_price	high_price	low_price	volume	session	interpolated	sname
Min. : 0.0	Min. :2015-04-28 00:00:00	Min. : 0.00	Min. : 0.0052	Min. : 0.00	Min. : 0.0	Min. : 0	Length:370440	Mode :logical	Length:370440
1st Qu.: 314.8	1st Qu.:2016-07-26 18:00:00	1st Qu.: 13.61	1st Qu.: 13.6100	1st Qu.: 13.72	1st Qu.: 13.5	1st Qu.: 103282	Class :character	FALSE:362193	Class :character
Median : 629.5	Median :2017-10-24 12:00:00	Median : 26.33	Median : 26.3300	Median : 26.67	Median : 26.0	Median : 421838	Mode :character	TRUE :8247	Mode :character
Mean : 629.5	Mean :2017-10-25 14:51:25	Mean : 48.92	Mean : 48.9221	Mean : 49.43	Mean : 48.4	Mean : 1650980	NA	NA	NA
3rd Qu.: 944.2	3rd Qu.:2019-01-25 18:00:00	3rd Qu.: 56.09	3rd Qu.: 56.1200	3rd Qu.: 56.68	3rd Qu.: 55.5	3rd Qu.: 1332573	NA	NA	NA
Max. :1259.0	Max. :2020-04-27 00:00:00	Max. :1266.56	Max. :1250.0000	Max. :1274.41	Max. :1232.0	Max. :375088650	NA	NA	NA

# Get quantmod: install it if it is missing, then attach it.
# requireNamespace() only checks that the package is available; library() does
# the attaching and stops with an error if the install did not succeed,
# whereas require() would just return FALSE and let the script carry on.
if (!requireNamespace("quantmod", quietly = TRUE)) {
  install.packages("quantmod")
}
library(quantmod)

## Loading required package: quantmod

## Warning: package 'quantmod' was built under R version 3.5.3

## Loading required package: xts

## Warning: package 'xts' was built under R version 3.5.3

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 3.5.3

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## Loading required package: TTR

## Warning: package 'TTR' was built under R version 3.5.3

## Version 0.4-0 included new data defaults. See ?getSymbols.

Number of zero values in each column

# Count exact zeros in every numeric column and express each count as a share
# of all rows in data_pro.
# Only the numeric columns are reshaped: stacking the date, logical and
# character columns together with the numeric ones coerces every value to
# character and raises an "attributes are not identical" warning.
# NOTE: that warning in the printed output below predates this change.
data_pro[vapply(data_pro, is.numeric, logical(1))] %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  filter(value == 0) %>%
  group_by(variable) %>%
  tally() %>%
  mutate(percent = n / nrow(data_pro) * 100) %>%
  # one decimal place below 10%, whole percentages otherwise
  mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
  arrange(desc(n)) %>%
#  rename("Variable With Zeros"=variable,"Number of Records"=n,"Share of Total"=percent) %>%
  kable() %>%
  kable_styling()

## Warning: attributes are not identical across measure variables;
## they will be dropped

variable	n	percent
volume	8625	2.3%
X1	294	0.1%
high_price	1	0%
low_price	1	0%
open_price	1	0%

 # We will not drop these now, but we will review them later. We are not using volume in our analysis at this time, so we will drop those data points later.

Let us target only the shares whose opening price is between 100 and 200

# List every distinct ticker symbol present in the price data
unique(data_pro[["sname"]])

##   [1] "AA"    "AAN"   "AAP"   "AAT"   "AB"    "ABB"   "ABBV"  "ABC"   "ABEV" 
##  [10] "ABG"   "ABM"   "ABR"   "ABT"   "ACC"   "ACCO"  "ACH"   "ACM"   "ACN"  
##  [19] "ACP"   "ACRE"  "ACT"   "ADC"   "ADM"   "ADPT"  "ADS"   "ADT"   "ADX"  
##  [28] "AEB"   "AEE"   "AEG"   "AEL"   "AEM"   "AEO"   "AEP"   "AER"   "AES"  
##  [37] "AFC"   "AFB"   "AFG"   "AFL"   "AFSIA" "AFSIB" "AFSIC" "AFT"   "AG"   
##  [46] "AGCO"  "AGD"   "AGI"   "AGM"   "AGN"   "AGO"   "AGRO"  "AGX"   "AHC"  
##  [55] "AHH"   "AHT"   "AI"    "AIF"   "AIG"   "AIN"   "AIR"   "AIT"   "AIV"  
##  [64] "AIW"   "AIZ"   "AJG"   "AKR"   "AL"    "ALB"   "ALE"   "ALEX"  "ALG"  
##  [73] "ALK"   "ALL"   "ALLE"  "ALLY"  "ALPN"  "ALSN"  "ALV"   "ALX"   "AM"   
##  [82] "AMC"   "AME"   "AMG"   "AMH"   "AMHC"  "AMP"   "AMRC"  "AMT"   "AMTD" 
##  [91] "AMX"   "AN"    "ANET"  "ANF"   "ANH"   "ANTM"  "AOD"   "AON"   "AOS"  
## [100] "AP"    "APA"   "APAM"  "APD"   "APH"   "APLE"  "APO"   "AR"    "ARC"  
## [109] "ARCO"  "ARDC"  "ARE"   "ARES"  "ARI"   "ARL"   "ARMK"  "ARR"   "ARW"  
## [118] "ASA"   "ASB"   "ASC"   "ASG"   "ASGN"  "ASH"   "ASPN"  "ASR"   "ASX"  
## [127] "AT"    "ATEN"  "ATHM"  "ATI"   "ATLS"  "ATO"   "ATR"   "ATTO"  "ATV"  
## [136] "AU"    "AUY"   "AVA"   "AVAL"  "AVB"   "AVD"   "AVH"   "AVK"   "AVT"  
## [145] "AVY"   "AWF"   "AWI"   "AWK"   "AWP"   "AWR"   "AXE"   "AXL"   "AXP"  
## [154] "AXR"   "AXS"   "AXTA"  "AYI"   "AZN"   "AZO"   "AZZ"   "B"     "BA"   
## [163] "BABA"  "BAC"   "BAF"   "BAH"   "BAK"   "BAM"   "BANC"  "BAP"   "BAX"  
## [172] "BBD"   "BBDO"  "BBF"   "BBK"   "BBL"   "BBN"   "BBVA"  "BBW"   "BBX"  
## [181] "BBY"   "BC"    "BCC"   "BCE"   "BCEI"  "BCH"   "BCO"   "BCS"   "BCX"  
## [190] "BDC"   "BDJ"   "BDN"   "BDX"   "BEN"   "BEP"   "BERY"  "BFAM"  "BFK"  
## [199] "BFO"   "BFS"   "BFZ"   "BG"    "BGB"   "BGG"   "BGH"   "BGR"   "BGS"  
## [208] "BGT"   "BGX"   "BGY"   "BH"    "BHE"   "BHK"   "BHLB"  "BHP"   "BIF"  
## [217] "BIG"   "BIO"   "BIP"   "BIT"   "BITA"  "BK"    "BKD"   "BKE"   "BKH"  
## [226] "BKK"   "BKN"   "BKT"   "BKU"   "BLK"   "BLL"   "BLW"   "BLX"   "BMA"  
## [235] "BME"   "BMI"   "BMO"   "BMY"   "BNS"   "BNY"   "BOE"   "BOH"   "BOOT" 
## [244] "BP"    "BPT"   "BPY"   "BQH"   "BR"    "BRC"   "BRFS"  "BRO"   "BRP"  
## [253] "BRT"   "BRX"   "BSAC"  "BSBR"  "BSD"   "BSE"   "BSL"   "BSMX"  "BST"  
## [262] "BSX"   "BTA"   "BTE"   "BTO"   "BTT"   "BTU"   "BTZ"   "BUD"   "BUI"  
## [271] "BURL"  "BVN"   "BWA"   "BWG"   "BX"    "BXC"   "BXMT"  "BXMX"  "BXP"  
## [280] "BXS"   "BYD"   "BYM"   "BZH"   "C"     "CC"    "CL"    "CN"    "CP"   
## [289] "CB"    "CACI"  "CAE"

# Bar chart of the mean opening price per ticker, using only the days on which
# the opening price was strictly between 100 and 200
data_pro %>%
  filter(open_price > 100, open_price < 200) %>%
  group_by(sname) %>%
  summarise(Avg_price = mean(open_price)) %>%
  ggplot(aes(x = sname, y = Avg_price)) +
  geom_col()

# left_join(data_price,data_indus,by = c("sname"="CODE"))  %>% filter(Avg_price > 100 & Avg_price < 200  ) %>% ggplot(mapping = aes(Sector)) + geom_bar()

# Checking, with only 500 stocks of data, how the data is distributed across
# the sectors: one bar per sector, counting the tickers matched to it

data_price %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  ggplot(aes(Sector)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1))

# Reshape a per-ticker average-price table (yearly or monthly) for plotting:
# spread the years into columns, attach sector and company name from
# data_indus, drop tickers without a sector, then stack the year columns back
# into long form (Year becomes a character column).
#
# The year columns are picked up by their four-digit names rather than a
# hard-coded 2015-2020 list, so data for later years is not silently left out.
# The wide-then-long round trip is kept on purpose: it produces an explicit NA
# row for every ticker/period combination that has no data, which is what the
# "Removed ... rows" warnings recorded below refer to.
#
# prices: a table with columns Year, sname and Avg_price (optionally Month).
# Returns the long table with Sector, NAME, Year and Avg_price columns.
sector_prices_long <- function(prices) {
  prices %>%
    spread(Year, Avg_price) %>%
    left_join(data_indus, by = c("sname" = "CODE")) %>%
    filter(!is.na(Sector)) %>%
    pivot_longer(
      cols = matches("^[0-9]{4}$"),
      names_to = "Year",
      values_to = "Avg_price"
    )
}

# Reshape each table once and reuse it for all plots below
yearly_by_sector <- sector_prices_long(data_price_year)
monthly_by_sector <- sector_prices_long(data_price_month)

# Tilted x-axis labels shared by the bar charts so sector names stay readable
tilted_x_labels <- theme(axis.text.x = element_text(angle = 70, hjust = 1))

# Row counts per sector, one panel per year
yearly_by_sector %>%
  ggplot(mapping = aes(Sector, col = Year)) +
  geom_bar() +
  tilted_x_labels +
  facet_wrap("Year")

# Yearly average prices stacked per sector, coloured by year
yearly_by_sector %>%
  ggplot(mapping = aes(Sector, Avg_price, fill = Year)) +
  geom_col() +
  tilted_x_labels

#+facet_wrap('Year')

# The sector graphs below, by month and year, show some pattern.
# We will do some analysis to see how stocks from some of these industries fit AR and MA models.

# Monthly averages strictly between 100 and 200, dodged by year
monthly_by_sector %>%
  filter(Avg_price > 100 & Avg_price < 200) %>%
  ggplot(mapping = aes(Sector, Avg_price, fill = Year)) +
  geom_col(position = "dodge2") +
  tilted_x_labels

#+ geom_bar(position = "dodge2")
#+facet_wrap('Month')
# By Month 
monthly_by_sector %>%
  ggplot(mapping = aes(Sector, Avg_price, fill = Month)) +
  geom_col(position = "dodge") +
  tilted_x_labels

## Warning: Removed 2904 rows containing missing values (geom_col).

# By Year 
monthly_by_sector %>%
  ggplot(mapping = aes(Sector, Avg_price, fill = Year)) +
  geom_col(position = "dodge") +
  tilted_x_labels

## Warning: Removed 2904 rows containing missing values (geom_col).

# Box plot per sector and year; the y-axis is capped at 200 and the word
# "Sector" is stripped from the axis labels
monthly_by_sector %>%
  ggplot(mapping = aes(stringr::str_remove(Sector, "Sector"), Avg_price, fill = Year)) +
  geom_boxplot(position = "dodge") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(title = "Boxplot of Stocks by Year ") +
  ylim(0, 200) +
  xlab("Sector")

## Warning: Removed 3518 rows containing non-finite values (stat_boxplot).

# ----- TEMP
# spread(data_price_month,Year,Avg_price) %>% left_join(.,data_indus,by = c("sname"="CODE"))  %>% filter(!is.na(Sector)) %>%  pivot_longer(c('2015','2016','2017','2018','2019','2020'),"Year",values_to = "Avg_price" ) %>%   ggplot(mapping = aes(Month,Avg_price,fill=Year)) + geom_col() + theme(axis.text.x = element_text(angle = 70, hjust = 1)) +   geom_bar(position = "dodge2")
#   # ggplot(mapping = aes(Avg_price,fill=Month)) +   geom_histogram(position = "fill")
#   # ggplot( aes(Month, Avg_price)) + geom_area(aes(fill = Month))
#    # ggplot(mapping = aes(Month , Avg_price,group=Sector)) + geom_line(aes(colour = Sector), position = "stack") + geom_point(aes(colour = Sector), position = "stack") +  geom_area(aes(fill = Sector))


# Rank tickers by the mean of their monthly average prices and keep the 16
# highest rows; a ticker listed under two sectors appears once per sector.
# NOTE(review): the original heading said "top 3 stocks in each Sector", but no
# per-sector grouping is applied here -- confirm the intent.
library(dplyr)


data_price_month %>%
  group_by(sname) %>%
  summarise(Avg = mean(Avg_price)) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  arrange(desc(Avg)) %>%
  top_n(n = 16, wt = Avg)

## # A tibble: 16 x 4
##    sname   Avg Sector                 NAME                                      
##    <chr> <dbl> <chr>                  <chr>                                     
##  1 AZO    801. Consumer Cyclical Sec~ AutoZone, Inc                             
##  2 AZO    801. Consumer Defensive Se~ AutoZone, Inc                             
##  3 BLK    420. Financial Services Se~ BlackRock, Inc                            
##  4 BH     292. Consumer Cyclical Sec~ Biglari Holdings Inc                      
##  5 BH     292. Consumer Defensive Se~ Biglari Holdings Inc                      
##  6 BA     245. Industrials Sector     The Boeing Company                        
##  7 BIO    234. Healthcare Sector      Bio-Rad Laboratories, Inc                 
##  8 AGN    212. Healthcare Sector      Allergan plc                              
##  9 ADS    208. Financial Services Se~ Alliance Data Systems Corporation         
## 10 ANTM   206. Healthcare Sector      Anthem, Inc                               
## 11 BDX    203. Healthcare Sector      Becton, Dickinson and Company             
## 12 BAP    182. Financial Services Se~ Credicorp Ltd                             
## 13 CP     179. Industrials Sector     Canadian Pacific Railway Limited          
## 14 AYI    172. Industrials Sector     Acuity Brands, Inc                        
## 15 ANET   169. Technology Sector      Arista Networks, Inc                      
## 16 ASR    164. Industrials Sector     Grupo Aeroportuario del Sureste, S. A. B.~

# Same ranking as above, keeping the 20 highest rows; the tickers in this table
# are the ones carried forward into the time-series data set
data_stock <- data_price_month %>%
  group_by(sname) %>%
  summarise(Avg = mean(Avg_price)) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector)) %>%
  arrange(desc(Avg)) %>%
  top_n(n = 20, wt = Avg)

# Preview the highest-priced rows of the selection
data_stock %>% head()

## # A tibble: 6 x 4
##   sname   Avg Sector                    NAME                
##   <chr> <dbl> <chr>                     <chr>               
## 1 AZO    801. Consumer Cyclical Sector  AutoZone, Inc       
## 2 AZO    801. Consumer Defensive Sector AutoZone, Inc       
## 3 BLK    420. Financial Services Sector BlackRock, Inc      
## 4 BH     292. Consumer Cyclical Sector  Biglari Holdings Inc
## 5 BH     292. Consumer Defensive Sector Biglari Holdings Inc
## 6 BA     245. Industrials Sector        The Boeing Company

# Distinct (company name, ticker, sector) combinations in the selection, as an
# unnamed character matrix (deparse.level = 0 keeps the columns unlabelled)
with(data_stock, unique(cbind(NAME, sname, Sector, deparse.level = 0)))

##       [,1]                                                [,2]  
##  [1,] "AutoZone, Inc"                                     "AZO" 
##  [2,] "AutoZone, Inc"                                     "AZO" 
##  [3,] "BlackRock, Inc"                                    "BLK" 
##  [4,] "Biglari Holdings Inc"                              "BH"  
##  [5,] "Biglari Holdings Inc"                              "BH"  
##  [6,] "The Boeing Company"                                "BA"  
##  [7,] "Bio-Rad Laboratories, Inc"                         "BIO" 
##  [8,] "Allergan plc"                                      "AGN" 
##  [9,] "Alliance Data Systems Corporation"                 "ADS" 
## [10,] "Anthem, Inc"                                       "ANTM"
## [11,] "Becton, Dickinson and Company"                     "BDX" 
## [12,] "Credicorp Ltd"                                     "BAP" 
## [13,] "Canadian Pacific Railway Limited"                  "CP"  
## [14,] "Acuity Brands, Inc"                                "AYI" 
## [15,] "Arista Networks, Inc"                              "ANET"
## [16,] "Grupo Aeroportuario del Sureste, S. A. B. de C. V" "ASR" 
## [17,] "Air Products and Chemicals, Inc"                   "APD" 
## [18,] "CACI International Inc"                            "CACI"
## [19,] "Advance Auto Parts, Inc"                           "AAP" 
## [20,] "Advance Auto Parts, Inc"                           "AAP" 
##       [,3]                       
##  [1,] "Consumer Cyclical Sector" 
##  [2,] "Consumer Defensive Sector"
##  [3,] "Financial Services Sector"
##  [4,] "Consumer Cyclical Sector" 
##  [5,] "Consumer Defensive Sector"
##  [6,] "Industrials Sector"       
##  [7,] "Healthcare Sector"        
##  [8,] "Healthcare Sector"        
##  [9,] "Financial Services Sector"
## [10,] "Healthcare Sector"        
## [11,] "Healthcare Sector"        
## [12,] "Financial Services Sector"
## [13,] "Industrials Sector"       
## [14,] "Industrials Sector"       
## [15,] "Technology Sector"        
## [16,] "Industrials Sector"       
## [17,] "Basic Materials Sector"   
## [18,] "Technology Sector"        
## [19,] "Consumer Cyclical Sector" 
## [20,] "Consumer Defensive Sector"

  # ggplot(mapping = aes(Month,Avg_price,fill=Year)) + geom_col() + theme(axis.text.x = element_text(angle = 70, hjust = 1)) +   geom_bar(position = "dodge2")

library(stringr)
# We will study the price movement of some stocks from the Healthcare,
# Technology and Industrials sectors:
# ANTM Anthem, Inc
# ANET Arista Networks, Inc
# BA The Boeing Company

# Monthly averages with one column per year, joined to sector and company
# name; tickers without a sector are dropped
data_price_month %>%
  spread(key = Year, value = Avg_price) %>%
  left_join(data_indus, by = c("sname" = "CODE")) %>%
  filter(!is.na(Sector))

## # A tibble: 3,168 x 10
## # Groups:   Month [12]
##    Month sname `2015` `2016` `2017` `2018` `2019` `2020` Sector      NAME       
##    <chr> <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <chr>       <chr>      
##  1 Apr   AA      24.9   24.9   33.4   52.2   28.3   7.12 Basic Mate~ Alcoa Corp~
##  2 Apr   AAN     34.2   26.3   30.7   46.1   53.8  23.3  Industrial~ Aaron's, I~
##  3 Apr   AAP    145.   159.   144.   111.   175.  107.   Consumer C~ Advance Au~
##  4 Apr   AAP    145.   159.   144.   111.   175.  107.   Consumer D~ Advance Au~
##  5 Apr   AB      31.2   23.9   22.9   26.6   29.4  19.7  Financial ~ AllianceBe~
##  6 Apr   ABB     21.8   20.0   23.3   23.3   20.1  17.4  Industrial~ ABB Ltd    
##  7 Apr   ABBV    65.2   59.6   64.8   93.2   81.0  79.6  Healthcare~ AbbVie Inc 
##  8 Apr   ABC    115.    88.0   84.9   88.8   75.1  87.4  Healthcare~ Amerisourc~
##  9 Apr   ABG     86.0   58.1   59.9   67.8   74.9  53.4  Consumer C~ Asbury Aut~
## 10 Apr   ABG     86.0   58.1   59.9   67.8   74.9  53.4  Consumer D~ Asbury Aut~
## # ... with 3,158 more rows

# Per-column summary statistics of the raw price table (repeated for reference)
data_pro %>% summary()

##        X1           begins_at                     open_price     
##  Min.   :   0.0   Min.   :2015-04-28 00:00:00   Min.   :   0.00  
##  1st Qu.: 314.8   1st Qu.:2016-07-26 18:00:00   1st Qu.:  13.61  
##  Median : 629.5   Median :2017-10-24 12:00:00   Median :  26.33  
##  Mean   : 629.5   Mean   :2017-10-25 14:51:25   Mean   :  48.92  
##  3rd Qu.: 944.2   3rd Qu.:2019-01-25 18:00:00   3rd Qu.:  56.09  
##  Max.   :1259.0   Max.   :2020-04-27 00:00:00   Max.   :1266.56  
##   close_price          high_price        low_price          volume         
##  Min.   :   0.0052   Min.   :   0.00   Min.   :   0.0   Min.   :        0  
##  1st Qu.:  13.6100   1st Qu.:  13.72   1st Qu.:  13.5   1st Qu.:   103282  
##  Median :  26.3300   Median :  26.67   Median :  26.0   Median :   421838  
##  Mean   :  48.9221   Mean   :  49.43   Mean   :  48.4   Mean   :  1650980  
##  3rd Qu.:  56.1200   3rd Qu.:  56.68   3rd Qu.:  55.5   3rd Qu.:  1332573  
##  Max.   :1250.0000   Max.   :1274.41   Max.   :1232.0   Max.   :375088650  
##    session          interpolated       sname          
##  Length:370440      Mode :logical   Length:370440     
##  Class :character   FALSE:362193    Class :character  
##  Mode  :character   TRUE :8247      Mode  :character  
##                                                       
##                                                       
##

# Re-inspect the columns before keeping only the date, open_price and sname of
# the rows where interpolated is FALSE
data_pro %>% glimpse()

## Observations: 370,440
## Variables: 10
## $ X1           <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ begins_at    <dttm> 2015-04-28, 2015-04-29, 2015-04-30, 2015-05-01, 2015-...
## $ open_price   <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, ...
## $ close_price  <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, ...
## $ high_price   <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, ...
## $ low_price    <dbl> 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, 24.8871, ...
## $ volume       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ session      <chr> "reg", "reg", "reg", "reg", "reg", "reg", "reg", "reg"...
## $ interpolated <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, ...
## $ sname        <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", ...

# Working data set for the time-series analysis: rows not flagged as
# interpolated, for the tickers selected in data_stock, reduced to date,
# opening price and ticker.
# Columns are selected by name rather than by position (2, 3, 10) so the result
# does not change if the column order of data_pro ever does.
data_Main <- data_pro %>%
  filter(!interpolated) %>%
  select(begins_at, open_price, sname) %>%
  filter(sname %in% data_stock$sname)



# Preview the filtered long-format price data
data_Main %>% head()

## # A tibble: 6 x 3
##   begins_at           open_price sname
##   <dttm>                   <dbl> <chr>
## 1 2015-04-28 00:00:00       145  AAP  
## 2 2015-04-29 00:00:00       144. AAP  
## 3 2015-04-30 00:00:00       144. AAP  
## 4 2015-05-01 00:00:00       143. AAP  
## 5 2015-05-04 00:00:00       145. AAP  
## 6 2015-05-05 00:00:00       145. AAP

# Convert the stock data to wide format: one row per date, one opening-price
# column per ticker
wide_data_Main <- data_Main %>%
  spread(key = sname, value = open_price)

# Preview the wide table
wide_data_Main %>% head()

## # A tibble: 6 x 18
##   begins_at             AAP   ADS   AGN  ANET  ANTM   APD   ASR   AYI   AZO
##   <dttm>              <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2015-04-28 00:00:00  145   303   282.  64.8  150.  139.  152.  167.  692.
## 2 2015-04-29 00:00:00  144.  301.  286.  65.7  156.  140.  151.  165.  690 
## 3 2015-04-30 00:00:00  144.  300.  287.  64.6  152.  142.  150.  169.  683.
## 4 2015-05-01 00:00:00  143.  300.  285.  64.1  152.  133.  145.  167.  675.
## 5 2015-05-04 00:00:00  145.  300.  290.  64.2  153.  137.  147.  170.  683 
## 6 2015-05-05 00:00:00  145.  300   290.  64.7  155.  137.  144.  173.  682 
## # ... with 8 more variables: BA <dbl>, BAP <dbl>, BDX <dbl>, BH <dbl>,
## #   BIO <dbl>, BLK <dbl>, CACI <dbl>, CP <dbl>

# Prepare the series to which an AR model will be fitted: one date-indexed xts
# object of opening prices per ticker, covering the full sample
# ANTM Anthem, Inc
# ANET Arista Networks, Inc
# BA The Boeing Company
library(xts)
# wide_data_Main$begins_at  <- as_datetime(wide_data_Main$begins_at)
stocks_ANTM <- with(wide_data_Main, xts(ANTM, order.by = as.Date(begins_at)))
stocks_ANET <- with(wide_data_Main, xts(ANET, order.by = as.Date(begins_at)))
stocks_BA <- with(wide_data_Main, xts(BA, order.by = as.Date(begins_at)))

# Split the wide table into 2020 and all other years with one logical mask.
# Negating the mask, rather than using -which(...), stays correct when no row
# matches: -which() on an empty match selects zero rows instead of every row.
# %in% never yields NA, and the year is compared as a number, not as "2020".
is_2020 <- year(wide_data_Main$begins_at) %in% 2020

# Data for 2020 only
wide_data_Main_20 <- wide_data_Main[is_2020, ]
# Data for every year other than 2020
wide_data_Main_Old <- wide_data_Main[!is_2020, ]

# Date-indexed opening-price series built from the 2020 rows only
stocks_ANTM_MY <- with(wide_data_Main_20, xts(ANTM, order.by = as.Date(begins_at)))
stocks_ANET_MY <- with(wide_data_Main_20, xts(ANET, order.by = as.Date(begins_at)))
stocks_BA_MY <- with(wide_data_Main_20, xts(BA, order.by = as.Date(begins_at)))

# The same three series built from the remaining (non-2020) rows
stocks_ANTM_old <- with(wide_data_Main_Old, xts(ANTM, order.by = as.Date(begins_at)))
stocks_ANET_old <- with(wide_data_Main_Old, xts(ANET, order.by = as.Date(begins_at)))
stocks_BA_old <- with(wide_data_Main_Old, xts(BA, order.by = as.Date(begins_at)))

# Dates that index the 2020 ANTM series
index(stocks_ANTM_MY)

##  [1] "2020-01-02" "2020-01-03" "2020-01-06" "2020-01-07" "2020-01-08"
##  [6] "2020-01-09" "2020-01-10" "2020-01-13" "2020-01-14" "2020-01-15"
## [11] "2020-01-16" "2020-01-17" "2020-01-21" "2020-01-22" "2020-01-23"
## [16] "2020-01-24" "2020-01-27" "2020-01-28" "2020-01-29" "2020-01-30"
## [21] "2020-01-31" "2020-02-03" "2020-02-04" "2020-02-05" "2020-02-06"
## [26] "2020-02-07" "2020-02-10" "2020-02-11" "2020-02-12" "2020-02-13"
## [31] "2020-02-14" "2020-02-18" "2020-02-19" "2020-02-20" "2020-02-21"
## [36] "2020-02-24" "2020-02-25" "2020-02-26" "2020-02-27" "2020-02-28"
## [41] "2020-03-02" "2020-03-03" "2020-03-04" "2020-03-05" "2020-03-06"
## [46] "2020-03-09" "2020-03-10" "2020-03-11" "2020-03-12" "2020-03-13"
## [51] "2020-03-16" "2020-03-17" "2020-03-18" "2020-03-19" "2020-03-20"
## [56] "2020-03-23" "2020-03-24" "2020-03-25" "2020-03-26" "2020-03-27"
## [61] "2020-03-30" "2020-03-31" "2020-04-01" "2020-04-02" "2020-04-03"
## [66] "2020-04-06" "2020-04-07" "2020-04-08" "2020-04-09" "2020-04-13"
## [71] "2020-04-14" "2020-04-15" "2020-04-16" "2020-04-17" "2020-04-20"
## [76] "2020-04-21" "2020-04-22" "2020-04-23" "2020-04-24" "2020-04-27"

# Underlying price values of the 2020 ANTM series, without the date index
coredata(stocks_ANTM_MY)

##         [,1]
##  [1,] 302.67
##  [2,] 293.68
##  [3,] 295.75
##  [4,] 299.20
##  [5,] 300.89
##  [6,] 307.83
##  [7,] 307.94
##  [8,] 306.88
##  [9,] 296.04
## [10,] 297.26
## [11,] 302.92
## [12,] 305.07
## [13,] 303.94
## [14,] 306.68
## [15,] 303.46
## [16,] 304.84
## [17,] 293.63
## [18,] 285.00
## [19,] 279.50
## [20,] 269.26
## [21,] 265.68
## [22,] 266.11
## [23,] 269.90
## [24,] 275.36
## [25,] 288.78
## [26,] 279.70
## [27,] 275.88
## [28,] 277.04
## [29,] 286.27
## [30,] 294.73
## [31,] 299.29
## [32,] 297.45
## [33,] 302.01
## [34,] 301.38
## [35,] 292.03
## [36,] 283.49
## [37,] 279.21
## [38,] 269.84
## [39,] 263.06
## [40,] 249.94
## [41,] 259.60
## [42,] 270.13
## [43,] 286.55
## [44,] 287.65
## [45,] 278.18
## [46,] 264.00
## [47,] 278.33
## [48,] 278.60
## [49,] 262.03
## [50,] 267.50
## [51,] 226.50
## [52,] 229.76
## [53,] 224.45
## [54,] 206.22
## [55,] 204.29
## [56,] 188.54
## [57,] 183.98
## [58,] 190.09
## [59,] 221.28
## [60,] 217.61
## [61,] 224.66
## [62,] 235.28
## [63,] 217.01
## [64,] 210.53
## [65,] 208.89
## [66,] 215.51
## [67,] 235.70
## [68,] 227.15
## [69,] 240.94
## [70,] 240.78
## [71,] 245.76
## [72,] 249.15
## [73,] 254.51
## [74,] 279.01
## [75,] 262.48
## [76,] 255.00
## [77,] 255.09
## [78,] 264.71
## [79,] 265.48
## [80,] 267.56

# Earliest observation in the 2020 ANTM series
first(stocks_ANTM_MY)

##              [,1]
## 2020-01-02 302.67

# Empty-bracket subsetting selects everything, so this prints the whole
# 2020 ANTM series
stocks_ANTM_MY[]

##              [,1]
## 2020-01-02 302.67
## 2020-01-03 293.68
## 2020-01-06 295.75
## 2020-01-07 299.20
## 2020-01-08 300.89
## 2020-01-09 307.83
## 2020-01-10 307.94
## 2020-01-13 306.88
## 2020-01-14 296.04
## 2020-01-15 297.26
## 2020-01-16 302.92
## 2020-01-17 305.07
## 2020-01-21 303.94
## 2020-01-22 306.68
## 2020-01-23 303.46
## 2020-01-24 304.84
## 2020-01-27 293.63
## 2020-01-28 285.00
## 2020-01-29 279.50
## 2020-01-30 269.26
## 2020-01-31 265.68
## 2020-02-03 266.11
## 2020-02-04 269.90
## 2020-02-05 275.36
## 2020-02-06 288.78
## 2020-02-07 279.70
## 2020-02-10 275.88
## 2020-02-11 277.04
## 2020-02-12 286.27
## 2020-02-13 294.73
## 2020-02-14 299.29
## 2020-02-18 297.45
## 2020-02-19 302.01
## 2020-02-20 301.38
## 2020-02-21 292.03
## 2020-02-24 283.49
## 2020-02-25 279.21
## 2020-02-26 269.84
## 2020-02-27 263.06
## 2020-02-28 249.94
## 2020-03-02 259.60
## 2020-03-03 270.13
## 2020-03-04 286.55
## 2020-03-05 287.65
## 2020-03-06 278.18
## 2020-03-09 264.00
## 2020-03-10 278.33
## 2020-03-11 278.60
## 2020-03-12 262.03
## 2020-03-13 267.50
## 2020-03-16 226.50
## 2020-03-17 229.76
## 2020-03-18 224.45
## 2020-03-19 206.22
## 2020-03-20 204.29
## 2020-03-23 188.54
## 2020-03-24 183.98
## 2020-03-25 190.09
## 2020-03-26 221.28
## 2020-03-27 217.61
## 2020-03-30 224.66
## 2020-03-31 235.28
## 2020-04-01 217.01
## 2020-04-02 210.53
## 2020-04-03 208.89
## 2020-04-06 215.51
## 2020-04-07 235.70
## 2020-04-08 227.15
## 2020-04-09 240.94
## 2020-04-13 240.78
## 2020-04-14 245.76
## 2020-04-15 249.15
## 2020-04-16 254.51
## 2020-04-17 279.01
## 2020-04-20 262.48
## 2020-04-21 255.00
## 2020-04-22 255.09
## 2020-04-23 264.71
## 2020-04-24 265.48
## 2020-04-27 267.56

# Last 10 observations of the series.
tail(stocks_ANTM_MY,n=10)

##              [,1]
## 2020-04-14 245.76
## 2020-04-15 249.15
## 2020-04-16 254.51
## 2020-04-17 279.01
## 2020-04-20 262.48
## 2020-04-21 255.00
## 2020-04-22 255.09
## 2020-04-23 264.71
## 2020-04-24 265.48
## 2020-04-27 267.56

# Row positions of the last observation in each calendar month
# (the result starts with 0, followed by one end position per month).
endpoints(stocks_ANTM_MY,on="months")

## [1]  0 21 40 62 80

# Look up a single trading day by its date string.
stocks_ANTM_MY['2020-04-14']

##              [,1]
## 2020-04-14 245.76

# Month abbreviation for the index date of every observation.
month.abb[month(index(stocks_ANTM_MY))]

##  [1] "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan"
## [13] "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Jan" "Feb" "Feb" "Feb"
## [25] "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb" "Feb"
## [37] "Feb" "Feb" "Feb" "Feb" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar"
## [49] "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar"
## [61] "Mar" "Mar" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr"
## [73] "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr" "Apr"

# Number of calendar months covered by the series.
nmonths(stocks_ANTM_MY)

## [1] 4

# First 10 observations of the series.
head(stocks_ANTM_MY,n=10)

##              [,1]
## 2020-01-02 302.67
## 2020-01-03 293.68
## 2020-01-06 295.75
## 2020-01-07 299.20
## 2020-01-08 300.89
## 2020-01-09 307.83
## 2020-01-10 307.94
## 2020-01-13 306.88
## 2020-01-14 296.04
## 2020-01-15 297.26

# Weekly open/high/low/close prices of the 2020 series, one line per price
# type. The index date of each weekly bar is used on the x-axis instead of
# lubridate::week(): the last two bars both map to week 17, so week numbers
# would place two different bars at the same x position.
weekly_ANTM_MY <- to.weekly(stocks_ANTM_MY)
weekly_ANTM_MY %>%
  as.data.frame() %>%
  cbind(., Week = index(weekly_ANTM_MY)) %>%
  pivot_longer(
    c(
      stocks_ANTM_MY.Open, stocks_ANTM_MY.High,
      stocks_ANTM_MY.Low, stocks_ANTM_MY.Close
    ),
    names_to = "Key", values_to = "Prc"
  ) %>%
  ggplot(aes(x = Week, y = Prc, fill = Key)) +
  geom_line(aes(colour = Key))
# + geom_col(position = "dodge", alpha = 0.5)

# Monthly open/high/low/close prices of the 2020 series, drawn both as lines
# and as dodged bars. The monthly index is a zoo `yearmon`, for which ggplot2
# has no default scale; converting it to Date lets ggplot2 pick a date scale
# instead of falling back to a continuous one with a message.
monthly_ANTM_MY <- to.monthly(stocks_ANTM_MY)
monthly_ANTM_MY %>%
  as.data.frame() %>%
  cbind(., Month = as.Date(index(monthly_ANTM_MY))) %>%
  pivot_longer(
    c(
      stocks_ANTM_MY.Open, stocks_ANTM_MY.High,
      stocks_ANTM_MY.Low, stocks_ANTM_MY.Close
    ),
    names_to = "Key", values_to = "Prc"
  ) %>%
  ggplot(aes(x = Month, y = Prc, fill = Key)) +
  geom_line(aes(colour = Key)) +
  geom_col(position = "dodge", alpha = 0.5)

## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

# Report the sampling interval and the date range of the series.
periodicity(stocks_ANTM_MY)

## Daily periodicity from 2020-01-02 to 2020-04-27

# Inspect the internal structure of the xts object.
str(stocks_ANTM_MY)

## An 'xts' object on 2020-01-02/2020-04-27 containing:
##   Data: num [1:80, 1] 303 294 296 299 301 ...
##   Indexed by objects of class: [Date] TZ: UTC
##   xts Attributes:  
##  NULL

# Week-of-year number for the index date of each weekly bar.
# NOTE(review): the last two bars both map to week 17, so these numbers do
# not identify a bar uniquely.
week(index(to.weekly(stocks_ANTM_MY)))

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 17

# Select several specific days by passing a vector of date strings.
stocks_ANTM_MY[c("2020-01-02","2020-01-03")]

##              [,1]
## 2020-01-02 302.67
## 2020-01-03 293.68

# Keep only the observations dated on the 3rd day of a month.
stocks_ANTM_MY[mday(index(stocks_ANTM_MY)) == 3]

##              [,1]
## 2020-01-03 293.68
## 2020-02-03 266.11
## 2020-03-03 270.13
## 2020-04-03 208.89

# With head() and tail() we can see the first and last 6 rows of the series. A full quote download would have 6 columns (opening price, maximum and minimum prices, closing price, volume of transactions and adjusted price); the object used here holds a single price column. Using summary() we check the descriptive statistics of the series. str() returns the object structure: in this case it is an xts object, i.e. a time series.

library(forecast)

# First rows of the full ANTM series.
head(stocks_ANTM)

##              [,1]
## 2015-04-28 150.15
## 2015-04-29 155.96
## 2015-04-30 151.83
## 2015-05-01 151.92
## 2015-05-04 153.45
## 2015-05-05 154.67

# Last rows of the full ANTM series.
tail(stocks_ANTM)

##              [,1]
## 2020-04-20 262.48
## 2020-04-21 255.00
## 2020-04-22 255.09
## 2020-04-23 264.71
## 2020-04-24 265.48
## 2020-04-27 267.56

# Descriptive statistics of the index dates and of the price values.
summary(stocks_ANTM)

##      Index             stocks_ANTM   
##  Min.   :2015-04-28   Min.   :117.0  
##  1st Qu.:2016-07-26   1st Qu.:145.2  
##  Median :2017-10-24   Median :195.9  
##  Mean   :2017-10-25   Mean   :206.5  
##  3rd Qu.:2019-01-26   3rd Qu.:262.8  
##  Max.   :2020-04-27   Max.   :317.6

# Inspect the internal structure of the xts object.
str(stocks_ANTM)

## An 'xts' object on 2015-04-28/2020-04-27 containing:
##   Data: num [1:1259, 1] 150 156 152 152 153 ...
##   Indexed by objects of class: [Date] TZ: UTC
##   xts Attributes:  
##  NULL

# Plot the BA series.
plot(stocks_BA)

# Correlogram of ANTM with the default number of lags.
acf(stocks_ANTM)

# Correlograms up to lag 30 for each stock. Each call draws the plot and the
# returned acf object is kept. acf() returns the same object whether or not
# it plots, so acf_BA does not need to be recomputed with plot = FALSE.
acf_ANTM <- acf(stocks_ANTM, lag.max = 30)

acf_BA <- acf(stocks_BA, lag.max = 30)

acf_ANET <- acf(stocks_ANET, lag.max = 30)


# Time plot of the full ANTM series.
autoplot(stocks_ANTM)

# Frequency attribute recorded on the series.
frequency(stocks_ANTM)

## [1] 1

# Plot for 2020 Data only
# Correlogram (up to lag 30), time plot and first rows of the 2020 series.
acf(stocks_ANTM_MY,lag.max = 30)

autoplot(stocks_ANTM_MY)

head(stocks_ANTM_MY)

##              [,1]
## 2020-01-02 302.67
## 2020-01-03 293.68
## 2020-01-06 295.75
## 2020-01-07 299.20
## 2020-01-08 300.89
## 2020-01-09 307.83

# Frequency attribute recorded on the 2020 series.
frequency(stocks_ANTM_MY)

## [1] 1

# Same checks for the rest of the data (before 2020):
# correlogram, time plot and first rows.
acf(stocks_ANTM_old)

plot(stocks_ANTM_old)

head(stocks_ANTM_old)

##              [,1]
## 2015-04-28 150.15
## 2015-04-29 155.96
## 2015-04-30 151.83
## 2015-05-01 151.92
## 2015-05-04 153.45
## 2015-05-05 154.67

# Frequency attribute recorded on the pre-2020 series.
frequency(stocks_ANTM_old)

## [1] 1

The ACF plots test whether an individual lag autocorrelation is different from zero. An alternative approach is to use the Ljung-Box test, which tests whether any of a group of autocorrelations of a time series are different from zero. In essence it tests the “overall randomness” based on a number of lags. If the result is a small p-value, then it indicates the data are probably not white noise.

# For 2020 Data
# Ljung-Box test over the first 30 lags of the ANTM column.
Box.test(wide_data_Main_20$ANTM, lag = 30, type = "Ljung-Box")

## 
##  Box-Ljung test
## 
## data:  wide_data_Main_20$ANTM
## X-squared = 480.01, df = 30, p-value < 2.2e-16

# Ljung-Box test over the first 4 lags of the ANTM column of wide_data_Main.
Box.test(wide_data_Main$ANTM, lag = 4, type = "Ljung-Box")

## 
##  Box-Ljung test
## 
## data:  wide_data_Main$ANTM
## X-squared = 4976, df = 4, p-value < 2.2e-16

Here, we perform Ljung-Box tests on the first 30 lag autocorrelations of the 2020 data and on the first 4 lag autocorrelations of the wide_data_Main data. Both resulting p-values are significant at p < .001, so this supports our ACF plot consideration above, where we stated it’s likely this is not purely white noise and that some time series information exists in this data.

# Plot the ANTM, ANET and BA series together, using the full-history objects.
plot(cbind(stocks_ANTM,stocks_ANET,stocks_BA))

# Same plot for the *_MY objects (for ANTM this is the 2020 data).
plot(cbind(stocks_ANTM_MY,stocks_ANET_MY,stocks_BA_MY))

# Same plot for the *_old objects (for ANTM this is the data before 2020).
plot(cbind(stocks_ANTM_old,stocks_ANET_old,stocks_BA_old))

These plots suggest that the stocks improved on their position from mid-2016 through 2018, and then remained roughly constant until late 2019 and early 2020.

The trend is the long-term increase or decrease in the data. There is an increasing trend in the stock data. The seasonal pattern occurs when a time series is affected by seasonal factors such as the time of the year or the day of the week. The daily data of stocks_ANTM doesn’t show any seasonality in the graph.

The cycle occurs when the data exhibit rises and falls that are not of a fixed period. These fluctuations are usually due to economic conditions and are often related to the “business cycle”. We can see a few cycles in our stocks_ANTM data from 2015 to 2018, and then in 2020 we have a sudden drop due to COVID-19. (Source: https://afit-r.github.io/ts_exploration)

Autocorrelation of Time Series

Another way to look at time series data is to plot each observation against another observation that occurred some time previously. For example, we could plot y_t against y_{t-1}. This is called a lag plot because you are plotting the time series against lags of itself.

# Plot the first differences (change from one observation to the next) of
# each price series.
plot(diff(stocks_ANTM))

plot(diff(stocks_BA))

plot(diff(stocks_ANET))

# Keep the differenced series so that they can be plotted together.
dif_stocks_ANTM <- diff(stocks_ANTM)
dif_stocks_ANET <- diff(stocks_ANET)
dif_stocks_BA <- diff(stocks_BA)

plot(cbind(dif_stocks_ANTM,dif_stocks_ANET,dif_stocks_BA))

# Correlation between the stocks
# BA vs ANET, computed on the xts series.
cor(stocks_BA,stocks_ANET)

##           [,1]
## [1,] 0.9344986

# ANET vs ANTM on the wide_data_Main table.
cor(wide_data_Main$ANET,wide_data_Main$ANTM)

## [1] 0.8950863

# ANET vs ANTM on the 2020 table.
cor(wide_data_Main_20$ANET,wide_data_Main_20$ANTM)

## [1] 0.6139195

# ANET vs ANTM on wide_data_Main_Old (presumably the pre-2020 rows; confirm
# against where the table is built).
cor(wide_data_Main_Old$ANET,wide_data_Main_Old$ANTM)

## [1] 0.9075179

# BA vs ANTM, computed on the xts series.
cor(stocks_BA,stocks_ANTM)

##           [,1]
## [1,] 0.9196915

# BA vs ANTM on the wide_data_Main table.
cor(wide_data_Main$BA,wide_data_Main$ANTM)

## [1] 0.9196915

# BA vs ANTM on the 2020 table.
cor(wide_data_Main_20$BA,wide_data_Main_20$ANTM)

## [1] 0.8360629

# BA vs ANTM on wide_data_Main_Old (presumably the pre-2020 rows; confirm
# against where the table is built).
cor(wide_data_Main_Old$BA,wide_data_Main_Old$ANTM)

## [1] 0.9595244

# Scatterplot matrix of the ANTM, BA and ANET columns for each of the three
# wide tables in turn.
wide_data_Main[, c("ANTM", "BA", "ANET")] %>%
  as.matrix() %>%
  psych::pairs.panels()

wide_data_Main_Old[, c("ANTM", "BA", "ANET")] %>%
  as.matrix() %>%
  psych::pairs.panels()

wide_data_Main_20[, c("ANTM", "BA", "ANET")] %>%
  as.matrix() %>%
  psych::pairs.panels()

# Confirm the class of the series object.
class(stocks_ANTM)

## [1] "xts" "zoo"

# Time plot of ANTM using the xts plotting method directly.
plot.xts(stocks_ANTM)

# Correlograms: ANTM up to lag 30, then ANET and BA with the default lags.
acf(stocks_ANTM,lag.max = 30)

acf(stocks_ANET)

acf(stocks_BA)

# First differences of ANTM after converting the series to a zoo object.
plot(diff(as.zoo(stocks_ANTM)))

# First rows of the zoo version of the series.
head(as.zoo(stocks_ANTM))

##                  
## 2015-04-28 150.15
## 2015-04-29 155.96
## 2015-04-30 151.83
## 2015-05-01 151.92
## 2015-05-04 153.45
## 2015-05-05 154.67

White noise: time series that show no autocorrelation are called “white noise”. The above plots show that this series behaves like a random walk, and the moving average (MA) model should give better estimates of this index.

#   For a given time series x we can fit the autoregressive (AR) model using the arima() command and setting order equal to c(1, 0, 0). Note for reference that an AR model is an ARIMA(1, 0, 0) model.

# Fit with Full Data
# plot.ts(stocks_ANTM)
# AR(1): fit the model, then recover the fitted values as
# observed minus residual.
AR_ANTM <- arima(stocks_ANTM, order = c(1, 0, 0))
AR_ANTM_fit <- as.ts(stocks_ANTM) - residuals(AR_ANTM)
# MA(1): same recipe.
MA_ANTM <- arima(stocks_ANTM, order = c(0, 0, 1))
MA_ANTM_fit <- as.ts(stocks_ANTM) - residuals(MA_ANTM)
summary(AR_ANTM)

## 
## Call:
## arima(x = stocks_ANTM, order = c(1, 0, 0))
## 
## Coefficients:
##          ar1  intercept
##       0.9978   222.5894
## s.e.  0.0018    45.1308
## 
## sigma^2 estimated as 16.88:  log likelihood = -3568.12,  aic = 7142.25
## 
## Training set error measures:
##                     ME     RMSE      MAE         MPE    MAPE     MASE
## Training set 0.0542719 4.108244 2.529343 -0.01210249 1.19899 1.002039
##                    ACF1
## Training set -0.0209907

summary(MA_ANTM)

## 
## Call:
## arima(x = stocks_ANTM, order = c(0, 0, 1))
## 
## Coefficients:
##          ma1  intercept
##       0.9678   206.4969
## s.e.  0.0058     1.7300
## 
## sigma^2 estimated as 973.9:  log likelihood = -6119.59,  aic = 12245.19
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE     MAPE     MASE      ACF1
## Training set 0.01566924 31.20706 27.96412 -4.741534 15.01558 11.07842 0.9108069

# points(AR_ANTM_fit, type = "l", col = 4, lty = 2)
# points(MA_ANTM_fit, type = "l", col = 3, lty = 3)


# Observed ANTM prices with the AR and MA fitted values overlaid. The three
# series are gathered into one data frame so that every aesthetic maps to a
# column of the plotted data.
data.frame(
  date = index(stocks_ANTM),
  base = as.numeric(coredata(stocks_ANTM)),
  ar_fit = as.numeric(AR_ANTM_fit),
  ma_fit = as.numeric(MA_ANTM_fit)
) %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = base, color = "BASE")) +
  geom_line(aes(y = ar_fit, color = "AR Fit")) +
  geom_line(aes(y = ma_fit, color = "MA Fit")) +
  ggtitle("Anthem(ANTM) from 2015-20") +
  scale_x_date(date_labels = "%b %y", date_breaks = "3 months") +
  xlab("Date") +
  ylab("Price")

  # scale_colour_manual("Series", values=c("AR Fit"="gray40", "MA Fit"="firebrick4", "BASE"="darkcyan"))

# Fit with only the 2020 data
# plot.ts(stocks_ANTM_MY)
AR_ANTM_MY <- arima(stocks_ANTM_MY, order = c(1, 0, 0))
MA_ANTM_MY <- arima(stocks_ANTM_MY, order = c(0, 0, 1))
# Fitted values are recovered as observed minus residual.
AR_ANTM_MY_fit <- as.ts(stocks_ANTM_MY) - resid(AR_ANTM_MY)
MA_ANTM_MY_fit <- as.ts(stocks_ANTM_MY) - resid(MA_ANTM_MY)
# points(AR_ANTM_MY_fit, type = "l", col = 4, lty = 2)
# points(MA_ANTM_MY_fit, type = "l", col = 3, lty = 3)


# Observed 2020 prices with the AR and MA fitted values overlaid.
# The series spans only about four months, so the axis is labelled monthly;
# 3-month breaks would leave the axis with almost no tick labels.
ggplot(stocks_ANTM_MY, aes(x = index(stocks_ANTM_MY))) +
  geom_line(aes(y = coredata(stocks_ANTM_MY), color = "BASE")) +
  geom_line(aes(y = AR_ANTM_MY_fit, color = "AR Fit")) +
  geom_line(aes(y = MA_ANTM_MY_fit, color = "MA Fit")) +
  ggtitle("Anthem(ANTM) from 2020") +
  scale_x_date(date_labels = "%b %y", date_breaks = "1 month") +
  xlab("Date") + ylab("Price")

# Fit with Data before 2020
# AR(1) and MA(1) fits; fitted values are observed minus residual.
AR_ANTM_old <- arima(stocks_ANTM_old, order = c(1, 0, 0))
AR_ANTM_old_fit <- as.ts(stocks_ANTM_old) - residuals(AR_ANTM_old)
MA_ANTM_old <- arima(stocks_ANTM_old, order = c(0, 0, 1))
MA_ANTM_old_fit <- as.ts(stocks_ANTM_old) - residuals(MA_ANTM_old)

# Observed pre-2020 prices with both fitted series overlaid, drawn from a
# single data frame so that every aesthetic maps to a column.
data.frame(
  date = index(stocks_ANTM_old),
  base = as.numeric(coredata(stocks_ANTM_old)),
  ar_fit = as.numeric(AR_ANTM_old_fit),
  ma_fit = as.numeric(MA_ANTM_old_fit)
) %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = base, color = "BASE")) +
  geom_line(aes(y = ar_fit, color = "AR Fit")) +
  geom_line(aes(y = ma_fit, color = "MA Fit")) +
  ggtitle("Anthem(ANTM) Before 2020") +
  scale_x_date(date_labels = "%b %y", date_breaks = "3 months") +
  xlab("Date") +
  ylab("Price")

Predict

We will evaluate all the data models and see their predictions using both models with the current year's data.

# Make a 1-step through 10-step forecast based on the AR model
predict(AR_ANTM,n.ahead = 10)

## $pred
## Time Series:
## Start = 1260 
## End = 1269 
## Frequency = 1 
##  [1] 267.4619 267.3640 267.2663 267.1688 267.0716 266.9745 266.8777 266.7811
##  [9] 266.6846 266.5884
## 
## $se
## Time Series:
## Start = 1260 
## End = 1269 
## Frequency = 1 
##  [1]  4.108244  5.803600  7.100186  8.189663  9.146360 10.008444 10.798612
##  [8] 11.531670 12.217916 12.864855

# Plot the series plus the forecast and approximate 95% prediction intervals

# Forecast 300 steps ahead once and reuse the result for both the point
# forecasts and their standard errors.
AR_prediction <- predict(AR_ANTM, n.ahead = 300)
AR_forecasts <- AR_prediction$pred
AR_forecast_se <- AR_prediction$se
AR_lower <- AR_forecasts - 2 * AR_forecast_se
AR_upper <- AR_forecasts + 2 * AR_forecast_se
# The forecasts start right after the last observation, so both axes are
# widened beyond the observed data; otherwise the forecast lines fall
# outside the plotting region.
plot.ts(
  stocks_ANTM,
  xlim = c(start(as.ts(stocks_ANTM))[1], end(AR_forecasts)[1]),
  ylim = range(as.numeric(stocks_ANTM), AR_lower, AR_upper)
)
lines(AR_forecasts, col = 4, lty = 2)
# points(AR_forecasts - AR_forecast_se, type = "l", col = 2, lty = 1)
lines(AR_lower, col = 2, lty = 1)
lines(AR_upper, col = 2, lty = 1)

#--------------------
library(forecast)
# Use the fitted AR(1) model to forecast the next 30 observations with a
# 99.5% prediction interval, via forecast().
AR_ANTM_forcast <- forecast(AR_ANTM, h=30,level=c(99.5))
# Plot the observed series together with the 30-step forecast and its
# interval from the AR(1), i.e. ARIMA(1,0,0), model.
plot(AR_ANTM_forcast)

#--------------------

# Plot of the original data set and the prediction of 2020 based on old data

# Run the 300-step forecast once; the point forecasts and the standard errors
# both come from the same predict() result.
AR_old_prediction <- predict(AR_ANTM_old, n.ahead = 300)
AR_old_forecasts <- AR_old_prediction$pred
AR_old_forecast_se <- AR_old_prediction$se
plot.ts(stocks_ANTM)
lines(AR_old_forecasts, col = 4, lty = 2)
# Approximate 95% band: forecast +/- 2 standard errors.
lines(AR_old_forecasts - 2 * AR_old_forecast_se, col = 2, lty = 1)
lines(AR_old_forecasts + 2 * AR_old_forecast_se, col = 2, lty = 1)

# MA (moving average) model
# Run the 300-step forecast once; the point forecasts and the standard errors
# both come from the same predict() result.
MA_old_prediction <- predict(MA_ANTM_old, n.ahead = 300)
MA_old_forecasts <- MA_old_prediction$pred
MA_old_forecast_se <- MA_old_prediction$se
plot.ts(stocks_ANTM)
lines(MA_old_forecasts, col = 4, lty = 2)
# Approximate 95% band: forecast +/- 2 standard errors.
lines(MA_old_forecasts - 2 * MA_old_forecast_se, col = 2, lty = 1)
lines(MA_old_forecasts + 2 * MA_old_forecast_se, col = 2, lty = 1)

# @----------------------
# Forecast the 2020-only AR model 5 steps ahead. The point forecasts and the
# standard errors are taken from the same predict() call so that the interval
# bands cover all 5 forecast steps.
AR_MY_prediction <- predict(AR_ANTM_MY, n.ahead = 5)
AR_MY_forecasts <- AR_MY_prediction$pred
AR_MY_forecast_se <- AR_MY_prediction$se
AR_MY_lower <- AR_MY_forecasts - 2 * AR_MY_forecast_se
AR_MY_upper <- AR_MY_forecasts + 2 * AR_MY_forecast_se
# The forecasts start right after the last observation, so both axes are
# widened beyond the observed data; otherwise the forecast lines fall
# outside the plotting region.
plot.ts(
  stocks_ANTM_MY,
  xlim = c(start(as.ts(stocks_ANTM_MY))[1], end(AR_MY_forecasts)[1]),
  ylim = range(as.numeric(stocks_ANTM_MY), AR_MY_lower, AR_MY_upper)
)
lines(AR_MY_forecasts, col = 4, lty = 2)
lines(AR_MY_lower, col = 2, lty = 1)
lines(AR_MY_upper, col = 2, lty = 1)

# Rows and columns of the raw data matrix behind the 2020 series.
dim(coredata(stocks_ANTM_MY))

## [1] 80  1

# Number of forecast steps held in AR_MY_forecasts.
length(AR_MY_forecasts)

## [1] 5

# TODO: reduce the date range to see how it goes for the month of March
# TODO: plot the correlation plot with the other shares on top
# ggplot(stocks_ANTM_MY, aes(x = index(stocks_ANTM_MY))) 

  # autoplot(AR_ANTM_forcast) + geom_smooth()
  # geom_line(aes(y= coredata(stocks_ANTM_MY) , color="BASE")) +
  #   ggtitle("Anthem(ANTM): Predicted next 30") +
  # scale_x_date(date_labels = "%b %y", date_breaks = "3 months")+
  # xlab("Date") + ylab("Price") 
  # 
  # geom_line(aes(y = AR_forecasts, color = "EXACT")) + 
  # geom_line(aes(y = AR_forecasts - 2*AR_forecast_se, color = "-Range"))+
  # geom_line(aes(y = AR_forecasts + 2*AR_forecast_se, color = "+Range"))+
  # ggtitle("Anthem(ANTM): Predicted next 30") +
  # scale_x_date(date_labels = "%b %y", date_breaks = "3 months")+
  # xlab("Date") + ylab("Price") 


# Compare the AR and MA fits on the full data by AIC (lower is better).
AIC(AR_ANTM,MA_ANTM)

##         df       AIC
## AR_ANTM  3  7142.246
## MA_ANTM  3 12245.188

# Same comparison by BIC (lower is better).
BIC(AR_ANTM,MA_ANTM)

##         df      BIC
## AR_ANTM  3  7157.66
## MA_ANTM  3 12260.60

# Compare the AR and MA fits on the 2020 data by AIC (lower is better).
AIC(AR_ANTM_MY,MA_ANTM_MY)

##            df      AIC
## AR_ANTM_MY  3 608.5171
## MA_ANTM_MY  3 707.9615

# Same comparison by BIC (lower is better).
BIC(AR_ANTM_MY,MA_ANTM_MY)

##            df      BIC
## AR_ANTM_MY  3 615.6632
## MA_ANTM_MY  3 715.1076

# Compare the AR and MA fits on the pre-2020 data by AIC (lower is better).
AIC(AR_ANTM_old,MA_ANTM_old)

##             df      AIC
## AR_ANTM_old  3  6151.07
## MA_ANTM_old  3 11435.55

# Same comparison by BIC (lower is better).
BIC(AR_ANTM_old,MA_ANTM_old)

##             df       BIC
## AR_ANTM_old  3  6166.287
## MA_ANTM_old  3 11450.772

Data 621 final project

Priya Shaji

4/18/2020

Number of ZERO

Let only Target shares that are between 100 and 200 open prices

Autocorrelation of Time Series

Predict