0. Load Libraries and Define Functions

# 0.0  Load libraries ----
# Loads tidyquant, lubridate, xts, quantmod, TTR, and PerformanceAnalytics
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyquant)  
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.28     ✔ xts                  0.14.1── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
#0.1 Define functions ----
#The following functions explicitly compute annual, monthly and weekly returns
get_annual_returns <- function(stock.returns) {
  stock.returns %>%
    tq_transmute(select     = adjusted, 
                 mutate_fun = periodReturn, 
                 type       = "log", 
                 period     = "yearly")
}

get_monthly_returns <- function(stock.returns) {
  stock.returns %>%
    tq_transmute(select     = adjusted, 
                 mutate_fun = periodReturn, 
                 type       = "log", 
                 period     = "monthly")
}

get_weekly_returns <- function(stock.returns) {
  stock.returns %>%
    tq_transmute(select     = adjusted, 
                 mutate_fun = periodReturn, 
                 type       = "log", 
                 period     = "weekly")
}

get_daily_returns <- function(stock.returns) {
  stock.returns %>%
    tq_transmute(select     = adjusted, 
                 mutate_fun = periodReturn, 
                 type       = "log", 
                 period     = "daily")
}

1. Collect stock price data from yahoo —-

1.1 Get data on all stocks in the SP00 —-

stocks_tbl <- tq_index("SP500")
## Getting holdings for SP500
stocks_tbl
## # A tibble: 504 × 8
##    symbol company      identifier sedol weight sector shares_held local_currency
##    <chr>  <chr>        <chr>      <chr>  <dbl> <chr>        <dbl> <chr>         
##  1 NVDA   NVIDIA CORP  67066G104  2379… 0.0725 -        292827172 USD           
##  2 AAPL   APPLE INC    037833100  2046… 0.0702 -        178100766 USD           
##  3 MSFT   MICROSOFT C… 594918104  2588… 0.0616 -         89205769 USD           
##  4 AMZN   AMAZON.COM … 023135106  2000… 0.0373 -        116468378 USD           
##  5 GOOGL  ALPHABET IN… 02079K305  BYVY… 0.0300 -         69807871 USD           
##  6 AVGO   BROADCOM INC 11135F101  BDZ7… 0.0267 -         56445022 USD           
##  7 META   META PLATFO… 30303M102  B7TL… 0.0245 -         26028317 USD           
##  8 GOOG   ALPHABET IN… 02079K107  BYY8… 0.0242 -         56039675 USD           
##  9 TSLA   TESLA INC    88160R101  B616… 0.0228 -         33674627 USD           
## 10 BRK-B  BERKSHIRE H… 084670702  2073… 0.0161 -         22004519 USD           
## # ℹ 494 more rows
# # Symbol "BRK.B" gives error from tq_get  
# # Change to "BRK-B"
# rowindex_0<-which(stocks_tbl$symbol=="BRK.B")
# stocks_tbl$symbol[rowindex_0]<-"BRK-B"
# 
# # Change to "BF.B" to "BF-B"
# rowindex_0<-which(stocks_tbl$symbol=="BF.B")
# stocks_tbl$symbol[rowindex_0]<-"BF-B"
# 
# # Drop  symbol "-"
index_dash<-which(stocks_tbl$symbol=="-")  
stocks_tbl0<-stocks_tbl[-c( index_dash),]  

# Collect all symbols in stocks_tbl0

stocks_symbols<-stocks_tbl0$symbol

1.2 Set start and end dates for price data —-

date_start= "2019-01-01"
date_end= "2024-08-31"


# Collecting data from large group can take
# significant amount of time
#     (on 9/12/2023, less than 3 minutes)
# To monitor progress we split large group into subgroups

1.3 Split large group of stocks into subgroups —-

stocks_tbl0$group<-floor((c(1:nrow(stocks_tbl0))-1)/50)


list_group<-unique(stocks_tbl0$group)

#for (group0 in list_group[-c(1:4)]){
  
for (group0 in list_group) {
  stocks_data <- stocks_tbl0 %>%
    filter(group==group0) %>%
    select(symbol, company) %>%
    tq_get(from = date_start, to=date_end)
assign(paste("stocks_data",group0,sep="."),
       stocks_data)
print(c("End for group ", as.character(group0)))
}
## [1] "End for group " "0"             
## [1] "End for group " "1"             
## [1] "End for group " "2"             
## [1] "End for group " "3"             
## [1] "End for group " "4"
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `data.. = purrr::map(...)`.
## Caused by warning:
## ! x = 'SNDK', get = 'stock.prices': Error in getSymbols.yahoo(Symbols = "SNDK", env = <environment>, verbose = FALSE, : Unable to import "SNDK".
## cannot open the connection
##  Removing SNDK.
## [1] "End for group " "5"             
## [1] "End for group " "6"             
## [1] "End for group " "7"
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `data.. = purrr::map(...)`.
## Caused by warning:
## ! x = 'Q', get = 'stock.prices': Error in getSymbols.yahoo(Symbols = "Q", env = <environment>, verbose = FALSE, : Unable to import "Q".
## cannot open the connection
##  Removing Q.
## [1] "End for group " "8"
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `data.. = purrr::map(...)`.
## Caused by warning:
## ! x = 'SOLS', get = 'stock.prices': Error in getSymbols.yahoo(Symbols = "SOLS", env = <environment>, verbose = FALSE, : Unable to import "SOLS".
## cannot open the connection
##  Removing SOLS.
## [1] "End for group " "9"             
## [1] "End for group " "10"
# If any warnings occur, determine which symbol generated message
# Revise symbol name (as above, replacing "." by "-", or delete
# as was case for symbol = "-", which corresponded to US dollar).

1.4 Bind together data from all groups

stocks_data<-rbind(
  stocks_data.0,
  stocks_data.1,
  stocks_data.2,
  stocks_data.3,
  stocks_data.4,
  stocks_data.5,
  stocks_data.6,
  stocks_data.7,
  stocks_data.8,
  stocks_data.9,
  stocks_data.10)
dim(stocks_data)
## [1] 701329      9

1.5 Convert stocks_data to time series matrix

stocks_data1 <- stocks_data %>% 
  dplyr::select(symbol, date, adjusted) %>% 
  dplyr::filter(!is.na(adjusted)) %>% 
  dplyr::rename(price = adjusted)
  

stocks_tsmatrix<- stocks_data1 %>%
  pivot_wider(names_from = symbol, values_from = price)

dim(stocks_tsmatrix)
## [1] 1426  501
# On 9/10/2024
# dim(stocks_tsmatrix)
# [1] 1426  504
names(stocks_tsmatrix)
##   [1] "date"  "NVDA"  "AAPL"  "MSFT"  "AMZN"  "GOOGL" "AVGO"  "META"  "GOOG" 
##  [10] "TSLA"  "BRK-B" "JPM"   "LLY"   "V"     "WMT"   "JNJ"   "XOM"   "MA"   
##  [19] "PLTR"  "NFLX"  "ABBV"  "COST"  "BAC"   "HD"    "PG"    "AMD"   "GE"   
##  [28] "CSCO"  "UNH"   "WFC"   "ORCL"  "CVX"   "IBM"   "KO"    "GS"    "CAT"  
##  [37] "MU"    "MRK"   "PM"    "CRM"   "RTX"   "MCD"   "ABT"   "TMO"   "MS"   
##  [46] "PEP"   "C"     "AXP"   "ISRG"  "AMAT"  "DIS"   "LIN"   "LRCX"  "QCOM" 
##  [55] "INTU"  "APP"   "AMGN"  "T"     "TJX"   "BKNG"  "VZ"    "ACN"   "GEV"  
##  [64] "NEE"   "UBER"  "SCHW"  "NOW"   "BA"    "SPGI"  "TXN"   "INTC"  "KLAC" 
##  [73] "APH"   "BLK"   "COF"   "GILD"  "ADBE"  "PFE"   "DHR"   "UNP"   "BSX"  
##  [82] "LOW"   "ADI"   "PGR"   "HON"   "WELL"  "ANET"  "MDT"   "ETN"   "PANW" 
##  [91] "SYK"   "DE"    "PLD"   "COP"   "CRWD"  "CB"    "VRTX"  "BX"    "CMCSA"
## [100] "PH"    "NEM"   "BMY"   "ADP"   "CEG"   "MCK"   "MO"    "CVS"   "CME"  
## [109] "LMT"   "SBUX"  "TMUS"  "SO"    "ICE"   "MMC"   "DUK"   "KKR"   "HOOD" 
## [118] "MMM"   "CDNS"  "TT"    "GD"    "SNPS"  "USB"   "AMT"   "PNC"   "DASH" 
## [127] "WM"    "BK"    "ORLY"  "NKE"   "ELV"   "HCA"   "HWM"   "MCO"   "GM"   
## [136] "REGN"  "JCI"   "SHW"   "NOC"   "AON"   "UPS"   "EMR"   "EQIX"  "CI"   
## [145] "RCL"   "WMB"   "TDG"   "MAR"   "WBD"   "MDLZ"  "HLT"   "FCX"   "CMI"  
## [154] "CSX"   "APO"   "ITW"   "ECL"   "GLW"   "COR"   "NSC"   "TEL"   "TRV"  
## [163] "TFC"   "CL"    "AJG"   "CTAS"  "ADSK"  "MSI"   "PWR"   "AEP"   "FDX"  
## [172] "SPG"   "ROST"  "STX"   "PCAR"  "WDC"   "PYPL"  "SLB"   "ABNB"  "EOG"  
## [181] "AZO"   "SRE"   "NXPI"  "BDX"   "IDXX"  "COIN"  "ALL"   "APD"   "ZTS"  
## [190] "PSX"   "AFL"   "MPC"   "MNST"  "O"     "KMI"   "LHX"   "F"     "FTNT" 
## [199] "D"     "URI"   "VST"   "VLO"   "EW"    "CMG"   "ROP"   "FAST"  "AIG"  
## [208] "DLR"   "CAH"   "CBRE"  "WDAY"  "AME"   "MET"   "OKE"   "AMP"   "EA"   
## [217] "DAL"   "CTVA"  "GWW"   "TGT"   "EXC"   "BKR"   "DDOG"  "ROK"   "MPWR" 
## [226] "RSG"   "MSCI"  "XEL"   "AXON"  "YUM"   "NDAQ"  "PSA"   "DHI"   "FICO" 
## [235] "CARR"  "TTWO"  "ETR"   "CTSH"  "PRU"   "PEG"   "TRGP"  "DELL"  "HIG"  
## [244] "A"     "KR"    "CCI"   "KDP"   "PAYX"  "EBAY"  "VMC"   "IQV"   "GEHC" 
## [253] "MLM"   "FISV"  "NUE"   "WAB"   "SYY"   "ED"    "STT"   "RMD"   "VTR"  
## [262] "UAL"   "ACGL"  "XYZ"   "OTIS"  "CPRT"  "MCHP"  "FIS"   "ARES"  "PCG"  
## [271] "KMB"   "KEYS"  "EQT"   "WEC"   "EXPE"  "XYL"   "KVUE"  "GRMN"  "MTB"  
## [280] "WTW"   "FITB"  "HPE"   "HUM"   "SYF"   "VRSK"  "CCL"   "DG"    "VICI" 
## [289] "TER"   "OXY"   "FANG"  "NRG"   "ODFL"  "RJF"   "MTD"   "TSCO"  "IR"   
## [298] "EXR"   "HSY"   "ADM"   "IBKR"  "HBAN"  "CSGP"  "FSLR"  "ATO"   "EFX"  
## [307] "DTE"   "DOV"   "AEE"   "BR"    "EME"   "CBOE"  "NTRS"  "AWK"   "OMC"  
## [316] "ULTA"  "CINF"  "EXE"   "GIS"   "AVB"   "EL"    "DXCM"  "PPL"   "TPR"  
## [325] "BIIB"  "CFG"   "CNP"   "ES"    "VLTO"  "STE"   "DLTR"  "RF"    "LEN"  
## [334] "PHM"   "STLD"  "BRO"   "HAL"   "TDY"   "DVN"   "PPG"   "IRM"   "FE"   
## [343] "LDOS"  "JBL"   "HUBB"  "EIX"   "HPQ"   "TROW"  "WSM"   "LULU"  "WAT"  
## [352] "DRI"   "NTAP"  "CPAY"  "ON"    "LYV"   "EQR"   "LUV"   "NVR"   "LH"   
## [361] "CMS"   "KHC"   "CHD"   "PTC"   "WRB"   "SBAC"  "VRSN"  "LVS"   "IP"   
## [370] "PODD"  "EXPD"  "DGX"   "GPN"   "STZ"   "SW"    "CTRA"  "TYL"   "NI"   
## [379] "CHTR"  "WST"   "KEY"   "AMCR"  "IT"    "CHRW"  "CDW"   "TRMB"  "FTV"  
## [388] "PKG"   "ZBH"   "CNC"   "PFG"   "L"     "SNA"   "GPC"   "ROL"   "MKC"  
## [397] "GDDY"  "WY"    "DD"    "ESS"   "TPL"   "EVRG"  "LNT"   "PNR"   "IFF"  
## [406] "APTV"  "TSN"   "TTD"   "HOLX"  "COO"   "DOW"   "INCY"  "MAA"   "ALB"  
## [415] "J"     "GEN"   "TXT"   "LII"   "INVH"  "DECK"  "SMCI"  "FOXA"  "FFIV" 
## [424] "JBHT"  "TKO"   "AVY"   "BBY"   "BALL"  "RL"    "JKHY"  "KIM"   "ALLE" 
## [433] "EG"    "DPZ"   "MAS"   "VTRS"  "IEX"   "ZBRA"  "BG"    "CF"    "HII"  
## [442] "NDSN"  "UHS"   "CLX"   "AKAM"  "AIZ"   "SOLV"  "HST"   "IVZ"   "CPT"  
## [451] "DOC"   "BLDR"  "FDS"   "EPAM"  "LYB"   "REG"   "GL"    "SWK"   "RVTY" 
## [460] "UDR"   "DAY"   "SJM"   "HAS"   "PNW"   "ALGN"  "WYNN"  "MRNA"  "BXP"  
## [469] "NWSA"  "BAX"   "CRL"   "AES"   "NCLH"  "SWKS"  "FOX"   "TECH"  "APA"  
## [478] "MOH"   "CAG"   "PAYC"  "GNRC"  "LW"    "TAP"   "FRT"   "LKQ"   "HSIC" 
## [487] "MTCH"  "POOL"  "AOS"   "MOS"   "MGM"   "ARE"   "BEN"   "ERIE"  "HRL"  
## [496] "MHK"   "CPB"   "BF-B"  "DVA"   "PSKY"  "NWS"

1.6 Subset out stocks with no missing data

missingcount_bydate<-apply(is.na(stocks_tsmatrix),1,sum)
plot(x=stocks_tsmatrix$date, y=missingcount_bydate,type="l")

# On 9/12/2022, 11 stocks had some missing values on whole period from date_start to date_end

# Define stocks_tsmatrix2 to be  stocks with no missing values

missingcount_bystock<-apply(is.na(stocks_tsmatrix),2,sum)
plot(x=c(1:ncol(stocks_tsmatrix)),
          y=missingcount_bystock,type="l")

sum(missingcount_bystock==0)
## [1] 479
# 488 stocks had no missing prices on period

which_cols_nomissing<-which(missingcount_bystock==0)

# Extract data for stocks with no missing prices ----
dim(stocks_tsmatrix)
## [1] 1426  501
stocks_tsmatrix0<-stocks_tsmatrix[,which_cols_nomissing]

1.7 Create stocks_tbl0 corresponding to symbols in stocks_tsmatrix

# names(stocks_tsmatrix0)
dim(stocks_tsmatrix0) # columns equal to 1(date) + (number of stocks)
## [1] 1426  479
which_symbols_tsmatrix0<-match(names(stocks_tsmatrix0)[-1],
                               stocks_tbl0$symbol,nomatch=0)
stocks_tbl0_tsmatrix0<- stocks_tbl0[which_symbols_tsmatrix0,]

dim(stocks_tsmatrix0)
## [1] 1426  479

2. Save Workspaces

2.1 Save R workspace SP000_data_all.RData —-

save(file="SP500_data_all.RData", list=ls())

2.2 Save R workspace SP000_data_subset.RData —-

stocks_prices<-stocks_tsmatrix0

stocks_tbl<-stocks_tbl0_tsmatrix0     


save(file="SP500_data_subset.RData", list=c("stocks_tbl","stocks_prices"))