library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rvest)
## Warning: package 'rvest' was built under R version 4.0.5
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.0.5
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 4.0.5
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Warning: package 'PerformanceAnalytics' was built under R version 4.0.5
## Loading required package: xts
## Warning: package 'xts' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Warning: package 'quantmod' was built under R version 4.0.5
## Loading required package: TTR
## Warning: package 'TTR' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## == Need to Learn tidyquant? ====================================================
## Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(janitor)
## Warning: package 'janitor' was built under R version 4.0.5
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(readr)
SP500_Stocks_long <- read_csv("C:/Windows/System32/SP500-Stocks_long.txt")
## Rows: 829710 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (1): ticker
## dbl  (1): price.adjusted
## date (1): ref.date
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
today <- Sys.Date()
date <-  today %m+% months(-3)
print(date)
## [1] "2021-10-08"
SP500_1 <- tq_get("^GSPC", from = date)
SP500_1 %>% 
    head()
## # A tibble: 6 x 8
##   symbol date        open  high   low close     volume adjusted
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>      <dbl>    <dbl>
## 1 ^GSPC  2021-10-08 4407. 4412. 4386. 4391. 2401890000    4391.
## 2 ^GSPC  2021-10-11 4385. 4416. 4361. 4361. 2580000000    4361.
## 3 ^GSPC  2021-10-12 4368. 4375. 4342. 4351. 2608150000    4351.
## 4 ^GSPC  2021-10-13 4358. 4373. 4330. 4364. 2926460000    4364.
## 5 ^GSPC  2021-10-14 4387. 4440. 4387. 4438. 2642920000    4438.
## 6 ^GSPC  2021-10-15 4448. 4476. 4448. 4471. 3000560000    4471.
 url <- "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
 tickers <- url %>%
     read_html() %>%
     html_nodes(xpath = '//*[@id="constituents"]') %>% 
     html_table()
 sp500tickers <- tickers[[1]]
 sp500tickers = sp500tickers %>% mutate(Symbol = case_when(Symbol == "BRK.B" ~ "BRK-B",
                                                           Symbol == "BF.B" ~ "BF-B",
                                                           TRUE ~ as.character(Symbol)))
 symbol = sp500tickers$Symbol
 get_symbols <-  function(ticker = "AAPL"){
     df = tq_get(ticker, from = date) %>% mutate(symbol = rep(ticker, length(date)))
     return(df)
 }
 tickers_df = map(symbol, get_symbols) %>% bind_rows()
  tickers_df = tickers_df %>% 
     left_join(sp500tickers, by = c('symbol' = 'Symbol')) %>% 
     clean_names() 
 tickers_df %>% 
     head()
## # A tibble: 6 x 16
##   symbol date        open  high   low close volume adjusted security sec_filings
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>  <dbl>    <dbl> <chr>    <chr>      
## 1 MMM    2021-10-08  178.  178.  177.  177. 2.47e6     176. 3M       reports    
## 2 MMM    2021-10-11  178.  179.  176.  176. 2.69e6     175. 3M       reports    
## 3 MMM    2021-10-12  176.  177.  175.  176. 2.16e6     174. 3M       reports    
## 4 MMM    2021-10-13  176.  178.  175.  177. 2.03e6     176. 3M       reports    
## 5 MMM    2021-10-14  178   180.  178.  180. 2.28e6     179. 3M       reports    
## 6 MMM    2021-10-15  181.  183   181.  182. 2.16e6     180. 3M       reports    
## # ... with 6 more variables: gics_sector <chr>, gics_sub_industry <chr>,
## #   headquarters_location <chr>, date_first_added <chr>, cik <int>,
## #   founded <chr>
 #1 Based on the data of SP500, create plot of 20 companies with the highest average returns in the last 3 months of the data range.
 daily_sector = tickers_df %>% group_by(security, gics_sector, symbol) %>% 
     tq_transmute(select     = adjusted, 
                  mutate_fun = periodReturn, 
                  period     = "daily") %>% 
     ungroup()
 avg_return =daily_sector %>% 
     group_by(security, gics_sector) %>%
     summarise(avg_return = round(mean(daily.returns), 4),Volatility =   sd(daily.returns)) %>%
     arrange(desc(avg_return), desc(Volatility))
## `summarise()` has grouped output by 'security'. You can override using the `.groups` argument.
 avg_return %>% head()
## # A tibble: 6 x 4
## # Groups:   security [6]
##   security        gics_sector            avg_return Volatility
##   <chr>           <chr>                       <dbl>      <dbl>
## 1 Ford            Consumer Discretionary     0.0081     0.0309
## 2 Teradyne        Information Technology     0.0061     0.0246
## 3 Qualcomm        Information Technology     0.0059     0.0263
## 4 Dollar Tree     Consumer Discretionary     0.0059     0.0255
## 5 Arista Networks Information Technology     0.0057     0.0328
## 6 HP              Information Technology     0.0055     0.0221
  avg_return %>% head(20) %>% ggplot(aes(reorder(security, -avg_return), avg_return, fill = avg_return))+
     geom_col()+
     coord_flip()+
     labs(title = "20 companies in SP500 with the highest average returns in the last 3 months", x = "Company", y = "Average Return")+
     theme_classic()+
     theme(legend.position="none")

#2 Based on the data of SP500, create plot of average returns vs. volatility in the last 3 months of the data range.  Also highlight 3 highest volatility companies in the plot as well. 
 plot <- avg_return %>% ggplot(aes(avg_return, Volatility))+
     geom_text(aes(label = symbol), size = 3)+
     labs(title = "Average Return vs Volatility Over Last 3 Months In SP500", x = "Average Return", subtitle = "Data Source: Yahoo Finance")+ theme_minimal()
 plot