#for scraping
library(rvest)
#blanket import for core tidyverse packages
library(tidyverse)
#tidy financial analysis 
library(tidyquant)
#tidy data cleaning functions
library(janitor)

# NOTE: the former `rm(list = ls())` call was removed on purpose. It wiped every
# object in the caller's workspace when the script was sourced, while giving no
# real isolation (attached packages and options are left untouched).

# Local copy of the long-format S&P 500 data set.
# NOTE(review): this path is machine-specific and the object is not referenced
# anywhere in the code visible below -- confirm whether it is still needed.
SP500.Stocks_long <- read.csv(
  "C:/Users/HP/Downloads/SP500-Stocks_long.txt/SP500-Stocks_long.txt"
)

# Start of the analysis window: three calendar months before today.
# `%m+%` is used so the month arithmetic never lands on a non-existent day.
today <- Sys.Date()
date <- today %m+% months(-3)
print(date)

# Index-level prices for the S&P 500 (^GSPC) over the same window.
SP500_1 <- tq_get("^GSPC", from = date)
head(SP500_1)

# Scrape the S&P 500 constituents table (element id "constituents") from
# Wikipedia.
url <- "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
tickers <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="constituents"]') %>%
  html_table()

# Fail with a readable message instead of "subscript out of bounds" if the
# page layout changes and the table can no longer be located.
if (length(tickers) == 0) {
  stop(
    "No element with id 'constituents' was found at ", url,
    call. = FALSE
  )
}
sp500tickers <- tickers[[1]]

# The price source expects a dash where Wikipedia writes a dot in share-class
# tickers (e.g. "BRK.B" -> "BRK-B", "BF.B" -> "BF-B"). Replacing every literal
# dot covers those two and any dotted ticker that joins the index later.
sp500tickers <- sp500tickers %>%
  mutate(Symbol = gsub(".", "-", as.character(Symbol), fixed = TRUE))

# Plain character vector of tickers used to drive the price download.
symbol <- sp500tickers$Symbol
# Download the price history of one ticker and tag every row with the ticker.
#
# Args:
#   ticker: a single ticker symbol understood by tq_get().
#   from:   start date of the download; defaults to the script-level `date`.
#
# Returns:
#   The data frame returned by tq_get() with a `symbol` column set to `ticker`,
#   or NULL (with a warning) when tq_get() does not return a data frame, so
#   that one failed download cannot abort a whole map() over many tickers.
#   bind_rows() silently skips NULL elements.
get_symbols <- function(ticker = "AAPL", from = date) {
  prices <- tq_get(ticker, from = from)

  if (!is.data.frame(prices)) {
    warning("No price data returned for ticker '", ticker, "'", call. = FALSE)
    return(NULL)
  }

  # A length-one value is recycled to every row; no explicit rep() is needed.
  mutate(prices, symbol = ticker)
}
# Fetch prices for every ticker in `symbol` and stack the results row-wise.
tickers_df <- bind_rows(map(symbol, get_symbols))

# Attach the constituents metadata (prices `symbol` matches table `Symbol`),
# then let janitor normalise all column names.
tickers_df <- clean_names(
  left_join(tickers_df, sp500tickers, by = c("symbol" = "Symbol"))
)

head(tickers_df)
#Solution 1.
# Daily returns of the adjusted price, computed separately for every stock.
# tq_transmute() keeps the grouping columns and adds `daily.returns`.
daily_sector <- tickers_df %>%
  group_by(security, gics_sector, symbol) %>%
  tq_transmute(
    select     = adjusted,
    mutate_fun = periodReturn,
    period     = "daily"
  ) %>%
  ungroup()

# Per-stock mean daily return (rounded to 4 decimals) and volatility (standard
# deviation of daily returns), best performers first.
# `symbol` is kept in the grouping so that it survives summarise(): the plots
# further down label points with `symbol`, and without the column that name
# would resolve to the unrelated global `symbol` vector instead.
# `.groups = "drop"` returns an ungrouped table so later verbs act on all rows.
avg_return <- daily_sector %>%
  group_by(security, gics_sector, symbol) %>%
  summarise(
    avg_return = round(mean(daily.returns), 4),
    Volatility = sd(daily.returns),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_return), desc(Volatility))

avg_return %>% head()

# Horizontal bar chart of the first 20 rows of `avg_return`; bars are ordered
# by descending average return and shaded by the same value.
ggplot(
  head(avg_return, 20),
  aes(x = reorder(security, -avg_return), y = avg_return, fill = avg_return)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "20 Companies with the Highest Average Returns in the S&P500 During the Last 3 Months",
    x = "Company",
    y = "Average Return"
  ) +
  theme_classic() +
  theme(legend.position = "none")
#Solution 2.
# Scatter of average return (x) against volatility (y), drawn as text labels.
# NOTE(review): `symbol` has to be a column of `avg_return`; if it is not,
# aes() falls back to whatever `symbol` is in the calling environment.
plot <- ggplot(avg_return, aes(x = avg_return, y = Volatility)) +
  geom_text(aes(label = symbol), size = 3) +
  labs(
    title = "Average Return vs. Volatility Over Last 3 Months In S&P500",
    x = "Average Return",
    subtitle = "Data Source: Yahoo Finance"
  ) +
  theme_minimal()
plot
#At first glance, what stands out most in the scatter plot above is the high volatility of XYL, PYG and AMD, so I am going to highlight them.

#First attempt:
# Flag the three most volatile firms so they can be coloured differently.
# The set is derived from the data rather than hard-coded because the analysis
# window moves with Sys.Date(), so a fixed ticker list goes stale.
# ungroup() makes the ranking run over the whole table, not within groups.
# Rows with a missing Volatility never match the first condition and fall
# through to the default label.
avg_return <- avg_return %>%
  ungroup() %>%
  mutate(
    Indicator = case_when(
      min_rank(desc(Volatility)) <= 3 ~ "Top 3 Highest Firms",
      TRUE ~ "The Rest of the S&P500"
    )
  )

# Same scatter as before, with label colour driven by the Indicator flag.
plot <- avg_return %>%
  ggplot(aes(avg_return, Volatility, color = Indicator)) +
  geom_text(aes(label = symbol), size = 3) +
  labs(
    title = "Average Return vs Volatility Over Last 3 Months In SP500",
    x = "Average Return",
    subtitle = "Data Source: Yahoo Finance"
  ) +
  theme_minimal()
plot

#Second attempt: filter the data frame to get the rows to be highlighted
# Rows to emphasise: volatility above 0.04 and average return of at least
# -0.003.
highlight_df <- avg_return %>%
  filter(Volatility > 0.04, avg_return >= -0.003)

# Draw every firm as a muted point first, then overlay the highlighted firms
# as red labels. Without the base layer only `highlight_df` would be plotted
# and there would be nothing for the highlighted firms to stand out against.
plot <- avg_return %>%
  ggplot(aes(avg_return, Volatility)) +
  geom_point(color = "grey70", alpha = 0.6) +
  geom_text(
    data = highlight_df,
    aes(label = symbol),
    color = "red",
    size = 3
  ) +
  labs(
    title = "Average Return vs. Volatility Over Last 3 Months In S&P500",
    x = "Average Return",
    subtitle = "Data Source: Yahoo Finance"
  ) +
  theme_minimal()
plot