library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gapminder)
library(ggthemes)
library(ggplot2)
library(dplyr)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(broom)
data <- read.csv("C:/Users/aiden/OneDrive/mergedfile.csv")
group_by_sector <- data |>
  group_by(Sector) |>
  summarise(mean_marketcap = mean(Marketcap, na.rm = TRUE))

# Visualizing: Marketcap by Sector
ggplot(group_by_sector, aes(x = Sector, y = mean_marketcap)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Average Market Cap by Sector")

# Set the seed for reproducibility
set.seed(123)

# Function to generate a random subsample
generate_subsample <- function(data, sample_size) {
  data |> sample_n(size = sample_size, replace = TRUE)
}

# Create 5 random subsamples, each 50% of the original data size
sample_size <- nrow(data) * 0.5
subsamples <- 1:5 |> map(~ generate_subsample(data, sample_size))

# Assign the subsamples to separate data frames
df_1 <- subsamples[[1]]
df_2 <- subsamples[[2]]
df_3 <- subsamples[[3]]
df_4 <- subsamples[[4]]
df_5 <- subsamples[[5]]
# Function to summarize each subsample by Sector and Industry
summarize_by_sector_industry <- function(df) {
  df |>
    group_by(Sector, Industry) |>
    summarize(
      mean_adj_close = mean(Adj.Close, na.rm = TRUE),
      mean_marketcap = mean(Marketcap, na.rm = TRUE),
      mean_current_price = mean(Currentprice, na.rm = TRUE),
      mean_ebitda = mean(Ebitda, na.rm = TRUE),
      mean_revenue_growth = mean(Revenuegrowth, na.rm = TRUE),
      .groups = 'drop'
    )
}

# Apply the summary function to each subsample
summaries <- subsamples |> map(summarize_by_sector_industry)
# View the summary of the subsamples
print(summaries[[1]])
## # A tibble: 56 × 7
##    Sector  Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
##    <chr>   <chr>             <dbl>          <dbl>              <dbl>       <dbl>
##  1 Basic … Special…          122.         3.61e10              183.      1.69e 9
##  2 Commun… Interne…           59.8        2.05e12              167.      1.15e11
##  3 Commun… Telecom…           13.9        1.41e11               19.7     4.21e10
##  4 Consum… Auto & …           64.8        1.34e10               85.8     9.63e 8
##  5 Consum… Auto Pa…           52.4        1.22e10               52.5     2.40e 9
##  6 Consum… Interne…           66.8        1.86e12              177.      1.04e11
##  7 Consum… Packagi…           50.5        1.78e10              101.      1.79e 9
##  8 Consum… Resorts…           40.6        8.28e 9               38.3     3.71e 9
##  9 Consum… Special…          365.         2.69e10             1087.      2.83e 9
## 10 Consum… Travel …          711.         7.39e10             1265.      4.55e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
data$Date <- as.Date(data$Date)

# Calculate the average Adjusted Close price across all companies for each date
average_price <- data |>
  group_by(Date) |>
  summarize(avg_price = mean(Adj.Close, na.rm = TRUE))

# Plot the average price over time as a line graph
ggplot(average_price, aes(x = Date, y = avg_price)) +
  geom_line(color = "blue", linewidth = 1) +  # Traditional line graph
  labs(title = "Average Price of S&P 500 Companies Over Time",
       x = "Date", y = "Average Adjusted Close Price") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

# Calculate the percentage change from the first to the last date for each company
percent_change <- data |>
  group_by(Symbol, Sector) |>
  arrange(Date) |>
  summarize(first_price = first(Adj.Close),
            last_price = last(Adj.Close)) |>
  mutate(percent_change = ((last_price - first_price) / first_price) * 100)
## `summarise()` has grouped output by 'Symbol'. You can override using the
## `.groups` argument.
# Calculate the average percent change for each sector
average_percent_change <- percent_change |>
  group_by(Sector) |>
  summarize(avg_percent_change = mean(percent_change, na.rm = TRUE)) |>
  arrange(desc(avg_percent_change))  # Arrange by highest-performing sector

# Print the table of average percent change for each sector
print(average_percent_change)
## # A tibble: 11 × 2
##    Sector                 avg_percent_change
##    <chr>                               <dbl>
##  1 Technology                       2257.   
##  2 Industrials                      1822.   
##  3 Financial Services                843.   
##  4 Consumer Cyclical                 825.   
##  5 Communication Services            701.   
##  6 Healthcare                        466.   
##  7 Utilities                         463.   
##  8 Real Estate                       394.   
##  9 Basic Materials                   303.   
## 10 Consumer Defensive                255.   
## 11 Energy                             -0.242
# Filter data for the Technology and Consumer Defensive sectors
tech_data <- data |>
  filter(Sector == "Technology")

consumer_defensive_data <- data |>
  filter(Sector == "Consumer Defensive")

# Calculate the average Adjusted Close price for the tech sector for each date
average_tech_price <- tech_data |>
  group_by(Date) |>
  summarize(avg_price = mean(Adj.Close, na.rm = TRUE)) |>
  mutate(Sector = "Technology")

# Calculate the average Adjusted Close price for the consumer defensive sector for each date
average_consumer_defensive_price <- consumer_defensive_data |>
  group_by(Date) |>
  summarize(avg_price = mean(Adj.Close, na.rm = TRUE)) |>
  mutate(Sector = "Consumer Defensive")

# Combine the two datasets
combined_data <- bind_rows(average_tech_price, average_consumer_defensive_price)

# Plot the average adjusted close price over time for both sectors
ggplot(combined_data, aes(x = Date, y = avg_price, color = Sector)) +
  geom_line(linewidth = 1) +  # Line graph for both sectors
  labs(title = "Average Adjusted Close Price: Technology vs Consumer Defensive",
       x = "Date", y = "Average Adjusted Close Price") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

# Filter data for Boeing (BA)
boeing_data <- data |> 
  filter(Symbol == "BA")

# Create a mock dataset for negative news headlines about Boeing
negative_news <- data.frame(
  Date = as.Date(c("2019-03-10", "2019-12-18", "2020-01-06", "2020-11-09")),
  News = c("737 MAX crash in Ethiopia",
           "737 MAX grounded worldwide",
           "Investigations into 737 MAX continue",
           "FAA approves return of 737 MAX")
)

# Join the stock data with the negative news
boeing_data <- boeing_data |>
  left_join(negative_news, by = "Date")

# Plotting
ggplot(boeing_data, aes(x = Date, y = Adj.Close)) +
  geom_line(color = "blue", linewidth = 1) +  # Line for stock price
  geom_point(data = boeing_data[!is.na(boeing_data$News), ], aes(x = Date, y = Adj.Close), 
             color = "red", size = 3, shape = 21, fill = "red") +  # Points for negative news
  labs(title = "Boeing (BA) Stock Price and Negative News Correlation",
       x = "Date", y = "Adjusted Close Price") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # Rotate x-axis labels
  geom_text(data = boeing_data[!is.na(boeing_data$News), ], aes(label = News), 
            vjust = -1, hjust = 1, size = 3, color = "red")  # Add text for news

# Visualization: Boxplot of Revenue Growth by Sector
ggplot(data, aes(x = Sector, y = Revenuegrowth, fill = Sector)) +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x=element_blank())+
  labs(title = "Revenue Growth by Sector", y = "Revenue Growth", x = "Sector")

# Select your response variable and explanatory variable
response_var <- data$Adj.Close
explanatory_var <- data$Sector

# Fit an ANOVA model
anova_model <- aov(response_var ~ explanatory_var, data = data)

# Print ANOVA summary
summary(anova_model)
##                     Df    Sum Sq   Mean Sq F value Pr(>F)    
## explanatory_var     10 1.222e+09 122160896    2406 <2e-16 ***
## Residuals       339524 1.724e+10     50766                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 12562 observations deleted due to missingness
# If there are more than 10 categories in 'Sector', you can consolidate them manually or using a threshold
data$Sector <- fct_lump_n(data$Sector, n = 10)

# Re-run the ANOVA with the consolidated categories if necessary
anova_model_consolidated <- aov(response_var ~ explanatory_var, data = data)
summary(anova_model_consolidated)
##                     Df    Sum Sq   Mean Sq F value Pr(>F)    
## explanatory_var     10 1.222e+09 122160896    2406 <2e-16 ***
## Residuals       339524 1.724e+10     50766                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 12562 observations deleted due to missingness

ANOVA Results:

After running the ANOVA, if the p-value is below 0.05, this suggests that the stock prices differ significantly across sectors. You can explain this finding in terms of how sectors may impact stock prices differently, which could guide investment decisions.

# Select another continuous variable for linear regression, e.g., Revenue Growth
linear_model <- lm(Adj.Close ~ Revenuegrowth, data = data)

# Print model summary
summary(linear_model)
## 
## Call:
## lm(formula = Adj.Close ~ Revenuegrowth, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -111.6  -78.1  -53.0   -2.9 4012.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   104.7525     0.4273 245.172  < 2e-16 ***
## Revenuegrowth  22.8919     3.5845   6.386  1.7e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 233.1 on 339533 degrees of freedom
##   (12562 observations deleted due to missingness)
## Multiple R-squared:  0.0001201,  Adjusted R-squared:  0.0001172 
## F-statistic: 40.79 on 1 and 339533 DF,  p-value: 1.701e-10
# Print regression coefficients and model fit statistics
tidy(linear_model)  # Coefficients
## # A tibble: 2 × 5
##   term          estimate std.error statistic  p.value
##   <chr>            <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)      105.      0.427    245.   0       
## 2 Revenuegrowth     22.9     3.58       6.39 1.70e-10
glance(linear_model)  # Model summary (R-squared, etc.)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df    logLik    AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>     <dbl>  <dbl>  <dbl>
## 1  0.000120      0.000117  233.      40.8 1.70e-10     1 -2332807. 4.67e6 4.67e6
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Visualize the regression line
ggplot(data, aes(x = Revenuegrowth, y = Adj.Close)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() +
  labs(title = "Linear Relationship Between Revenue Growth and Adjusted Close",
       x = "Revenue Growth", y = "Adjusted Closing Price")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 12562 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 12562 rows containing missing values or values outside the scale range
## (`geom_point()`).

Regression Analysis:

The linear regression model shows how Revenuegrowth affects Adj.Close. For example, a positive coefficient would suggest that companies with higher revenue growth tend to have higher stock prices

# Ensure the data used for fitting the model and for prediction are the same
# Regression Analysis
reg_model <- lm(Close ~ Volume + Sector, data = data)

# Add Predicted_Close to the original data frame
data <- data |>
  mutate(Predicted_Close = predict(reg_model, newdata = data))

# Extract Year from Date (assuming Date is in "yyyy-mm-dd" format)
data <- data |>
  mutate(Year = as.numeric(format(as.Date(Date), "%Y")))

# Visualization: Actual vs Predicted Close Prices
ggplot(data, aes(x = Close, y = Predicted_Close, color = Sector)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  theme_minimal() +
  labs(title = "Actual vs Predicted Close Prices", x = "Actual Close Price", y = "Predicted Close Price")
## Warning: Removed 12562 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Advanced EDA: Trends over Time
# Average Close Price over Time by Sector
trend_data <- data |>
  group_by(Year, Sector) |>
  summarize(Avg_Close = mean(Close, na.rm = TRUE), .groups = "drop")

ggplot(trend_data, aes(x = Year, y = Avg_Close, color = Sector)) +
  geom_line() +
  theme_minimal() +
  labs(title = "Average Close Price Over Time by Sector", x = "Year", y = "Average Close Price")

# Data Preprocessing
data <- data |>
  mutate(
    Date = as.Date(Date, format = "%Y-%m-%d"),
    Year = year(Date),
    Month = month(Date, label = TRUE)
  ) |>
  filter(!is.na(Sector))  # Remove rows with missing sector information

# Initial Exploratory Data Analysis (EDA)
# Summary statistics
summary_stats <- data |>
  group_by(Sector) |>
  summarize(
    Avg_Close = mean(Close, na.rm = TRUE),
    Median_Close = median(Close, na.rm = TRUE),
    Total_Volume = sum(Volume, na.rm = TRUE)
  )

# Visualization: Average Close Price by Sector
ggplot(summary_stats, aes(x = reorder(Sector, -Avg_Close), y = Avg_Close, fill = Sector)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(axis.text.x=element_blank())+
  labs(title = "Average Close Price by Sector", x = "Sector", y = "Average Close Price")

# Hypothesis Testing
# Compare sector performance using ANOVA
anova_results <- aov(Close ~ Sector, data = data)
anova_summary <- summary(anova_results)

# Post-hoc test to see which sectors differ
posthoc <- TukeyHSD(anova_results)

# Visualization: Sector Performance Distribution
ggplot(data, aes(x = Sector, y = Close, fill = Sector)) +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x=element_blank())+
  labs(title = "Sector Performance Distribution", x = "Sector", y = "Close Price")
## Warning: Removed 12562 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Filter data for the Technology sector and select relevant columns
tech_data <- data |>
  filter(Sector == "Technology") |>
  select(Close, Volume, Marketcap, Revenuegrowth) |>
  drop_na()

# Transform variables to handle outliers
tech_data <- tech_data |>
  mutate(
    Log_Close = log(Close),
    Log_Volume = log(Volume + 1),  # Add 1 to avoid log(0)
    Log_Marketcap = log(Marketcap + 1),  # Add 1 to avoid log(0)
    Log_Revenuegrowth = log(Revenuegrowth + 1)  # Add 1 to avoid log(0)
  )

# Build the initial regression model
initial_model <- lm(Log_Close ~ Log_Volume + Log_Marketcap + Log_Revenuegrowth, data = tech_data)

# Calculate Cook's Distance to identify outliers
cooks_dist <- cooks.distance(initial_model)

# Define a threshold for influential points (commonly 4/n)
threshold <- 4 / nrow(tech_data)

# Filter out observations with Cook's Distance greater than the threshold
tech_data_cleaned <- tech_data[cooks_dist <= threshold, ]

# Refit the regression model without outliers
cleaned_model <- lm(Log_Close ~ Log_Volume + Log_Marketcap + Log_Revenuegrowth, data = tech_data_cleaned)

# Display the summary of the cleaned regression model
summary(cleaned_model)
## 
## Call:
## lm(formula = Log_Close ~ Log_Volume + Log_Marketcap + Log_Revenuegrowth, 
##     data = tech_data_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4492 -0.5796  0.0251  0.6708  2.7940 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.742116   0.089273   19.51   <2e-16 ***
## Log_Volume        -0.630302   0.004351 -144.86   <2e-16 ***
## Log_Marketcap      0.466472   0.005298   88.04   <2e-16 ***
## Log_Revenuegrowth -1.045889   0.036747  -28.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8952 on 52726 degrees of freedom
## Multiple R-squared:  0.3242, Adjusted R-squared:  0.3242 
## F-statistic:  8431 on 3 and 52726 DF,  p-value: < 2.2e-16
# Backtesting: Add predictions to the cleaned dataset
tech_data_cleaned <- tech_data_cleaned |>
  mutate(Predicted_Log_Close = predict(cleaned_model),
         Predicted_Close = exp(Predicted_Log_Close))  # Reverse the log transformation

# Visualize actual vs predicted values
ggplot(tech_data_cleaned, aes(x = Close, y = Predicted_Close)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  theme_minimal() +
  labs(title = "Actual vs Predicted Close Prices for Technology Sector (Outliers Removed)",
       x = "Actual Close Price",
       y = "Predicted Close Price")

# Check for multicollinearity using VIF
vif_values <- vif(cleaned_model)
print(vif_values)
##        Log_Volume     Log_Marketcap Log_Revenuegrowth 
##          3.096886          3.102245          1.006337
# Coefficients and p-values
tidy(cleaned_model)
## # A tibble: 4 × 5
##   term              estimate std.error statistic   p.value
##   <chr>                <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)          1.74    0.0893       19.5 1.65e- 84
## 2 Log_Volume          -0.630   0.00435    -145.  0        
## 3 Log_Marketcap        0.466   0.00530      88.0 0        
## 4 Log_Revenuegrowth   -1.05    0.0367      -28.5 7.67e-177
# Filter data for the Technology sector and select relevant columns
tech_data <- data |>
  filter(Sector == "Technology") |>
  select(Close, Volume, Marketcap, Revenuegrowth) |>
  drop_na()

# Build the initial regression model without log transformation
initial_model <- lm(Close ~ Volume + Marketcap + Revenuegrowth, data = tech_data)

# Calculate Cook's Distance to identify outliers
cooks_dist <- cooks.distance(initial_model)

# Define a threshold for influential points (commonly 4/n)
threshold <- 4 / nrow(tech_data)

# Filter out observations with Cook's Distance greater than the threshold
tech_data_cleaned <- tech_data |>
  filter(cooks_dist <= threshold)

# Refit the regression model without outliers
cleaned_model <- lm(Close ~ Volume + Marketcap + Revenuegrowth, data = tech_data_cleaned)

# Display the summary of the cleaned regression model
summary(cleaned_model)
## 
## Call:
## lm(formula = Close ~ Volume + Marketcap + Revenuegrowth, data = tech_data_cleaned)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -85.64 -57.64 -26.81  34.12 368.21 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.297e+01  4.417e-01 210.479   <2e-16 ***
## Volume        -2.264e-07  8.357e-09 -27.085   <2e-16 ***
## Marketcap      1.199e-12  6.739e-13   1.779   0.0752 .  
## Revenuegrowth -5.569e+01  3.326e+00 -16.743   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79.49 on 53659 degrees of freedom
## Multiple R-squared:  0.03403,    Adjusted R-squared:  0.03397 
## F-statistic: 630.1 on 3 and 53659 DF,  p-value: < 2.2e-16
# Backtesting: Add predictions to the cleaned dataset
tech_data_cleaned <- tech_data_cleaned |>
  mutate(Predicted_Close = predict(cleaned_model))

# Visualize actual vs predicted values
ggplot(tech_data_cleaned, aes(x = Close, y = Predicted_Close)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  theme_minimal() +
  labs(title = "Actual vs Predicted Close Prices for Technology Sector (Outliers Removed)",
       x = "Actual Close Price",
       y = "Predicted Close Price")

# Check for multicollinearity using VIF
vif_values <- car::vif(cleaned_model)
print(vif_values)
##        Volume     Marketcap Revenuegrowth 
##      2.443724      2.443728      1.000061
# Coefficients and p-values
broom::tidy(cleaned_model)
## # A tibble: 4 × 5
##   term           estimate std.error statistic   p.value
##   <chr>             <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)    9.30e+ 1  4.42e- 1    210.   0        
## 2 Volume        -2.26e- 7  8.36e- 9    -27.1  1.78e-160
## 3 Marketcap      1.20e-12  6.74e-13      1.78 7.52e-  2
## 4 Revenuegrowth -5.57e+ 1  3.33e+ 0    -16.7  9.27e- 63