library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gapminder)
library(ggthemes)
library(ggplot2)
library(dplyr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(broom)
data <- read.csv("C:/Users/aiden/OneDrive/mergedfile.csv")
group_by_sector <- data |>
group_by(Sector) |>
summarise(mean_marketcap = mean(Marketcap, na.rm = TRUE))
# Visualizing: Marketcap by Sector
ggplot(group_by_sector, aes(x = Sector, y = mean_marketcap)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Average Market Cap by Sector")
# Set the seed for reproducibility
set.seed(123)
# Function to generate a random subsample
generate_subsample <- function(data, sample_size) {
data |> sample_n(size = sample_size, replace = TRUE)
}
# Create 5 random subsamples, each 50% of the original data size
sample_size <- nrow(data) * 0.5
subsamples <- 1:5 |> map(~ generate_subsample(data, sample_size))
# Assign the subsamples to separate data frames
df_1 <- subsamples[[1]]
df_2 <- subsamples[[2]]
df_3 <- subsamples[[3]]
df_4 <- subsamples[[4]]
df_5 <- subsamples[[5]]
# Function to summarize each subsample by Sector and Industry
summarize_by_sector_industry <- function(df) {
df |>
group_by(Sector, Industry) |>
summarize(
mean_adj_close = mean(Adj.Close, na.rm = TRUE),
mean_marketcap = mean(Marketcap, na.rm = TRUE),
mean_current_price = mean(Currentprice, na.rm = TRUE),
mean_ebitda = mean(Ebitda, na.rm = TRUE),
mean_revenue_growth = mean(Revenuegrowth, na.rm = TRUE),
.groups = 'drop'
)
}
# Apply the summary function to each subsample
summaries <- subsamples |> map(summarize_by_sector_industry)
# View the summary of the subsamples
print(summaries[[1]])
## # A tibble: 56 × 7
## Sector Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Basic … Special… 122. 3.61e10 183. 1.69e 9
## 2 Commun… Interne… 59.8 2.05e12 167. 1.15e11
## 3 Commun… Telecom… 13.9 1.41e11 19.7 4.21e10
## 4 Consum… Auto & … 64.8 1.34e10 85.8 9.63e 8
## 5 Consum… Auto Pa… 52.4 1.22e10 52.5 2.40e 9
## 6 Consum… Interne… 66.8 1.86e12 177. 1.04e11
## 7 Consum… Packagi… 50.5 1.78e10 101. 1.79e 9
## 8 Consum… Resorts… 40.6 8.28e 9 38.3 3.71e 9
## 9 Consum… Special… 365. 2.69e10 1087. 2.83e 9
## 10 Consum… Travel … 711. 7.39e10 1265. 4.55e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
data$Date <- as.Date(data$Date)
# Calculate the average Adjusted Close price across all companies for each date
average_price <- data |>
group_by(Date) |>
summarize(avg_price = mean(Adj.Close, na.rm = TRUE))
# Plot the average price over time as a line graph
ggplot(average_price, aes(x = Date, y = avg_price)) +
geom_line(color = "blue", linewidth = 1) + # Traditional line graph
labs(title = "Average Price of S&P 500 Companies Over Time",
x = "Date", y = "Average Adjusted Close Price") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
# Calculate the percentage change from the first to the last date for each company
percent_change <- data |>
group_by(Symbol, Sector) |>
arrange(Date) |>
summarize(first_price = first(Adj.Close),
last_price = last(Adj.Close)) |>
mutate(percent_change = ((last_price - first_price) / first_price) * 100)
## `summarise()` has grouped output by 'Symbol'. You can override using the
## `.groups` argument.
# Calculate the average percent change for each sector
average_percent_change <- percent_change |>
group_by(Sector) |>
summarize(avg_percent_change = mean(percent_change, na.rm = TRUE)) |>
arrange(desc(avg_percent_change)) # Arrange by highest-performing sector
# Print the table of average percent change for each sector
print(average_percent_change)
## # A tibble: 11 × 2
## Sector avg_percent_change
## <chr> <dbl>
## 1 Technology 2257.
## 2 Industrials 1822.
## 3 Financial Services 843.
## 4 Consumer Cyclical 825.
## 5 Communication Services 701.
## 6 Healthcare 466.
## 7 Utilities 463.
## 8 Real Estate 394.
## 9 Basic Materials 303.
## 10 Consumer Defensive 255.
## 11 Energy -0.242
# Filter data for the Technology and Consumer Defensive sectors
tech_data <- data |>
filter(Sector == "Technology")
consumer_defensive_data <- data |>
filter(Sector == "Consumer Defensive")
# Calculate the average Adjusted Close price for the tech sector for each date
average_tech_price <- tech_data |>
group_by(Date) |>
summarize(avg_price = mean(Adj.Close, na.rm = TRUE)) |>
mutate(Sector = "Technology")
# Calculate the average Adjusted Close price for the consumer defensive sector for each date
average_consumer_defensive_price <- consumer_defensive_data |>
group_by(Date) |>
summarize(avg_price = mean(Adj.Close, na.rm = TRUE)) |>
mutate(Sector = "Consumer Defensive")
# Combine the two datasets
combined_data <- bind_rows(average_tech_price, average_consumer_defensive_price)
# Plot the average adjusted close price over time for both sectors
ggplot(combined_data, aes(x = Date, y = avg_price, color = Sector)) +
geom_line(linewidth = 1) + # Line graph for both sectors
labs(title = "Average Adjusted Close Price: Technology vs Consumer Defensive",
x = "Date", y = "Average Adjusted Close Price") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
# Filter data for Boeing (BA)
boeing_data <- data |>
filter(Symbol == "BA")
# Create a mock dataset for negative news headlines about Boeing
negative_news <- data.frame(
Date = as.Date(c("2019-03-10", "2019-12-18", "2020-01-06", "2020-11-09")),
News = c("737 MAX crash in Ethiopia",
"737 MAX grounded worldwide",
"Investigations into 737 MAX continue",
"FAA approves return of 737 MAX")
)
# Join the stock data with the negative news
boeing_data <- boeing_data |>
left_join(negative_news, by = "Date")
# Plotting
ggplot(boeing_data, aes(x = Date, y = Adj.Close)) +
geom_line(color = "blue", linewidth = 1) + # Line for stock price
geom_point(data = boeing_data[!is.na(boeing_data$News), ], aes(x = Date, y = Adj.Close),
color = "red", size = 3, shape = 21, fill = "red") + # Points for negative news
labs(title = "Boeing (BA) Stock Price and Negative News Correlation",
x = "Date", y = "Adjusted Close Price") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Rotate x-axis labels
geom_text(data = boeing_data[!is.na(boeing_data$News), ], aes(label = News),
vjust = -1, hjust = 1, size = 3, color = "red") # Add text for news
# Visualization: Boxplot of Revenue Growth by Sector
ggplot(data, aes(x = Sector, y = Revenuegrowth, fill = Sector)) +
geom_boxplot() +
theme_minimal() +
theme(axis.text.x=element_blank())+
labs(title = "Revenue Growth by Sector", y = "Revenue Growth", x = "Sector")
# Select your response variable and explanatory variable
response_var <- data$Adj.Close
explanatory_var <- data$Sector
# Fit an ANOVA model
anova_model <- aov(response_var ~ explanatory_var, data = data)
# Print ANOVA summary
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## explanatory_var 10 1.222e+09 122160896 2406 <2e-16 ***
## Residuals 339524 1.724e+10 50766
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 12562 observations deleted due to missingness
# If there are more than 10 categories in 'Sector', you can consolidate them manually or using a threshold
data$Sector <- fct_lump_n(data$Sector, n = 10)
# Re-run the ANOVA with the consolidated categories if necessary
anova_model_consolidated <- aov(response_var ~ explanatory_var, data = data)
summary(anova_model_consolidated)
## Df Sum Sq Mean Sq F value Pr(>F)
## explanatory_var 10 1.222e+09 122160896 2406 <2e-16 ***
## Residuals 339524 1.724e+10 50766
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 12562 observations deleted due to missingness
ANOVA Results:
After running the ANOVA, if the p-value is below 0.05, this suggests that the stock prices differ significantly across sectors. You can explain this finding in terms of how sectors may impact stock prices differently, which could guide investment decisions.
# Select another continuous variable for linear regression, e.g., Revenue Growth
linear_model <- lm(Adj.Close ~ Revenuegrowth, data = data)
# Print model summary
summary(linear_model)
##
## Call:
## lm(formula = Adj.Close ~ Revenuegrowth, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -111.6 -78.1 -53.0 -2.9 4012.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104.7525 0.4273 245.172 < 2e-16 ***
## Revenuegrowth 22.8919 3.5845 6.386 1.7e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 233.1 on 339533 degrees of freedom
## (12562 observations deleted due to missingness)
## Multiple R-squared: 0.0001201, Adjusted R-squared: 0.0001172
## F-statistic: 40.79 on 1 and 339533 DF, p-value: 1.701e-10
# Print regression coefficients and model fit statistics
tidy(linear_model) # Coefficients
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 105. 0.427 245. 0
## 2 Revenuegrowth 22.9 3.58 6.39 1.70e-10
glance(linear_model) # Model summary (R-squared, etc.)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000120 0.000117 233. 40.8 1.70e-10 1 -2332807. 4.67e6 4.67e6
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Visualize the regression line
ggplot(data, aes(x = Revenuegrowth, y = Adj.Close)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(title = "Linear Relationship Between Revenue Growth and Adjusted Close",
x = "Revenue Growth", y = "Adjusted Closing Price")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 12562 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 12562 rows containing missing values or values outside the scale range
## (`geom_point()`).
Regression Analysis:
The linear regression model shows how Revenuegrowth affects Adj.Close. For example, a positive coefficient would suggest that companies with higher revenue growth tend to have higher stock prices
# Ensure the data used for fitting the model and for prediction are the same
# Regression Analysis
reg_model <- lm(Close ~ Volume + Sector, data = data)
# Add Predicted_Close to the original data frame
data <- data |>
mutate(Predicted_Close = predict(reg_model, newdata = data))
# Extract Year from Date (assuming Date is in "yyyy-mm-dd" format)
data <- data |>
mutate(Year = as.numeric(format(as.Date(Date), "%Y")))
# Visualization: Actual vs Predicted Close Prices
ggplot(data, aes(x = Close, y = Predicted_Close, color = Sector)) +
geom_point(alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
theme_minimal() +
labs(title = "Actual vs Predicted Close Prices", x = "Actual Close Price", y = "Predicted Close Price")
## Warning: Removed 12562 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Advanced EDA: Trends over Time
# Average Close Price over Time by Sector
trend_data <- data |>
group_by(Year, Sector) |>
summarize(Avg_Close = mean(Close, na.rm = TRUE), .groups = "drop")
ggplot(trend_data, aes(x = Year, y = Avg_Close, color = Sector)) +
geom_line() +
theme_minimal() +
labs(title = "Average Close Price Over Time by Sector", x = "Year", y = "Average Close Price")
# Data Preprocessing
data <- data |>
mutate(
Date = as.Date(Date, format = "%Y-%m-%d"),
Year = year(Date),
Month = month(Date, label = TRUE)
) |>
filter(!is.na(Sector)) # Remove rows with missing sector information
# Initial Exploratory Data Analysis (EDA)
# Summary statistics
summary_stats <- data |>
group_by(Sector) |>
summarize(
Avg_Close = mean(Close, na.rm = TRUE),
Median_Close = median(Close, na.rm = TRUE),
Total_Volume = sum(Volume, na.rm = TRUE)
)
# Visualization: Average Close Price by Sector
ggplot(summary_stats, aes(x = reorder(Sector, -Avg_Close), y = Avg_Close, fill = Sector)) +
geom_bar(stat = "identity") +
theme_minimal() +
theme(axis.text.x=element_blank())+
labs(title = "Average Close Price by Sector", x = "Sector", y = "Average Close Price")
# Hypothesis Testing
# Compare sector performance using ANOVA
anova_results <- aov(Close ~ Sector, data = data)
anova_summary <- summary(anova_results)
# Post-hoc test to see which sectors differ
posthoc <- TukeyHSD(anova_results)
# Visualization: Sector Performance Distribution
ggplot(data, aes(x = Sector, y = Close, fill = Sector)) +
geom_boxplot() +
theme_minimal() +
theme(axis.text.x=element_blank())+
labs(title = "Sector Performance Distribution", x = "Sector", y = "Close Price")
## Warning: Removed 12562 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Filter data for the Technology sector and select relevant columns
tech_data <- data |>
filter(Sector == "Technology") |>
select(Close, Volume, Marketcap, Revenuegrowth) |>
drop_na()
# Transform variables to handle outliers
tech_data <- tech_data |>
mutate(
Log_Close = log(Close),
Log_Volume = log(Volume + 1), # Add 1 to avoid log(0)
Log_Marketcap = log(Marketcap + 1), # Add 1 to avoid log(0)
Log_Revenuegrowth = log(Revenuegrowth + 1) # Add 1 to avoid log(0)
)
# Build the initial regression model
initial_model <- lm(Log_Close ~ Log_Volume + Log_Marketcap + Log_Revenuegrowth, data = tech_data)
# Calculate Cook's Distance to identify outliers
cooks_dist <- cooks.distance(initial_model)
# Define a threshold for influential points (commonly 4/n)
threshold <- 4 / nrow(tech_data)
# Filter out observations with Cook's Distance greater than the threshold
tech_data_cleaned <- tech_data[cooks_dist <= threshold, ]
# Refit the regression model without outliers
cleaned_model <- lm(Log_Close ~ Log_Volume + Log_Marketcap + Log_Revenuegrowth, data = tech_data_cleaned)
# Display the summary of the cleaned regression model
summary(cleaned_model)
##
## Call:
## lm(formula = Log_Close ~ Log_Volume + Log_Marketcap + Log_Revenuegrowth,
## data = tech_data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4492 -0.5796 0.0251 0.6708 2.7940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.742116 0.089273 19.51 <2e-16 ***
## Log_Volume -0.630302 0.004351 -144.86 <2e-16 ***
## Log_Marketcap 0.466472 0.005298 88.04 <2e-16 ***
## Log_Revenuegrowth -1.045889 0.036747 -28.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8952 on 52726 degrees of freedom
## Multiple R-squared: 0.3242, Adjusted R-squared: 0.3242
## F-statistic: 8431 on 3 and 52726 DF, p-value: < 2.2e-16
# Backtesting: Add predictions to the cleaned dataset
tech_data_cleaned <- tech_data_cleaned |>
mutate(Predicted_Log_Close = predict(cleaned_model),
Predicted_Close = exp(Predicted_Log_Close)) # Reverse the log transformation
# Visualize actual vs predicted values
ggplot(tech_data_cleaned, aes(x = Close, y = Predicted_Close)) +
geom_point(color = "blue", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
theme_minimal() +
labs(title = "Actual vs Predicted Close Prices for Technology Sector (Outliers Removed)",
x = "Actual Close Price",
y = "Predicted Close Price")
# Check for multicollinearity using VIF
vif_values <- vif(cleaned_model)
print(vif_values)
## Log_Volume Log_Marketcap Log_Revenuegrowth
## 3.096886 3.102245 1.006337
# Coefficients and p-values
tidy(cleaned_model)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.74 0.0893 19.5 1.65e- 84
## 2 Log_Volume -0.630 0.00435 -145. 0
## 3 Log_Marketcap 0.466 0.00530 88.0 0
## 4 Log_Revenuegrowth -1.05 0.0367 -28.5 7.67e-177
# Filter data for the Technology sector and select relevant columns
tech_data <- data |>
filter(Sector == "Technology") |>
select(Close, Volume, Marketcap, Revenuegrowth) |>
drop_na()
# Build the initial regression model without log transformation
initial_model <- lm(Close ~ Volume + Marketcap + Revenuegrowth, data = tech_data)
# Calculate Cook's Distance to identify outliers
cooks_dist <- cooks.distance(initial_model)
# Define a threshold for influential points (commonly 4/n)
threshold <- 4 / nrow(tech_data)
# Filter out observations with Cook's Distance greater than the threshold
tech_data_cleaned <- tech_data |>
filter(cooks_dist <= threshold)
# Refit the regression model without outliers
cleaned_model <- lm(Close ~ Volume + Marketcap + Revenuegrowth, data = tech_data_cleaned)
# Display the summary of the cleaned regression model
summary(cleaned_model)
##
## Call:
## lm(formula = Close ~ Volume + Marketcap + Revenuegrowth, data = tech_data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.64 -57.64 -26.81 34.12 368.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.297e+01 4.417e-01 210.479 <2e-16 ***
## Volume -2.264e-07 8.357e-09 -27.085 <2e-16 ***
## Marketcap 1.199e-12 6.739e-13 1.779 0.0752 .
## Revenuegrowth -5.569e+01 3.326e+00 -16.743 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 79.49 on 53659 degrees of freedom
## Multiple R-squared: 0.03403, Adjusted R-squared: 0.03397
## F-statistic: 630.1 on 3 and 53659 DF, p-value: < 2.2e-16
# Backtesting: Add predictions to the cleaned dataset
tech_data_cleaned <- tech_data_cleaned |>
mutate(Predicted_Close = predict(cleaned_model))
# Visualize actual vs predicted values
ggplot(tech_data_cleaned, aes(x = Close, y = Predicted_Close)) +
geom_point(color = "blue", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
theme_minimal() +
labs(title = "Actual vs Predicted Close Prices for Technology Sector (Outliers Removed)",
x = "Actual Close Price",
y = "Predicted Close Price")
# Check for multicollinearity using VIF
vif_values <- car::vif(cleaned_model)
print(vif_values)
## Volume Marketcap Revenuegrowth
## 2.443724 2.443728 1.000061
# Coefficients and p-values
broom::tidy(cleaned_model)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 9.30e+ 1 4.42e- 1 210. 0
## 2 Volume -2.26e- 7 8.36e- 9 -27.1 1.78e-160
## 3 Marketcap 1.20e-12 6.74e-13 1.78 7.52e- 2
## 4 Revenuegrowth -5.57e+ 1 3.33e+ 0 -16.7 9.27e- 63