library(tidyverse)
## Warning: package 'purrr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(readr)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
library(tree)
## Warning: package 'tree' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.3
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(dplyr)
library(purrr)
library(tibble)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.4.3
companies = read.csv("sp500_companies.csv", header = TRUE)
financials = company = read.csv("constituents-financials.csv", header = TRUE)
companies_clean = na.omit(companies)
financials_clean = na.omit(financials)
fundamentals = financials_clean |>
dplyr::select(Symbol, Price, PE, ES, PS, PB, Yield, X52Low, X52High, MarketCap, EBITDA, Sector) |>
left_join(companies_clean |> dplyr::select(Symbol, Industry, Fulltimeemployees), by = "Symbol")
earnings = fundamentals |> mutate(Shares = MarketCap / Price)
sales = fundamentals |> mutate(Shares = MarketCap / Price)
book = fundamentals |> mutate(Shares = MarketCap / Price)
full_model = lm(Price ~ . - Symbol - Industry - Sector, data = fundamentals)
summary(full_model)
##
## Call:
## lm(formula = Price ~ . - Symbol - Industry - Sector, data = fundamentals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -154.134 -8.053 2.357 8.911 118.461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.996e+00 4.006e+00 -0.748 0.4551
## PE 2.486e-02 2.230e-02 1.115 0.2659
## ES 8.561e-01 3.897e-01 2.197 0.0287 *
## PS 1.079e+00 5.051e-01 2.136 0.0334 *
## PB 5.111e-02 5.863e-02 0.872 0.3841
## Yield -2.315e+02 9.943e+01 -2.329 0.0205 *
## X52Low 3.781e-01 5.434e-02 6.958 1.94e-11 ***
## X52High 5.867e-01 3.833e-02 15.309 < 2e-16 ***
## MarketCap 3.085e-13 1.070e-11 0.029 0.9770
## EBITDA 5.998e-11 2.332e-10 0.257 0.7972
## Fulltimeemployees 1.712e-05 1.004e-05 1.706 0.0890 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.88 on 322 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.9808, Adjusted R-squared: 0.9802
## F-statistic: 1643 on 10 and 322 DF, p-value: < 2.2e-16
step_model <- stepAIC(full_model, direction = "both", trace = FALSE)
summary(step_model)
##
## Call:
## lm(formula = Price ~ ES + PS + Yield + X52Low + X52High + Fulltimeemployees,
## data = fundamentals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -156.850 -8.330 2.070 8.958 118.008
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.880e+00 3.830e+00 -0.491 0.62391
## ES 7.849e-01 3.753e-01 2.091 0.03728 *
## PS 1.255e+00 4.158e-01 3.018 0.00275 **
## Yield -2.388e+02 9.690e+01 -2.464 0.01424 *
## X52Low 3.772e-01 5.367e-02 7.028 1.23e-11 ***
## X52High 5.894e-01 3.790e-02 15.550 < 2e-16 ***
## Fulltimeemployees 1.980e-05 9.479e-06 2.089 0.03751 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.83 on 326 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.9806, Adjusted R-squared: 0.9803
## F-statistic: 2749 on 6 and 326 DF, p-value: < 2.2e-16
fundamentals = fundamentals |>
mutate(undervalued = if_else(PE < 15, 1, 0))
model_log_under = glm(undervalued ~ PE + PB + PS + ES + EBITDA + MarketCap + X52Low + X52High + Fulltimeemployees, family = binomial(), data = fundamentals)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_log_under)
##
## Call:
## glm(formula = undervalued ~ PE + PB + PS + ES + EBITDA + MarketCap +
## X52Low + X52High + Fulltimeemployees, family = binomial(),
## data = fundamentals)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.543e+02 3.245e+04 0.020 0.984
## PE -4.091e+01 2.072e+03 -0.020 0.984
## PB 1.229e+00 3.922e+02 0.003 0.997
## PS -1.052e+01 1.541e+03 -0.007 0.995
## ES -2.992e+00 1.499e+03 -0.002 0.998
## EBITDA -7.654e-10 1.094e-06 -0.001 0.999
## MarketCap -4.820e-11 1.562e-07 0.000 1.000
## X52Low 2.407e+00 2.873e+02 0.008 0.993
## X52High -1.641e+00 2.035e+02 -0.008 0.994
## Fulltimeemployees 2.364e-05 1.126e-02 0.002 0.998
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.9520e+02 on 332 degrees of freedom
## Residual deviance: 1.4331e-06 on 323 degrees of freedom
## (5 observations deleted due to missingness)
## AIC: 20
##
## Number of Fisher Scoring iterations: 25
numeric_vars = fundamentals |>
select_if(is.numeric) |>
na.omit()
cor_matrix = cor(numeric_vars)
findCorrelation(cor_matrix, cutoff = 0.9)
## [1] 1 8 9
fundamentals = fundamentals |>
mutate(overvalued = if_else(PE > 16, 1, 0))
model_log_over = glm(overvalued ~ PE + PB + PS + ES + EBITDA + MarketCap + X52Low + X52High + Fulltimeemployees, family = binomial(), data = fundamentals)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_log_over)
##
## Call:
## glm(formula = overvalued ~ PE + PB + PS + ES + EBITDA + MarketCap +
## X52Low + X52High + Fulltimeemployees, family = binomial(),
## data = fundamentals)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.126e+03 3.551e+04 -0.032 0.975
## PE 7.129e+01 2.300e+03 0.031 0.975
## PB -1.203e+00 9.421e+02 -0.001 0.999
## PS -5.300e+00 2.068e+03 -0.003 0.998
## ES 4.178e+00 1.679e+03 0.002 0.998
## EBITDA -7.995e-12 2.669e-07 0.000 1.000
## MarketCap -1.669e-10 6.308e-08 -0.003 0.998
## X52Low -1.653e-01 8.021e+01 -0.002 0.998
## X52High -1.237e-01 5.685e+01 -0.002 0.998
## Fulltimeemployees 2.050e-04 3.060e-02 0.007 0.995
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3.2593e+02 on 332 degrees of freedom
## Residual deviance: 3.5397e-06 on 323 degrees of freedom
## (5 observations deleted due to missingness)
## AIC: 20
##
## Number of Fisher Scoring iterations: 25
numeric_vars = fundamentals |>
select_if(is.numeric) |>
na.omit()
cor_matrix = cor(numeric_vars)
findCorrelation(cor_matrix, cutoff = 0.9)
## [1] 1 8 9 13
fundamentals <- fundamentals %>%
na.omit()
model_rf = randomForest(Price ~ ., data = fundamentals, importance = TRUE)
importance(model_rf)
## %IncMSE IncNodePurity
## Symbol 4.3746123 73822.31
## PE 8.3443652 182283.76
## ES 13.4339280 1258412.93
## PS 3.9049755 163506.61
## PB 7.7835184 348198.08
## Yield 5.3588342 220874.41
## X52Low 23.7298858 3081131.74
## X52High 27.7980950 3623875.20
## MarketCap 2.8456150 199091.86
## EBITDA 0.0303136 27520.35
## Sector -0.4475268 35942.30
## Industry 1.2545842 47801.73
## Fulltimeemployees 1.5351466 47997.19
## undervalued 3.2570512 16451.24
## overvalued 4.8820618 23394.45
PCA = fundamentals |> select_if(is.numeric) |>
scale()
fviz_nbclust(PCA, kmeans, method = "wss")

km = kmeans(PCA, centers = 3)
fviz_cluster(km, data = PCA)

financials_clustered = PCA |> as.data.frame() |>
mutate(cluster = factor(km$cluster))
anova_results = map_dfr(
names(financials_clustered)[-ncol(financials_clustered)], # exclude 'cluster'
\(var) {
aov_result <- aov(as.formula(paste(var, "~ cluster")), data = financials_clustered)
tibble(
variable = var,
p_value = summary(aov_result)[[1]][["Pr(>F)"]][1]
)
}
) |>
arrange(p_value)
head(anova_results, 10)
cluster_summary = financials_clustered |>
group_by(cluster) |>
summarise(across(everything(), mean), .groups = "drop")
print(cluster_summary)
## # A tibble: 3 × 14
## cluster Price PE ES PS PB Yield X52Low X52High MarketCap
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.633 0.0295 0.353 1.97 0.729 -1.20 0.419 0.528 6.15
## 2 2 2.09 0.0546 1.65 0.637 0.389 -0.728 2.11 2.08 0.0625
## 3 3 -0.308 -0.00836 -0.239 -0.137 -0.0723 0.131 -0.305 -0.304 -0.159
## # ℹ 4 more variables: EBITDA <dbl>, Fulltimeemployees <dbl>, undervalued <dbl>,
## # overvalued <dbl>
financials_labeled = fundamentals |>
mutate(
cluster = factor(km$cluster),
cluster_label = case_when(
cluster == 1 ~ "Stable dividend payers",
cluster == 2 ~ "High-growth large caps",
cluster == 3 ~ "Speculative / undervalued"
)
)
financials_labeled |>
count(cluster_label, Sector) |>
arrange(cluster_label, desc(n))
ggplot(financials_labeled, aes(x = cluster_label, fill = Sector)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(
title = "Sector Distribution by Cluster",
y = "Proportion",
x = "Cluster"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 20, hjust = 1))

earnings = earnings |> mutate(
Earn = ES * Shares,
PE.new = (Price*Shares)/16,
PE_Price = (16*Earn)/ Shares,
Diff_Earn = Earn - PE.new,
PE_Diff_Price = PE_Price - Price
)
sales = sales |> mutate(
Sales = (Price * Shares) / PS,
PS_Sales = (Price * Shares) / 2,
PS_Price = (2*Sales)/ Shares,
Diff_Sales = Sales - PS_Sales,
PS_Diff_Price = PS_Price - Price
)
book = book |> mutate(
Debt = (Price * Shares) / PB,
PB_Debt = (Price * Shares) / 3,
PB_Price = (3*Debt)/ Shares,
PB_Diff_Debt = Debt - PB_Debt,
PB_Diff_Price = PB_Price - Price
)
stocks = fundamentals |> dplyr::select(Symbol,Price, PE, PS, PB,ES, Yield, EBITDA,X52Low,X52High, Sector, Industry, Fulltimeemployees) |>
left_join(earnings |> dplyr::select(Symbol, PE.new, PE_Price, Diff_Earn, PE_Diff_Price), by = "Symbol") |>
left_join(sales |> dplyr::select(Symbol, PS_Sales, PS_Price, Diff_Sales, PS_Diff_Price), by = "Symbol") |>
left_join(book |> dplyr::select(Symbol, PB_Debt, PB_Price,PB_Diff_Debt, PB_Diff_Price), by = "Symbol")
stocks = stocks |> mutate(Avg = (PE_Price + PS_Price + PB_Price) / 3)
model_lm.new = lm(Price ~ PE_Price + PB_Price + PS_Price + ES + Yield + EBITDA + X52Low + X52High + Fulltimeemployees, data = stocks)
summary(model_lm.new)
##
## Call:
## lm(formula = Price ~ PE_Price + PB_Price + PS_Price + ES + Yield +
## EBITDA + X52Low + X52High + Fulltimeemployees, data = stocks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -158.217 -8.095 1.916 8.440 113.572
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.854e+00 3.532e+00 0.808 0.41958
## PE_Price 3.495e-02 2.441e-02 1.432 0.15314
## PB_Price 3.244e-03 1.688e-02 0.192 0.84769
## PS_Price -1.700e-02 5.455e-03 -3.117 0.00199 **
## ES NA NA NA NA
## Yield -2.510e+02 9.682e+01 -2.592 0.00997 **
## EBITDA 1.140e-10 8.501e-11 1.341 0.18088
## X52Low 3.756e-01 5.391e-02 6.967 1.81e-11 ***
## X52High 6.099e-01 3.799e-02 16.054 < 2e-16 ***
## Fulltimeemployees 1.500e-05 9.815e-06 1.528 0.12755
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.79 on 324 degrees of freedom
## Multiple R-squared: 0.9808, Adjusted R-squared: 0.9803
## F-statistic: 2070 on 8 and 324 DF, p-value: < 2.2e-16
model_rf.new = randomForest(Price ~ PE_Price + PB_Price + PS_Price + ES + Yield + EBITDA + X52Low + X52High + Fulltimeemployees,
data = stocks,
importance = TRUE)
importance(model_rf.new)
## %IncMSE IncNodePurity
## PE_Price 9.98812410 1033234.50
## PB_Price 2.05396469 202533.36
## PS_Price 0.07179863 142899.44
## ES 9.38553192 992054.95
## Yield 6.01544644 211445.19
## EBITDA 1.18350308 60305.80
## X52Low 24.48349383 3190894.24
## X52High 28.14745841 3559826.70
## Fulltimeemployees 0.72589331 68003.91
PCA.new = stocks |> select_if(is.numeric) |>
scale()
fviz_nbclust(PCA.new, kmeans, method = "wss")

km.new = kmeans(PCA.new, centers = 3, nstart = 25)
fviz_cluster(km.new, data = PCA.new)

stocks_clustered <- stocks %>%
mutate(cluster = factor(km.new$cluster))
stocks_clustered = stocks |>
mutate(cluster = factor(km.new$cluster))
stocks_clustered |>
group_by(cluster) |>
summarise(across(where(is.numeric), mean, na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
## ℹ In group 1: `cluster = 1`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
stocks_clustered |>
count(cluster, Sector) |>
arrange(cluster, desc(n))
stocks_clustered |>
filter(cluster == 3) |>
dplyr::select(Symbol, everything())
ggplot(stocks_clustered, aes(x = cluster, fill = Sector)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(title = "Sector Distribution by Cluster", y = "Percent", x = "Cluster") +
theme_minimal()

stocks_clustered = stocks_clustered |>
mutate(valuation_gap = Avg - Price,
valuation_status = case_when(
valuation_gap > 0 ~ "Undervalued",
valuation_gap < 0 ~ "Overvalued",
TRUE ~ "Fairly Priced"
))
stocks_clustered |>
arrange(desc(valuation_gap)) |>
dplyr::select(Symbol, Price, Avg, valuation_gap) |>
slice_head(n = 10)
stocks_clustered |>
arrange(valuation_gap) |>
dplyr::select(Symbol, Price, Avg, valuation_gap) |>
slice_head(n = 10)
top_undervalued = stocks_clustered |>
filter(valuation_gap > 0) |>
arrange(desc(valuation_gap)) |>
slice_head(n = 20)
# Plot
ggplot(top_undervalued, aes(x = reorder(Symbol, valuation_gap), y = valuation_gap, fill = Sector)) +
geom_col() +
coord_flip() +
labs(
title = "Top 20 Most Undervalued Stocks",
y = "Avg Model Price - Actual Price",
x = "Symbol"
) +
theme_minimal()

# Step 1: Filter and prepare top 20 most overvalued
top_overvalued = stocks_clustered |>
filter(valuation_gap < 0) |>
arrange(valuation_gap) |>
slice_head(n = 20)
# Step 2: Create the bar plot
ggplot(top_overvalued, aes(x = reorder(Symbol, valuation_gap), y = valuation_gap, fill = Sector)) +
geom_col() +
coord_flip() +
labs(
title = "Top 20 Most Overvalued Stocks",
x = "Symbol",
y = "Predicted Price - Actual Price (Valuation Gap)"
) +
theme_minimal()

ggplot(top_undervalued, aes(x = reorder(Industry, valuation_gap, FUN = median), y = valuation_gap, fill = Sector)) +
geom_col() +
coord_flip() +
labs(
title = "Under-Valuation Gap by Industry (Grouped by Sector)",
x = "Industry",
y = "Avg Model Price - Actual Price"
) +
theme_minimal() +
theme(axis.text.y = element_text(size = 6))

ggplot(top_overvalued, aes(x = reorder(Industry, valuation_gap, FUN = median), y = valuation_gap, fill = Sector)) +
geom_col() +
coord_flip() +
labs(
title = "Over-Valuation Gap by Industry (Grouped by Sector)",
x = "Industry",
y = "Avg Model Price - Actual Price"
) +
theme_minimal(base_size = 12)
