library(tidyverse)
## Warning: package 'purrr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(readr)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
library(tree)
## Warning: package 'tree' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.3
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(dplyr)
library(purrr)
library(tibble)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.4.3
companies = read.csv("sp500_companies.csv", header = TRUE)
financials = company = read.csv("constituents-financials.csv", header = TRUE)

companies_clean = na.omit(companies)
financials_clean = na.omit(financials)

fundamentals = financials_clean |>
   dplyr::select(Symbol, Price, PE, ES, PS, PB, Yield, X52Low, X52High, MarketCap, EBITDA, Sector) |> 
  left_join(companies_clean |> dplyr::select(Symbol, Industry, Fulltimeemployees), by = "Symbol") 
earnings = fundamentals |> mutate(Shares = MarketCap / Price)

sales = fundamentals |> mutate(Shares = MarketCap / Price)

book = fundamentals |> mutate(Shares = MarketCap / Price)
full_model = lm(Price ~ . - Symbol - Industry - Sector, data = fundamentals)
summary(full_model)
## 
## Call:
## lm(formula = Price ~ . - Symbol - Industry - Sector, data = fundamentals)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -154.134   -8.053    2.357    8.911  118.461 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -2.996e+00  4.006e+00  -0.748   0.4551    
## PE                 2.486e-02  2.230e-02   1.115   0.2659    
## ES                 8.561e-01  3.897e-01   2.197   0.0287 *  
## PS                 1.079e+00  5.051e-01   2.136   0.0334 *  
## PB                 5.111e-02  5.863e-02   0.872   0.3841    
## Yield             -2.315e+02  9.943e+01  -2.329   0.0205 *  
## X52Low             3.781e-01  5.434e-02   6.958 1.94e-11 ***
## X52High            5.867e-01  3.833e-02  15.309  < 2e-16 ***
## MarketCap          3.085e-13  1.070e-11   0.029   0.9770    
## EBITDA             5.998e-11  2.332e-10   0.257   0.7972    
## Fulltimeemployees  1.712e-05  1.004e-05   1.706   0.0890 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.88 on 322 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.9808, Adjusted R-squared:  0.9802 
## F-statistic:  1643 on 10 and 322 DF,  p-value: < 2.2e-16
step_model <- stepAIC(full_model, direction = "both", trace = FALSE)
summary(step_model)
## 
## Call:
## lm(formula = Price ~ ES + PS + Yield + X52Low + X52High + Fulltimeemployees, 
##     data = fundamentals)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -156.850   -8.330    2.070    8.958  118.008 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.880e+00  3.830e+00  -0.491  0.62391    
## ES                 7.849e-01  3.753e-01   2.091  0.03728 *  
## PS                 1.255e+00  4.158e-01   3.018  0.00275 ** 
## Yield             -2.388e+02  9.690e+01  -2.464  0.01424 *  
## X52Low             3.772e-01  5.367e-02   7.028 1.23e-11 ***
## X52High            5.894e-01  3.790e-02  15.550  < 2e-16 ***
## Fulltimeemployees  1.980e-05  9.479e-06   2.089  0.03751 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.83 on 326 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.9806, Adjusted R-squared:  0.9803 
## F-statistic:  2749 on 6 and 326 DF,  p-value: < 2.2e-16
fundamentals = fundamentals |>
  mutate(undervalued = if_else(PE < 15, 1, 0))

model_log_under = glm(undervalued ~ PE + PB + PS + ES + EBITDA + MarketCap + X52Low + X52High + Fulltimeemployees, family = binomial(), data = fundamentals)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_log_under)
## 
## Call:
## glm(formula = undervalued ~ PE + PB + PS + ES + EBITDA + MarketCap + 
##     X52Low + X52High + Fulltimeemployees, family = binomial(), 
##     data = fundamentals)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)        6.543e+02  3.245e+04   0.020    0.984
## PE                -4.091e+01  2.072e+03  -0.020    0.984
## PB                 1.229e+00  3.922e+02   0.003    0.997
## PS                -1.052e+01  1.541e+03  -0.007    0.995
## ES                -2.992e+00  1.499e+03  -0.002    0.998
## EBITDA            -7.654e-10  1.094e-06  -0.001    0.999
## MarketCap         -4.820e-11  1.562e-07   0.000    1.000
## X52Low             2.407e+00  2.873e+02   0.008    0.993
## X52High           -1.641e+00  2.035e+02  -0.008    0.994
## Fulltimeemployees  2.364e-05  1.126e-02   0.002    0.998
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2.9520e+02  on 332  degrees of freedom
## Residual deviance: 1.4331e-06  on 323  degrees of freedom
##   (5 observations deleted due to missingness)
## AIC: 20
## 
## Number of Fisher Scoring iterations: 25
numeric_vars = fundamentals |>
  select_if(is.numeric) |>
  na.omit()

cor_matrix = cor(numeric_vars)
findCorrelation(cor_matrix, cutoff = 0.9)
## [1] 1 8 9
fundamentals = fundamentals |>
  mutate(overvalued = if_else(PE > 16, 1, 0))

model_log_over = glm(overvalued ~ PE + PB + PS + ES + EBITDA + MarketCap + X52Low + X52High + Fulltimeemployees, family = binomial(), data = fundamentals)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_log_over)
## 
## Call:
## glm(formula = overvalued ~ PE + PB + PS + ES + EBITDA + MarketCap + 
##     X52Low + X52High + Fulltimeemployees, family = binomial(), 
##     data = fundamentals)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)       -1.126e+03  3.551e+04  -0.032    0.975
## PE                 7.129e+01  2.300e+03   0.031    0.975
## PB                -1.203e+00  9.421e+02  -0.001    0.999
## PS                -5.300e+00  2.068e+03  -0.003    0.998
## ES                 4.178e+00  1.679e+03   0.002    0.998
## EBITDA            -7.995e-12  2.669e-07   0.000    1.000
## MarketCap         -1.669e-10  6.308e-08  -0.003    0.998
## X52Low            -1.653e-01  8.021e+01  -0.002    0.998
## X52High           -1.237e-01  5.685e+01  -0.002    0.998
## Fulltimeemployees  2.050e-04  3.060e-02   0.007    0.995
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3.2593e+02  on 332  degrees of freedom
## Residual deviance: 3.5397e-06  on 323  degrees of freedom
##   (5 observations deleted due to missingness)
## AIC: 20
## 
## Number of Fisher Scoring iterations: 25
numeric_vars = fundamentals |>
  select_if(is.numeric) |>
  na.omit()

cor_matrix = cor(numeric_vars)
findCorrelation(cor_matrix, cutoff = 0.9)
## [1]  1  8  9 13
fundamentals <- fundamentals %>%
  na.omit()

model_rf = randomForest(Price ~ ., data = fundamentals, importance = TRUE)

importance(model_rf)
##                      %IncMSE IncNodePurity
## Symbol             4.3746123      73822.31
## PE                 8.3443652     182283.76
## ES                13.4339280    1258412.93
## PS                 3.9049755     163506.61
## PB                 7.7835184     348198.08
## Yield              5.3588342     220874.41
## X52Low            23.7298858    3081131.74
## X52High           27.7980950    3623875.20
## MarketCap          2.8456150     199091.86
## EBITDA             0.0303136      27520.35
## Sector            -0.4475268      35942.30
## Industry           1.2545842      47801.73
## Fulltimeemployees  1.5351466      47997.19
## undervalued        3.2570512      16451.24
## overvalued         4.8820618      23394.45
PCA = fundamentals |> select_if(is.numeric) |> 
  scale()

fviz_nbclust(PCA, kmeans, method = "wss")

km = kmeans(PCA, centers = 3)
fviz_cluster(km, data = PCA)

financials_clustered = PCA |>  as.data.frame() |>
  mutate(cluster = factor(km$cluster))

anova_results = map_dfr(
  names(financials_clustered)[-ncol(financials_clustered)],  # exclude 'cluster'
  \(var) {
    aov_result <- aov(as.formula(paste(var, "~ cluster")), data = financials_clustered)
    tibble(
      variable = var,
      p_value = summary(aov_result)[[1]][["Pr(>F)"]][1]
    )
  }
) |>
  arrange(p_value)

head(anova_results, 10)
cluster_summary = financials_clustered |>
  group_by(cluster) |>
  summarise(across(everything(), mean), .groups = "drop")

print(cluster_summary)
## # A tibble: 3 × 14
##   cluster  Price       PE     ES     PS      PB  Yield X52Low X52High MarketCap
##   <fct>    <dbl>    <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl>     <dbl>
## 1 1        0.633  0.0295   0.353  1.97   0.729  -1.20   0.419   0.528    6.15  
## 2 2        2.09   0.0546   1.65   0.637  0.389  -0.728  2.11    2.08     0.0625
## 3 3       -0.308 -0.00836 -0.239 -0.137 -0.0723  0.131 -0.305  -0.304   -0.159 
## # ℹ 4 more variables: EBITDA <dbl>, Fulltimeemployees <dbl>, undervalued <dbl>,
## #   overvalued <dbl>
financials_labeled = fundamentals |>
  mutate(
    cluster = factor(km$cluster),
    cluster_label = case_when(
      cluster == 1 ~ "Stable dividend payers",
      cluster == 2 ~ "High-growth large caps",
      cluster == 3 ~ "Speculative / undervalued"
    )
  )

financials_labeled |>
  count(cluster_label, Sector) |>
  arrange(cluster_label, desc(n))
ggplot(financials_labeled, aes(x = cluster_label, fill = Sector)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Sector Distribution by Cluster",
    y = "Proportion",
    x = "Cluster"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

earnings = earnings |> mutate(
  Earn = ES * Shares,
  PE.new = (Price*Shares)/16,
  PE_Price = (16*Earn)/ Shares,
  Diff_Earn = Earn - PE.new,
  PE_Diff_Price = PE_Price - Price
)
sales = sales |> mutate(
  Sales = (Price * Shares) / PS,
  PS_Sales = (Price * Shares) / 2,
  PS_Price = (2*Sales)/ Shares,
  Diff_Sales = Sales - PS_Sales,
  PS_Diff_Price = PS_Price - Price
)
book = book |> mutate(
  Debt = (Price * Shares) / PB,
  PB_Debt = (Price * Shares) / 3,
  PB_Price = (3*Debt)/ Shares,
  PB_Diff_Debt = Debt - PB_Debt,
  PB_Diff_Price = PB_Price - Price
)
stocks = fundamentals |> dplyr::select(Symbol,Price, PE, PS, PB,ES, Yield, EBITDA,X52Low,X52High, Sector, Industry, Fulltimeemployees) |> 
  left_join(earnings |> dplyr::select(Symbol, PE.new, PE_Price, Diff_Earn, PE_Diff_Price), by = "Symbol") |>
  left_join(sales |> dplyr::select(Symbol, PS_Sales, PS_Price, Diff_Sales, PS_Diff_Price), by = "Symbol") |>
  left_join(book |> dplyr::select(Symbol, PB_Debt, PB_Price,PB_Diff_Debt, PB_Diff_Price), by = "Symbol")

stocks = stocks |> mutate(Avg = (PE_Price + PS_Price + PB_Price) / 3)
model_lm.new = lm(Price ~ PE_Price + PB_Price + PS_Price + ES + Yield + EBITDA + X52Low + X52High + Fulltimeemployees, data = stocks)
summary(model_lm.new)
## 
## Call:
## lm(formula = Price ~ PE_Price + PB_Price + PS_Price + ES + Yield + 
##     EBITDA + X52Low + X52High + Fulltimeemployees, data = stocks)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -158.217   -8.095    1.916    8.440  113.572 
## 
## Coefficients: (1 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.854e+00  3.532e+00   0.808  0.41958    
## PE_Price           3.495e-02  2.441e-02   1.432  0.15314    
## PB_Price           3.244e-03  1.688e-02   0.192  0.84769    
## PS_Price          -1.700e-02  5.455e-03  -3.117  0.00199 ** 
## ES                        NA         NA      NA       NA    
## Yield             -2.510e+02  9.682e+01  -2.592  0.00997 ** 
## EBITDA             1.140e-10  8.501e-11   1.341  0.18088    
## X52Low             3.756e-01  5.391e-02   6.967 1.81e-11 ***
## X52High            6.099e-01  3.799e-02  16.054  < 2e-16 ***
## Fulltimeemployees  1.500e-05  9.815e-06   1.528  0.12755    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.79 on 324 degrees of freedom
## Multiple R-squared:  0.9808, Adjusted R-squared:  0.9803 
## F-statistic:  2070 on 8 and 324 DF,  p-value: < 2.2e-16
model_rf.new = randomForest(Price ~ PE_Price + PB_Price + PS_Price + ES + Yield + EBITDA + X52Low + X52High + Fulltimeemployees,
                        data = stocks, 
                        importance = TRUE)

importance(model_rf.new)
##                       %IncMSE IncNodePurity
## PE_Price           9.98812410    1033234.50
## PB_Price           2.05396469     202533.36
## PS_Price           0.07179863     142899.44
## ES                 9.38553192     992054.95
## Yield              6.01544644     211445.19
## EBITDA             1.18350308      60305.80
## X52Low            24.48349383    3190894.24
## X52High           28.14745841    3559826.70
## Fulltimeemployees  0.72589331      68003.91
PCA.new = stocks |> select_if(is.numeric) |> 
  scale()

fviz_nbclust(PCA.new, kmeans, method = "wss")

km.new = kmeans(PCA.new, centers = 3, nstart = 25)
fviz_cluster(km.new, data = PCA.new)

stocks_clustered <- stocks %>%
  mutate(cluster = factor(km.new$cluster))
stocks_clustered = stocks |>
  mutate(cluster = factor(km.new$cluster))

stocks_clustered |>
  group_by(cluster) |>
  summarise(across(where(is.numeric), mean, na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
## ℹ In group 1: `cluster = 1`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
stocks_clustered |>
  count(cluster, Sector) |>
  arrange(cluster, desc(n))
stocks_clustered |>
  filter(cluster == 3) |>
  dplyr::select(Symbol, everything())
ggplot(stocks_clustered, aes(x = cluster, fill = Sector)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(title = "Sector Distribution by Cluster", y = "Percent", x = "Cluster") +
  theme_minimal()

stocks_clustered = stocks_clustered |>
  mutate(valuation_gap = Avg - Price,
         valuation_status = case_when(
           valuation_gap > 0 ~ "Undervalued",
           valuation_gap < 0 ~ "Overvalued",
           TRUE ~ "Fairly Priced"
         ))

stocks_clustered |>
  arrange(desc(valuation_gap)) |>
  dplyr::select(Symbol, Price, Avg, valuation_gap) |>
  slice_head(n = 10)
stocks_clustered |>
  arrange(valuation_gap) |>
  dplyr::select(Symbol, Price, Avg, valuation_gap) |>
  slice_head(n = 10)
top_undervalued = stocks_clustered |>
  filter(valuation_gap > 0) |>
  arrange(desc(valuation_gap)) |>
  slice_head(n = 20)

# Plot
ggplot(top_undervalued, aes(x = reorder(Symbol, valuation_gap), y = valuation_gap, fill = Sector)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Undervalued Stocks",
    y = "Avg Model Price - Actual Price",
    x = "Symbol"
  ) +
  theme_minimal()

# Step 1: Filter and prepare top 20 most overvalued
top_overvalued = stocks_clustered |>
  filter(valuation_gap < 0) |>
  arrange(valuation_gap) |>
  slice_head(n = 20)

# Step 2: Create the bar plot
ggplot(top_overvalued, aes(x = reorder(Symbol, valuation_gap), y = valuation_gap, fill = Sector)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Overvalued Stocks",
    x = "Symbol",
    y = "Predicted Price - Actual Price (Valuation Gap)"
  ) +
  theme_minimal()

ggplot(top_undervalued, aes(x = reorder(Industry, valuation_gap, FUN = median), y = valuation_gap, fill = Sector)) +
  geom_col() +
  
  coord_flip() +
  labs(
    title = "Under-Valuation Gap by Industry (Grouped by Sector)",
    x = "Industry",
    y = "Avg Model Price - Actual Price"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 6))

ggplot(top_overvalued, aes(x = reorder(Industry, valuation_gap, FUN = median), y = valuation_gap, fill = Sector)) +
  geom_col() +
  
  coord_flip() +
  labs(
    title = "Over-Valuation Gap by Industry (Grouped by Sector)",
    x = "Industry",
    y = "Avg Model Price - Actual Price"
  ) +
  theme_minimal(base_size = 12)