R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

DJI = read.table("dow_jones_index.data")
summary(DJI)
##       V1           
##  Length:751        
##  Class :character  
##  Mode  :character
DJI2 <- read_excel("dowjones.xlsx")
summary(DJI2)
##     quarter        stock                date                    
##  Min.   :1.00   Length:750         Min.   :2011-01-07 00:00:00  
##  1st Qu.:1.00   Class :character   1st Qu.:2011-02-18 00:00:00  
##  Median :2.00   Mode  :character   Median :2011-04-01 00:00:00  
##  Mean   :1.52                      Mean   :2011-03-31 23:02:24  
##  3rd Qu.:2.00                      3rd Qu.:2011-05-13 00:00:00  
##  Max.   :2.00                      Max.   :2011-06-24 00:00:00  
##                                                                 
##       open             high             low             close       
##  Min.   : 10.59   Min.   : 10.94   Min.   : 10.40   Min.   : 10.52  
##  1st Qu.: 29.83   1st Qu.: 30.63   1st Qu.: 28.72   1st Qu.: 30.36  
##  Median : 45.97   Median : 46.88   Median : 44.80   Median : 45.93  
##  Mean   : 53.65   Mean   : 54.67   Mean   : 52.64   Mean   : 53.73  
##  3rd Qu.: 72.72   3rd Qu.: 74.29   3rd Qu.: 71.04   3rd Qu.: 72.67  
##  Max.   :172.11   Max.   :173.54   Max.   :167.82   Max.   :170.58  
##                                                                     
##      volume          percent_change_price percent_change_volume_over_last_wk
##  Min.   :9.719e+06   Min.   :-15.42290    Min.   :-61.4332                  
##  1st Qu.:3.087e+07   1st Qu.: -1.28805    1st Qu.:-19.8043                  
##  Median :5.306e+07   Median :  0.00000    Median :  0.5126                  
##  Mean   :1.175e+08   Mean   :  0.05026    Mean   :  5.5936                  
##  3rd Qu.:1.327e+08   3rd Qu.:  1.65089    3rd Qu.: 21.8006                  
##  Max.   :1.453e+09   Max.   :  9.88223    Max.   :327.4089                  
##                                           NA's   :30                        
##  previous_weeks_volume next_weeks_open  next_weeks_close
##  Min.   :9.719e+06     Min.   : 10.52   Min.   : 10.52  
##  1st Qu.:3.068e+07     1st Qu.: 30.32   1st Qu.: 30.46  
##  Median :5.295e+07     Median : 46.02   Median : 46.12  
##  Mean   :1.174e+08     Mean   : 53.70   Mean   : 53.89  
##  3rd Qu.:1.333e+08     3rd Qu.: 72.72   3rd Qu.: 72.92  
##  Max.   :1.453e+09     Max.   :172.11   Max.   :174.54  
##  NA's   :30                                             
##  percent_change_next_weeks_price days_to_next_dividend
##  Min.   :-15.4229                Min.   :  0.00       
##  1st Qu.: -1.2221                1st Qu.: 24.00       
##  Median :  0.1012                Median : 47.00       
##  Mean   :  0.2385                Mean   : 52.53       
##  3rd Qu.:  1.8456                3rd Qu.: 69.00       
##  Max.   :  9.8822                Max.   :336.00       
##                                                       
##  percent_return_next_dividend
##  Min.   :0.06557             
##  1st Qu.:0.53455             
##  Median :0.68107             
##  Mean   :0.69183             
##  3rd Qu.:0.85429             
##  Max.   :1.56421             
## 
anyNA(DJI2)
## [1] TRUE
na_count_DJI2 <- sum(is.na(DJI2))

print(na_count_DJI2) 
## [1] 60
DJI2 %>%
  summarise(across(everything(), ~sum(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "na_count") %>%
  filter(na_count > 0) %>%
  arrange(desc(na_count))
## # A tibble: 2 × 2
##   variable                           na_count
##   <chr>                                 <int>
## 1 percent_change_volume_over_last_wk       30
## 2 previous_weeks_volume                    30
DJI3 <- DJI2

DJI3$percent_change_volume_over_last_wk[is.na(DJI2$percent_change_volume_over_last_wk)] <- mean(DJI3$percent_change_volume_over_last_wk, na.rm=TRUE) 

DJI3$previous_weeks_volume[is.na(DJI2$previous_weeks_volume)] <- mean(DJI3$previous_weeks_volume, na.rm=TRUE)
na_count_DJI3 <- sum(is.na(DJI3))

print(na_count_DJI3) 
## [1] 0
glm(percent_change_next_weeks_price ~ . , family = gaussian(), DJI3)
## 
## Call:  glm(formula = percent_change_next_weeks_price ~ ., family = gaussian(), 
##     data = DJI3)
## 
## Coefficients:
##                        (Intercept)                             quarter  
##                          1.861e+01                           1.578e-01  
##                           stockAXP                             stockBA  
##                         -3.945e+00                          -8.260e+00  
##                           stockBAC                            stockCAT  
##                         -1.281e+00                          -1.001e+01  
##                          stockCSCO                            stockCVX  
##                         -2.832e+00                          -1.241e+01  
##                            stockDD                            stockDIS  
##                         -7.913e+00                          -8.602e+00  
##                            stockGE                             stockHD  
##                         -5.166e+00                          -5.645e+00  
##                           stockHPQ                            stockIBM  
##                         -3.039e+00                          -1.672e+01  
##                          stockINTC                            stockJNJ  
##                         -6.573e+00                          -1.015e+01  
##                           stockJPM                             stockKO  
##                         -5.824e+00                          -8.836e+00  
##                          stockKRFT                            stockMCD  
##                         -6.848e+00                          -1.069e+01  
##                           stockMMM                            stockMRK  
##                         -1.038e+01                          -9.373e+00  
##                          stockMSFT                            stockPFE  
##                         -5.198e+00                          -7.235e+00  
##                            stockPG                              stockT  
##                         -9.629e+00                          -1.191e+01  
##                           stockTRV                            stockUTX  
##                         -7.613e+00                          -9.256e+00  
##                            stockVZ                            stockWMT  
##                         -1.157e+01                          -7.513e+00  
##                           stockXOM                                date  
##                         -9.238e+00                          -1.726e-08  
##                               open                                high  
##                          2.497e-02                          -2.835e-02  
##                                low                               close  
##                          7.937e-02                          -5.720e-02  
##                             volume                percent_change_price  
##                          3.979e-09                           1.463e-02  
## percent_change_volume_over_last_wk               previous_weeks_volume  
##                         -1.152e-03                          -7.917e-10  
##                    next_weeks_open                    next_weeks_close  
##                         -1.369e+00                           1.447e+00  
##              days_to_next_dividend        percent_return_next_dividend  
##                         -1.361e-04                           8.587e+00  
## 
## Degrees of Freedom: 749 Total (i.e. Null);  706 Residual
## Null Deviance:       5378 
## Residual Deviance: 1485  AIC: 2731
str(DJI3)
## tibble [750 × 16] (S3: tbl_df/tbl/data.frame)
##  $ quarter                           : num [1:750] 1 1 1 1 1 1 1 1 1 1 ...
##  $ stock                             : chr [1:750] "AA" "AA" "AA" "AA" ...
##  $ date                              : POSIXct[1:750], format: "2011-01-07" "2011-01-14" ...
##  $ open                              : num [1:750] 15.8 16.7 16.2 15.9 16.2 ...
##  $ high                              : num [1:750] 16.7 16.7 16.4 16.6 17.4 ...
##  $ low                               : num [1:750] 15.8 15.6 15.6 15.8 16.2 ...
##  $ close                             : num [1:750] 16.4 16 15.8 16.1 17.1 ...
##  $ volume                            : num [1:750] 2.40e+08 2.43e+08 1.38e+08 1.51e+08 1.54e+08 ...
##  $ percent_change_price              : num [1:750] 3.79 -4.43 -2.47 1.64 5.93 ...
##  $ percent_change_volume_over_last_wk: num [1:750] 5.59 1.38 -43.02 9.36 1.99 ...
##  $ previous_weeks_volume             : num [1:750] 1.17e+08 2.40e+08 2.43e+08 1.38e+08 1.51e+08 ...
##  $ next_weeks_open                   : num [1:750] 16.7 16.2 15.9 16.2 17.3 ...
##  $ next_weeks_close                  : num [1:750] 16 15.8 16.1 17.1 17.4 ...
##  $ percent_change_next_weeks_price   : num [1:750] -4.428 -2.471 1.638 5.933 0.231 ...
##  $ days_to_next_dividend             : num [1:750] 26 19 12 5 97 90 83 76 69 62 ...
##  $ percent_return_next_dividend      : num [1:750] 0.183 0.188 0.19 0.186 0.175 ...
DJI3$date <- as.Date(DJI3$date, format = "%Y-%m-%d")

DJI4 <- DJI3 |> arrange(date)

head(DJI4)
## # A tibble: 6 × 16
##   quarter stock date        open  high   low close   volume percent_change_price
##     <dbl> <chr> <date>     <dbl> <dbl> <dbl> <dbl>    <dbl>                <dbl>
## 1       1 AA    2011-01-07  15.8  16.7  15.8  16.4   2.40e8                3.79 
## 2       1 AXP   2011-01-07  43.3  45.6  43.1  44.4   4.51e7                2.45 
## 3       1 BA    2011-01-07  66.2  70.1  66    69.4   3.63e7                4.88 
## 4       1 BAC   2011-01-07  13.8  14.7  13.8  14.2   1.45e9                2.89 
## 5       1 CAT   2011-01-07  94.4  94.8  92.3  93.7   2.41e7               -0.689
## 6       1 CSCO  2011-01-07  20.4  21    20.4  21.0   3.04e8                2.54 
## # ℹ 7 more variables: percent_change_volume_over_last_wk <dbl>,
## #   previous_weeks_volume <dbl>, next_weeks_open <dbl>, next_weeks_close <dbl>,
## #   percent_change_next_weeks_price <dbl>, days_to_next_dividend <dbl>,
## #   percent_return_next_dividend <dbl>
train_DJI5 <- DJI4 |>
  filter(quarter == 1)

test_DJI5 <- DJI4 |>
  filter(quarter ==2)

summary(train_DJI5)
##     quarter     stock                date                 open       
##  Min.   :1   Length:360         Min.   :2011-01-07   Min.   : 13.71  
##  1st Qu.:1   Class :character   1st Qu.:2011-01-26   1st Qu.: 28.38  
##  Median :1   Mode  :character   Median :2011-02-14   Median : 45.81  
##  Mean   :1                      Mean   :2011-02-14   Mean   : 52.77  
##  3rd Qu.:1                      3rd Qu.:2011-03-05   3rd Qu.: 71.23  
##  Max.   :1                      Max.   :2011-03-25   Max.   :164.18  
##       high             low             close            volume         
##  Min.   : 14.22   Min.   : 13.32   Min.   : 13.34   Min.   :1.071e+07  
##  1st Qu.: 28.82   1st Qu.: 27.91   1st Qu.: 28.45   1st Qu.:3.438e+07  
##  Median : 46.84   Median : 44.68   Median : 45.80   Median :5.708e+07  
##  Mean   : 53.82   Mean   : 51.81   Mean   : 52.98   Mean   :1.254e+08  
##  3rd Qu.: 72.84   3rd Qu.: 69.32   3rd Qu.: 71.65   3rd Qu.:1.398e+08  
##  Max.   :167.72   Max.   :163.18   Max.   :164.84   Max.   :1.453e+09  
##  percent_change_price percent_change_volume_over_last_wk previous_weeks_volume
##  Min.   :-15.4229     Min.   :-60.988                    Min.   :1.071e+07    
##  1st Qu.: -1.0501     1st Qu.:-23.613                    1st Qu.:3.594e+07    
##  Median :  0.2789     Median :  2.142                    Median :6.526e+07    
##  Mean   :  0.2623     Mean   :  4.638                    Mean   :1.265e+08    
##  3rd Qu.:  1.8813     3rd Qu.: 21.409                    3rd Qu.:1.313e+08    
##  Max.   :  7.6217     Max.   :327.409                    Max.   :1.453e+09    
##  next_weeks_open  next_weeks_close percent_change_next_weeks_price
##  Min.   : 13.42   Min.   : 13.34   Min.   :-15.4229               
##  1st Qu.: 28.38   1st Qu.: 28.41   1st Qu.: -0.9999               
##  Median : 46.02   Median : 45.93   Median :  0.3704               
##  Mean   : 52.99   Mean   : 53.23   Mean   :  0.2871               
##  3rd Qu.: 71.45   3rd Qu.: 71.71   3rd Qu.:  1.8305               
##  Max.   :164.18   Max.   :164.84   Max.   :  7.6217               
##  days_to_next_dividend percent_return_next_dividend
##  Min.   :  0.00        Min.   :0.06557             
##  1st Qu.: 23.75        1st Qu.:0.52827             
##  Median : 47.00        Median :0.67720             
##  Mean   : 53.76        Mean   :0.69453             
##  3rd Qu.: 69.00        3rd Qu.:0.86207             
##  Max.   :336.00        Max.   :1.56421
summary(test_DJI5)
##     quarter     stock                date                 open       
##  Min.   :2   Length:390         Min.   :2011-04-01   Min.   : 10.59  
##  1st Qu.:2   Class :character   1st Qu.:2011-04-21   1st Qu.: 30.74  
##  Median :2   Mode  :character   Median :2011-05-13   Median : 46.02  
##  Mean   :2                      Mean   :2011-05-12   Mean   : 54.46  
##  3rd Qu.:2                      3rd Qu.:2011-06-03   3rd Qu.: 74.56  
##  Max.   :2                      Max.   :2011-06-24   Max.   :172.11  
##       high             low             close            volume         
##  Min.   : 10.94   Min.   : 10.40   Min.   : 10.52   Min.   :  9718851  
##  1st Qu.: 31.31   1st Qu.: 30.20   1st Qu.: 30.73   1st Qu.: 28794263  
##  Median : 46.91   Median : 45.19   Median : 46.27   Median : 49836700  
##  Mean   : 55.46   Mean   : 53.40   Mean   : 54.43   Mean   :110307075  
##  3rd Qu.: 75.49   3rd Qu.: 72.95   3rd Qu.: 74.67   3rd Qu.:117721728  
##  Max.   :173.54   Max.   :167.82   Max.   :170.58   Max.   :889460755  
##  percent_change_price percent_change_volume_over_last_wk previous_weeks_volume
##  Min.   :-10.4975     Min.   :-61.433                    Min.   :  9718851    
##  1st Qu.: -1.5454     1st Qu.:-14.278                    1st Qu.: 28324494    
##  Median : -0.1317     Median :  1.475                    Median : 48269156    
##  Mean   : -0.1455     Mean   :  6.475                    Mean   :109000266    
##  3rd Qu.:  1.4685     3rd Qu.: 19.442                    3rd Qu.:117095779    
##  Max.   :  9.8822     Max.   :249.984                    Max.   :889460755    
##  next_weeks_open  next_weeks_close percent_change_next_weeks_price
##  Min.   : 10.52   Min.   : 10.52   Min.   :-10.49750              
##  1st Qu.: 30.74   1st Qu.: 30.86   1st Qu.: -1.45328              
##  Median : 46.03   Median : 46.27   Median : -0.06029              
##  Mean   : 54.36   Mean   : 54.49   Mean   :  0.19355              
##  3rd Qu.: 74.56   3rd Qu.: 74.70   3rd Qu.:  1.88444              
##  Max.   :172.11   Max.   :174.54   Max.   :  9.88223              
##  days_to_next_dividend percent_return_next_dividend
##  Min.   :  0.00        Min.   :0.07418             
##  1st Qu.: 26.00        1st Qu.:0.53839             
##  Median : 47.00        Median :0.68614             
##  Mean   : 51.38        Mean   :0.68933             
##  3rd Qu.: 69.00        3rd Qu.:0.85081             
##  Max.   :252.00        Max.   :1.41727
df <- train_DJI5

# Example: df has columns Stock and Value
stocks <- unique(train_DJI5$stock)

for (stock in stocks) {
  stock_plot <- ggplot(df[train_DJI5$stock == stock, ], aes(x = stock, y = percent_change_price)) +
    geom_boxplot(fill = "skyblue") +
    geom_jitter(width = 0.15, size = 1.5, alpha = 0.7, color = "darkblue") +
    labs(title = paste("Q1_ Boxplot for", stock),
         x = "Stock",
         y = "% Return") +
    theme_minimal()
  
  ggsave(filename = paste0("Q1_Boxplot_", stock, ".png"), plot = stock_plot, width = 6, height = 4)
}
stocks <- unique(train_DJI5$stock)

lm_stocks <- data.frame(
  Stock = character(),
  Beta  = numeric(),
  MSE   = numeric(),
  R2    = numeric(),
  stringsAsFactors = FALSE
)

for (s in stocks) {
  train_s <- train_DJI5 %>% filter(stock == s)
  test_s  <- test_DJI5  %>% filter(stock == s)

  if (nrow(train_s) < 2 || nrow(test_s) == 0) next

  drop_cols <- intersect(c("stock", "Stock", "quarter", "Quarter"), names(train_s))
  train_s <- dplyr::select(train_s, -all_of(drop_cols))
  test_s  <- dplyr::select(test_s,  -all_of(drop_cols))

  train_s <- dplyr::select(train_s, where(is.numeric))
  test_s  <- dplyr::select(test_s,  where(is.numeric))

  keep_cols <- sapply(train_s, function(x) length(unique(x)) > 1)
  train_s <- train_s[, keep_cols, drop = FALSE]
  test_s  <- test_s[, keep_cols, drop = FALSE]

  if (!"percent_change_price" %in% names(train_s)) next

  preds <- setdiff(names(train_s), "percent_change_price")
  if (length(preds) == 0) next

  formula_s <- as.formula(
    paste("percent_change_price ~", paste(preds, collapse = " + "))
  )

  fit_s <- lm(formula_s, data = train_s)
  pred_vals <- predict(fit_s, newdata = test_s)

  mse_s <- mean((pred_vals - test_s$percent_change_price)^2, na.rm = TRUE)
  sse <- sum((pred_vals - test_s$percent_change_price)^2, na.rm = TRUE)
  sst <- sum((test_s$percent_change_price - mean(test_s$percent_change_price, na.rm = TRUE))^2, na.rm = TRUE)
  r2_test <- ifelse(sst == 0, NA_real_, 1 - sse/sst)
  beta_s <- coef(fit_s)[2]
  rmse <- sqrt(mean((pred_vals - test_s$percent_change_price)^2, na.rm = TRUE))

  lm_stocks <- rbind(lm_stocks,
                     data.frame(Stock = s, Beta = beta_s, MSE = mse_s, R2 = r2_test, RMSE = rmse))

  # Create data for plot
  plot_df <- data.frame(
    obs_id = seq_len(nrow(test_s)),
    Actual = test_s$percent_change_price,
    Pred   = pred_vals
  )

  p <- ggplot(plot_df, aes(x = obs_id)) +
    geom_line(aes(y = Actual, color = "Actual"), linewidth = 1) +
    geom_line(aes(y = Pred, color = "Predicted"), linewidth = 1, linetype = "dashed") +
    labs(
      title = paste("Actual vs Predicted for", s),
      x = "Q2 Week # (Test Set)",
      y = "percent_change_price",
      color = ""
    ) +
    theme_minimal(base_size = 12) +
    theme(plot.title = element_text(face = "bold", hjust = 0.5))

  # Save the plot as a PNG
  filename <- paste0("linear_plot_", s, ".png")
  ggsave(filename = filename, plot = p, width = 6, height = 4, dpi = 300)

  message("Saved plot for ", s, " → ", filename)
}
## Saved plot for AA → linear_plot_AA.png
## Saved plot for AXP → linear_plot_AXP.png
## Saved plot for BA → linear_plot_BA.png
## Saved plot for BAC → linear_plot_BAC.png
## Saved plot for CAT → linear_plot_CAT.png
## Saved plot for CSCO → linear_plot_CSCO.png
## Saved plot for CVX → linear_plot_CVX.png
## Saved plot for DD → linear_plot_DD.png
## Saved plot for DIS → linear_plot_DIS.png
## Saved plot for GE → linear_plot_GE.png
## Saved plot for HD → linear_plot_HD.png
## Saved plot for HPQ → linear_plot_HPQ.png
## Saved plot for IBM → linear_plot_IBM.png
## Saved plot for INTC → linear_plot_INTC.png
## Saved plot for JNJ → linear_plot_JNJ.png
## Saved plot for JPM → linear_plot_JPM.png
## Saved plot for KRFT → linear_plot_KRFT.png
## Saved plot for KO → linear_plot_KO.png
## Saved plot for MCD → linear_plot_MCD.png
## Saved plot for MMM → linear_plot_MMM.png
## Saved plot for MRK → linear_plot_MRK.png
## Saved plot for MSFT → linear_plot_MSFT.png
## Saved plot for PFE → linear_plot_PFE.png
## Saved plot for PG → linear_plot_PG.png
## Saved plot for T → linear_plot_T.png
## Saved plot for TRV → linear_plot_TRV.png
## Saved plot for UTX → linear_plot_UTX.png
## Saved plot for VZ → linear_plot_VZ.png
## Saved plot for WMT → linear_plot_WMT.png
## Saved plot for XOM → linear_plot_XOM.png
# View summary table
lm_stocks
##        Stock       Beta          MSE          R2        RMSE
## open      AA -6.2029824 1.561763e-01   0.9892393  0.39519150
## open1    AXP -2.1372260 1.106554e-01   0.9782802  0.33264906
## open2     BA -1.4498696 2.668131e-01   0.9614604  0.51653953
## open3    BAC -5.2427173 5.850675e+00  -0.4636001  2.41881689
## open4    CAT -2.1891859 1.911225e+02 -13.0480830 13.82470471
## open5   CSCO -7.6664370 2.941887e+02 -39.6192764 17.15192858
## open6    CVX -1.7515942 4.721032e+00   0.2438875  2.17279351
## open7     DD -1.5879892 1.206705e+01  -0.5690389  3.47376585
## open8    DIS -2.4743154 2.151446e+00   0.6008815  1.46678074
## open9     GE -4.5246677 1.928530e-01   0.9513643  0.43915028
## open10    HD -2.3118176 1.804019e+00   0.5407556  1.34313787
## open11   HPQ -4.2788385 2.067081e+01  -0.8780404  4.54651643
## open12   IBM -0.6310905 9.244657e-03   0.9947966  0.09614914
## open13  INTC -4.5505550 9.368176e-01   0.9412649  0.96789337
## open14   JNJ -1.7631830 2.354709e-02   0.9948454  0.15345060
## open15   JPM -2.1086835 5.335163e-02   0.9882694  0.23097971
## open16  KRFT -3.0755085 4.276757e-01   0.8618524  0.65396920
## open17    KO -1.5219102 8.496587e-03   0.9956161  0.09217693
## open18   MCD -1.3392114 4.763068e-02   0.9691705  0.21824453
## open19   MMM -1.1137690 1.048478e-02   0.9968134  0.10239519
## open20   MRK -2.8511429 2.083228e-01   0.9584148  0.45642389
## open21  MSFT -3.0594096 2.885462e+00   0.3804582  1.69866481
## open22   PFE -5.4821097 2.352371e-01   0.9511855  0.48501250
## open23    PG -1.7052305 3.661273e-01   0.8927803  0.60508455
## open24     T -4.5303538 4.577664e+01 -12.4959012  6.76584396
## open25   TRV -2.2396539 7.363370e-01   0.7847406  0.85810079
## open26   UTX -1.2347414 1.804701e-02   0.9966683  0.13433916
## open27    VZ -2.8342122 2.712158e-02   0.9917136  0.16468632
## open28   WMT -1.9022279 2.550235e-02   0.9859318  0.15969456
## open29   XOM -1.2299851 2.847315e-02   0.9962196  0.16873990
lm_stocks |> arrange(desc(R2))
##        Stock       Beta          MSE          R2        RMSE
## open19   MMM -1.1137690 1.048478e-02   0.9968134  0.10239519
## open26   UTX -1.2347414 1.804701e-02   0.9966683  0.13433916
## open29   XOM -1.2299851 2.847315e-02   0.9962196  0.16873990
## open17    KO -1.5219102 8.496587e-03   0.9956161  0.09217693
## open14   JNJ -1.7631830 2.354709e-02   0.9948454  0.15345060
## open12   IBM -0.6310905 9.244657e-03   0.9947966  0.09614914
## open27    VZ -2.8342122 2.712158e-02   0.9917136  0.16468632
## open      AA -6.2029824 1.561763e-01   0.9892393  0.39519150
## open15   JPM -2.1086835 5.335163e-02   0.9882694  0.23097971
## open28   WMT -1.9022279 2.550235e-02   0.9859318  0.15969456
## open1    AXP -2.1372260 1.106554e-01   0.9782802  0.33264906
## open18   MCD -1.3392114 4.763068e-02   0.9691705  0.21824453
## open2     BA -1.4498696 2.668131e-01   0.9614604  0.51653953
## open20   MRK -2.8511429 2.083228e-01   0.9584148  0.45642389
## open9     GE -4.5246677 1.928530e-01   0.9513643  0.43915028
## open22   PFE -5.4821097 2.352371e-01   0.9511855  0.48501250
## open13  INTC -4.5505550 9.368176e-01   0.9412649  0.96789337
## open23    PG -1.7052305 3.661273e-01   0.8927803  0.60508455
## open16  KRFT -3.0755085 4.276757e-01   0.8618524  0.65396920
## open25   TRV -2.2396539 7.363370e-01   0.7847406  0.85810079
## open8    DIS -2.4743154 2.151446e+00   0.6008815  1.46678074
## open10    HD -2.3118176 1.804019e+00   0.5407556  1.34313787
## open21  MSFT -3.0594096 2.885462e+00   0.3804582  1.69866481
## open6    CVX -1.7515942 4.721032e+00   0.2438875  2.17279351
## open3    BAC -5.2427173 5.850675e+00  -0.4636001  2.41881689
## open7     DD -1.5879892 1.206705e+01  -0.5690389  3.47376585
## open11   HPQ -4.2788385 2.067081e+01  -0.8780404  4.54651643
## open24     T -4.5303538 4.577664e+01 -12.4959012  6.76584396
## open4    CAT -2.1891859 1.911225e+02 -13.0480830 13.82470471
## open5   CSCO -7.6664370 2.941887e+02 -39.6192764 17.15192858
# 1) read files
dowjones <- read_excel("dowjones.xlsx")
sp500    <- read_excel("sp500.xlsx")

# 2) make sure date is Date
dowjones$date <- as.Date(dowjones$date)
sp500$date    <- as.Date(sp500$date)

# 3) rename S&P return so we can tell them apart
sp500 <- sp500 %>%
  dplyr::rename(sp500_return = percent_change_price) %>%
  dplyr::select(date, sp500_return)

# 4) merge the two
df <- dowjones %>%
  dplyr::select(date, stock, percent_change_price) %>%
  dplyr::inner_join(sp500, by = "date") %>%
  dplyr::rename(
    stock_return  = percent_change_price,
    market_return = sp500_return
  )


print(df)
## # A tibble: 720 × 4
##    date       stock stock_return market_return
##    <date>     <chr>        <dbl>         <dbl>
##  1 2011-01-07 AA           3.79         0.0171
##  2 2011-01-14 AA          -4.43        -0.0077
##  3 2011-01-21 AA          -2.47        -0.0055
##  4 2011-01-28 AA           1.64         0.0271
##  5 2011-02-04 AA           5.93         0.014 
##  6 2011-02-11 AA           0.231        0.0104
##  7 2011-02-18 AA          -0.633       -0.0172
##  8 2011-02-25 AA          -1.77         0.001 
##  9 2011-03-04 AA          -1.37        -0.0128
## 10 2011-03-11 AA          -3.32        -0.0192
## # ℹ 710 more rows
# 5) run CAPM per stock
stocks <- unique(df$stock)

capm_results <- data.frame(
  Stock = character(),
  Alpha = numeric(),
  Beta  = numeric(),
  R2    = numeric(),
  RMSE  = numeric(),
  stringsAsFactors = FALSE
)

for (s in stocks) {
  df_s <- df %>% dplyr::filter(stock == s)

  # skip tiny samples
  if (nrow(df_s) < 10) next

  capm_fit <- lm(stock_return ~ market_return, data = df_s)
  summ     <- summary(capm_fit)

  alpha_s <- coef(capm_fit)[1]
  beta_s  <- coef(capm_fit)[2]
  r2_s    <- summ$r.squared
  rmse_s  <- sqrt(mean((predict(capm_fit, df_s) - df_s$stock_return)^2, na.rm = TRUE))

  capm_results <- rbind(
    capm_results,
    data.frame(
      Stock = s,
      Alpha = alpha_s,
      Beta  = beta_s,
      R2    = r2_s,
      RMSE  = rmse_s
    )
  )

  # Security Characteristic Line
  p <- ggplot(df_s, aes(x = market_return, y = stock_return)) +
    geom_point(color = "gray70") +
    geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 1) +
    labs(
      title = paste("CAPM: SCL for", s),
      subtitle = paste0("α = ", round(alpha_s, 4), " | β = ", round(beta_s, 3)),
      x = "S&P 500 return",
      y = paste(s, "return")
    ) +
    theme_minimal()

  ggsave(paste0("CAPM_", s, ".png"), p, width = 6, height = 4, dpi = 300)
  message("Saved CAPM plot for ", s)
}
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for AA
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for AXP
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for BA
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for BAC
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for CAT
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for CSCO
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for CVX
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for DD
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for DIS
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for GE
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for HD
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for HPQ
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for IBM
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for INTC
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for JNJ
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for JPM
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for KRFT
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for KO
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for MCD
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for MMM
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for MRK
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for MSFT
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for PFE
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for PG
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for T
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for TRV
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for UTX
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for VZ
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for WMT
## 
## `geom_smooth()` using formula = 'y ~ x'
## Saved CAPM plot for XOM
# ---- Step 6: View CAPM summary table ----
capm_results
##               Stock        Alpha         Beta           R2     RMSE
## (Intercept)      AA -0.577924574  70.94495302 1.355143e-01 3.244169
## (Intercept)1    AXP  0.436013313  -5.11494553 1.232143e-03 2.636565
## (Intercept)2     BA  0.153085705 -29.55464460 4.543791e-02 2.452530
## (Intercept)3    BAC -0.896512831 -18.73512480 1.117968e-02 3.190040
## (Intercept)4    CAT -0.075268128  54.40775542 9.608016e-02 3.021374
## (Intercept)5   CSCO -1.227101304  -9.92968528 2.085422e-03 3.932610
## (Intercept)6    CVX  0.222200900   3.79943564 7.495573e-04 2.511594
## (Intercept)7     DD  0.153896184  64.79037235 1.912173e-01 2.412451
## (Intercept)8    DIS  0.018484568  20.91494179 1.832884e-02 2.771199
## (Intercept)9     GE -0.282185446  -4.10406279 1.220999e-03 2.125136
## (Intercept)10    HD  0.077660413  15.92576114 1.385907e-02 2.432194
## (Intercept)11   HPQ -0.643643163  29.20879736 1.885881e-02 3.814327
## (Intercept)12   IBM  0.502013428  11.85758169 1.656144e-02 1.654308
## (Intercept)13  INTC -0.215081164  -0.06793618 2.235314e-07 2.601521
## (Intercept)14   JNJ  0.085634538 -35.12053153 1.448399e-01 1.545028
## (Intercept)15   JPM -0.072795784 -25.52512839 3.867987e-02 2.303855
## (Intercept)16  KRFT  0.486315866  -2.18328176 4.667386e-04 1.829225
## (Intercept)17    KO  0.081198873 -10.38387254 1.141419e-02 1.749604
## (Intercept)18   MCD  0.242231747 -25.16491045 6.533737e-02 1.723208
## (Intercept)19   MMM  0.155249012 -19.28258341 3.218299e-02 1.914451
## (Intercept)20   MRK  0.031065889 -21.69551601 2.156415e-02 2.645852
## (Intercept)21  MSFT -0.564588438   4.79376755 2.192071e-03 1.851692
## (Intercept)22   PFE  0.729224799  -2.76701007 4.528491e-04 2.353592
## (Intercept)23    PG  0.046712519 -25.15717143 7.805838e-02 1.565306
## (Intercept)24     T  0.065350379 -25.97413122 6.595381e-02 1.769705
## (Intercept)25   TRV  0.003905631 -19.11335203 2.803254e-02 2.037638
## (Intercept)26   UTX  0.063147221  19.37766308 4.243202e-02 1.666614
## (Intercept)27    VZ -0.112558507  39.15827876 1.881299e-01 1.472766
## (Intercept)28   WMT  0.050189252  -4.12780933 2.448794e-03 1.508366
## (Intercept)29   XOM  0.106082268 -12.26801203 7.453260e-03 2.563139
beta_vector <- setNames(capm_results$Beta, capm_results$Stock)

beta_vector <- sort(beta_vector, decreasing = TRUE)

beta_vector
##           AA           DD          CAT           VZ          HPQ          DIS 
##  70.94495302  64.79037235  54.40775542  39.15827876  29.20879736  20.91494179 
##          UTX           HD          IBM         MSFT          CVX         INTC 
##  19.37766308  15.92576114  11.85758169   4.79376755   3.79943564  -0.06793618 
##         KRFT          PFE           GE          WMT          AXP         CSCO 
##  -2.18328176  -2.76701007  -4.10406279  -4.12780933  -5.11494553  -9.92968528 
##           KO          XOM          BAC          TRV          MMM          MRK 
## -10.38387254 -12.26801203 -18.73512480 -19.11335203 -19.28258341 -21.69551601 
##           PG          MCD          JPM            T           BA          JNJ 
## -25.15717143 -25.16491045 -25.52512839 -25.97413122 -29.55464460 -35.12053153
risky_stocks <- capm_results %>%
  dplyr::filter(Beta > 1) %>%
  dplyr::select(Stock, Beta)

risky_Bstocks <- setNames(risky_stocks$Beta, risky_stocks$Stock)

risky_Bstocks
##        AA       CAT       CVX        DD       DIS        HD       HPQ       IBM 
## 70.944953 54.407755  3.799436 64.790372 20.914942 15.925761 29.208797 11.857582 
##      MSFT       UTX        VZ 
##  4.793768 19.377663 39.158279