Load Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library(tidyr)
library(corrplot)
## corrplot 0.95 loaded
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(e1071)
library(tree)
library(caret)
## Loading required package: lattice
library(rpart)

DA 6813 Case Study 3

df <- read.csv('/Users/ponce/Desktop/DA-6813/Case Study 3/dow_jones_index.data')

Exploratory Analysis

glimpse(df)
## Rows: 750
## Columns: 16
## $ quarter                            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ stock                              <chr> "AA", "AA", "AA", "AA", "AA", "AA",…
## $ date                               <chr> "1/7/2011", "1/14/2011", "1/21/2011…
## $ open                               <chr> "$15.82", "$16.71", "$16.19", "$15.…
## $ high                               <chr> "$16.72", "$16.71", "$16.38", "$16.…
## $ low                                <chr> "$15.78", "$15.64", "$15.60", "$15.…
## $ close                              <chr> "$16.42", "$15.97", "$15.79", "$16.…
## $ volume                             <int> 239655616, 242963398, 138428495, 15…
## $ percent_change_price               <dbl> 3.7926700, -4.4284900, -2.4706600, …
## $ percent_change_volume_over_last_wk <dbl> NA, 1.380223, -43.024959, 9.355500,…
## $ previous_weeks_volume              <int> NA, 239655616, 242963398, 138428495…
## $ next_weeks_open                    <chr> "$16.71", "$16.19", "$15.87", "$16.…
## $ next_weeks_close                   <chr> "$15.97", "$15.79", "$16.13", "$17.…
## $ percent_change_next_weeks_price    <dbl> -4.4284900, -2.4706600, 1.6383100, …
## $ days_to_next_dividend              <int> 26, 19, 12, 5, 97, 90, 83, 76, 69, …
## $ percent_return_next_dividend       <dbl> 0.182704, 0.187852, 0.189994, 0.185…

Need to deal with ‘$’ to make data into numeric instead of character.

Cleaning data

df <- df |> 
  mutate(open = as.numeric(str_remove_all(open, '\\$')),
         high = as.numeric(str_remove_all(high, '\\$')),
         low = as.numeric(str_remove_all(low, '\\$')),
         close = as.numeric(str_remove_all(close, '\\$')),
         next_weeks_open = as.numeric(str_remove_all(next_weeks_open, '\\$')),
         next_weeks_close = as.numeric(str_remove_all(next_weeks_close, '\\$')),
         date = as.Date(date, format= '%m/%d/%Y'))

# Convert categorical into factor
df$quarter <- as.factor(df$quarter)
df$stock <- as.factor(df$stock)
glimpse(df)
## Rows: 750
## Columns: 16
## $ quarter                            <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ stock                              <fct> AA, AA, AA, AA, AA, AA, AA, AA, AA,…
## $ date                               <date> 2011-01-07, 2011-01-14, 2011-01-21…
## $ open                               <dbl> 15.82, 16.71, 16.19, 15.87, 16.18, …
## $ high                               <dbl> 16.72, 16.71, 16.38, 16.63, 17.39, …
## $ low                                <dbl> 15.78, 15.64, 15.60, 15.82, 16.18, …
## $ close                              <dbl> 16.42, 15.97, 15.79, 16.13, 17.14, …
## $ volume                             <int> 239655616, 242963398, 138428495, 15…
## $ percent_change_price               <dbl> 3.7926700, -4.4284900, -2.4706600, …
## $ percent_change_volume_over_last_wk <dbl> NA, 1.380223, -43.024959, 9.355500,…
## $ previous_weeks_volume              <int> NA, 239655616, 242963398, 138428495…
## $ next_weeks_open                    <dbl> 16.71, 16.19, 15.87, 16.18, 17.33, …
## $ next_weeks_close                   <dbl> 15.97, 15.79, 16.13, 17.14, 17.37, …
## $ percent_change_next_weeks_price    <dbl> -4.4284900, -2.4706600, 1.6383100, …
## $ days_to_next_dividend              <int> 26, 19, 12, 5, 97, 90, 83, 76, 69, …
## $ percent_return_next_dividend       <dbl> 0.182704, 0.187852, 0.189994, 0.185…
count(df, stock)
##    stock  n
## 1     AA 25
## 2    AXP 25
## 3     BA 25
## 4    BAC 25
## 5    CAT 25
## 6   CSCO 25
## 7    CVX 25
## 8     DD 25
## 9    DIS 25
## 10    GE 25
## 11    HD 25
## 12   HPQ 25
## 13   IBM 25
## 14  INTC 25
## 15   JNJ 25
## 16   JPM 25
## 17    KO 25
## 18  KRFT 25
## 19   MCD 25
## 20   MMM 25
## 21   MRK 25
## 22  MSFT 25
## 23   PFE 25
## 24    PG 25
## 25     T 25
## 26   TRV 25
## 27   UTX 25
## 28    VZ 25
## 29   WMT 25
## 30   XOM 25

25 observations for each stock

count(df, quarter)
##   quarter   n
## 1       1 360
## 2       2 390

360 observations in q1 vs 390 in q2.

q2 has one more day if it’s a leap year, hence the 30 more observations

cor_matrix <- cor(df[, sapply(df, is.numeric)], use = 'complete.obs')

#Correlation plot
corrplot(cor_matrix, method = 'circle', type = 'lower', order = 'hclust', tl.col = 'black', 
         tl.srt = 45)

A lot of variables are correlated to each other as we expected. This could potentially lead to multicollinearity issues.

volume has a negative correlation with price variables (open, high, low, close) which indicates that when stock prices go up, trading volume tends to go down and vice versa.

next_weeks_open and next_weeks_close have moderate positive correlations with the current price variables as expected. This suggests that prices are relatively consistent from week to week.

sum(is.na(df))
## [1] 60

60 missing values, all come from percent_change_volume_over_last_wk and previous_week_volume.

For each of our stocks, these variables have no data on the exact first observation based on the date.

This makes sense because we have no data of the previous week.

What should be done about these missing values?

Exploratory Analysis continued

summary(df)
##  quarter     stock          date                 open             high       
##  1:360   AA     : 25   Min.   :2011-01-07   Min.   : 10.59   Min.   : 10.94  
##  2:390   AXP    : 25   1st Qu.:2011-02-18   1st Qu.: 29.83   1st Qu.: 30.63  
##          BA     : 25   Median :2011-04-01   Median : 45.97   Median : 46.88  
##          BAC    : 25   Mean   :2011-03-31   Mean   : 53.65   Mean   : 54.67  
##          CAT    : 25   3rd Qu.:2011-05-13   3rd Qu.: 72.72   3rd Qu.: 74.29  
##          CSCO   : 25   Max.   :2011-06-24   Max.   :172.11   Max.   :173.54  
##          (Other):600                                                         
##       low             close            volume          percent_change_price
##  Min.   : 10.40   Min.   : 10.52   Min.   :9.719e+06   Min.   :-15.42290   
##  1st Qu.: 28.72   1st Qu.: 30.36   1st Qu.:3.087e+07   1st Qu.: -1.28805   
##  Median : 44.80   Median : 45.93   Median :5.306e+07   Median :  0.00000   
##  Mean   : 52.64   Mean   : 53.73   Mean   :1.175e+08   Mean   :  0.05026   
##  3rd Qu.: 71.04   3rd Qu.: 72.67   3rd Qu.:1.327e+08   3rd Qu.:  1.65089   
##  Max.   :167.82   Max.   :170.58   Max.   :1.453e+09   Max.   :  9.88223   
##                                                                            
##  percent_change_volume_over_last_wk previous_weeks_volume next_weeks_open 
##  Min.   :-61.4332                   Min.   :9.719e+06     Min.   : 10.52  
##  1st Qu.:-19.8043                   1st Qu.:3.068e+07     1st Qu.: 30.32  
##  Median :  0.5126                   Median :5.295e+07     Median : 46.02  
##  Mean   :  5.5936                   Mean   :1.174e+08     Mean   : 53.70  
##  3rd Qu.: 21.8006                   3rd Qu.:1.333e+08     3rd Qu.: 72.72  
##  Max.   :327.4089                   Max.   :1.453e+09     Max.   :172.11  
##  NA's   :30                         NA's   :30                            
##  next_weeks_close percent_change_next_weeks_price days_to_next_dividend
##  Min.   : 10.52   Min.   :-15.4229                Min.   :  0.00       
##  1st Qu.: 30.46   1st Qu.: -1.2221                1st Qu.: 24.00       
##  Median : 46.12   Median :  0.1012                Median : 47.00       
##  Mean   : 53.89   Mean   :  0.2385                Mean   : 52.53       
##  3rd Qu.: 72.92   3rd Qu.:  1.8456                3rd Qu.: 69.00       
##  Max.   :174.54   Max.   :  9.8822                Max.   :336.00       
##                                                                        
##  percent_return_next_dividend
##  Min.   :0.06557             
##  1st Qu.:0.53455             
##  Median :0.68107             
##  Mean   :0.69183             
##  3rd Qu.:0.85429             
##  Max.   :1.56421             
## 

Distributions of Numeric Variables (All stocks combined)

#We use pivot_longer to reshape data into long format to be able to use facet_wrap
#Allows us to plot all histograms in one plot since we have all variables in one variable.
df_long <- df |> 
  pivot_longer(cols = c(open, high, low, close, volume, percent_change_price), 
               names_to = "variable", values_to = "value")

ggplot(df_long, aes(x = value)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.6) +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distributions of Numeric Variables")

Some of these aren’t too insightful since we are using all stocks to create the histograms, and not all stocks trade at the same prices or volumes.

We can see how close, open, high and low have this up and down behavior which shows the different prices of stocks.

percent_change_price is very interesting because it looks like an almost perfect normal distribution. This alludes to the 50/50 behavior of the stock market. Normal distribution suggests price changes behave symmetrically around the mean (0.05).

df_long2 <- df |> 
  pivot_longer(cols = c(percent_change_volume_over_last_wk, previous_weeks_volume, 
                        next_weeks_open, next_weeks_close, percent_change_next_weeks_price, 
                        days_to_next_dividend, percent_return_next_dividend), 
               names_to = "variable", values_to = "value")

ggplot(df_long2, aes(x = value)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.6) +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distributions of Numeric Variables")
## Warning: Removed 60 rows containing non-finite outside the scale range
## (`stat_bin()`).

next_weeks_close displays a decreasing behavior. This could be due to the fact that most stocks aren’t as expensive to purchase, so as price goes up, frequency goes down. This doesn’t tell us much about the real behavior anyways as we don’t know if it’s showing an increase or decrease throughout time.

percent_change_next_weeks_price follows same normal distribution around the mean (0.23) just like percent_change_price.

All other plots show interesting but not ultra insightful results again, since it’s not specific to a stock but it does speak about the overall behavior in some way.

# Summarize each stock's data with summary statistics
df_long_stocks <- df %>%
  group_by(stock) %>%
  summarise(
    count = n(),
    mean_open = mean(open, na.rm = TRUE),
    mean_high = mean(high, na.rm = TRUE),
    mean_low = mean(low, na.rm = TRUE),
    mean_close = mean(close, na.rm = TRUE),
    mean_volume = mean(volume, na.rm = TRUE),
    mean_percent_change_price = mean(percent_change_price, na.rm = TRUE),
    mean_percent_change_volume_over_last_wk = mean(percent_change_volume_over_last_wk, na.rm = TRUE),
    mean_previous_weeks_volume = mean(previous_weeks_volume, na.rm = TRUE),
    mean_next_weeks_open = mean(next_weeks_open, na.rm = TRUE),
    mean_next_weeks_close = mean(next_weeks_close, na.rm = TRUE),
    mean_percent_change_next_weeks_price = mean(percent_change_next_weeks_price, na.rm = TRUE),
    mean_days_to_next_dividend = mean(days_to_next_dividend, na.rm = TRUE),
    mean_percent_return_next_dividend = mean(percent_return_next_dividend, na.rm = TRUE),
    median_open = median(open, na.rm = TRUE),
    median_high = median(high, na.rm = TRUE),
    median_low = median(low, na.rm = TRUE),
    median_close = median(close, na.rm = TRUE),
    median_volume = median(volume, na.rm = TRUE),
    median_percent_change_price = median(percent_change_price, na.rm = TRUE),
    median_percent_change_volume_over_last_wk = median(percent_change_volume_over_last_wk, na.rm = TRUE),
    median_previous_weeks_volume = median(previous_weeks_volume, na.rm = TRUE),
    median_next_weeks_open = median(next_weeks_open, na.rm = TRUE),
    median_next_weeks_close = median(next_weeks_close, na.rm = TRUE),
    median_percent_change_next_weeks_price = median(percent_change_next_weeks_price, na.rm = TRUE),
    median_days_to_next_dividend = median(days_to_next_dividend, na.rm = TRUE),
    median_percent_return_next_dividend = median(percent_return_next_dividend, na.rm = TRUE)
  )

# View the summarized data for each stock
head(df_long_stocks)
## # A tibble: 6 × 28
##   stock count mean_open mean_high mean_low mean_close mean_volume
##   <fct> <int>     <dbl>     <dbl>    <dbl>      <dbl>       <dbl>
## 1 AA       25      16.6      17.0     16.1       16.5  129638810.
## 2 AXP      25      46.5      47.6     45.6       46.7   35208482.
## 3 BA       25      73.3      74.8     71.6       73.4   23781421.
## 4 BAC      25      13.2      13.5     12.8       13.1  722999136.
## 5 CAT      25     103.      106.     100.       103.    33731116.
## 6 CSCO     25      18.1      18.4     17.6       17.9  358661588.
## # ℹ 21 more variables: mean_percent_change_price <dbl>,
## #   mean_percent_change_volume_over_last_wk <dbl>,
## #   mean_previous_weeks_volume <dbl>, mean_next_weeks_open <dbl>,
## #   mean_next_weeks_close <dbl>, mean_percent_change_next_weeks_price <dbl>,
## #   mean_days_to_next_dividend <dbl>, mean_percent_return_next_dividend <dbl>,
## #   median_open <dbl>, median_high <dbl>, median_low <dbl>, median_close <dbl>,
## #   median_volume <int>, median_percent_change_price <dbl>, …

This gives us an idea about the average’s of each stock.

We can see that most of these stocks show volatility.

The highest average percent price change is 0.59% and the lowest is -1.17%.

This would equal to those stocks going up 0.59% and -1.17% each week.

To back this up we will create a plot to show their volatility.

ggplot(df, aes(x = date, y = percent_change_price, color = stock, 
                          group = stock)) +
  geom_line() +
  labs(title = 'Stock Percent Change in Price Over Time', 
       x = 'Date', 
       y = 'Percent Change in Price') +
  theme_minimal() + 
  theme(legend.position = 'none')

In the plot we see that from week to week, some stocks are very volatile.

Dealing with missing values

Since dropping 30 out of our 750 observations would be quite a bit since we are taking away 4% of each stock’s observations, and we have very limited data, we will create 2 dataframes.

One with those observations removed, and one with those observations showing a 0 to assume no change in the first week.

#Dataset with removed NA's
df_NA_removed <- na.omit(df)

#Dataset with inputted 0's instead of NA's
df_NA_inputted <- df
df_NA_inputted[is.na(df_NA_inputted)] <- 0
summary(df_NA_removed)
##  quarter     stock          date                 open             high       
##  1:330   AA     : 24   Min.   :2011-01-14   Min.   : 10.59   Min.   : 10.94  
##  2:390   AXP    : 24   1st Qu.:2011-02-23   1st Qu.: 29.96   1st Qu.: 30.66  
##          BA     : 24   Median :2011-04-04   Median : 46.02   Median : 46.91  
##          BAC    : 24   Mean   :2011-04-04   Mean   : 53.75   Mean   : 54.76  
##          CAT    : 24   3rd Qu.:2011-05-14   3rd Qu.: 72.81   3rd Qu.: 74.34  
##          CSCO   : 24   Max.   :2011-06-24   Max.   :172.11   Max.   :173.54  
##          (Other):576                                                         
##       low             close            volume          percent_change_price
##  Min.   : 10.40   Min.   : 10.52   Min.   :9.719e+06   Min.   :-15.42290   
##  1st Qu.: 28.71   1st Qu.: 30.41   1st Qu.:3.069e+07   1st Qu.: -1.29736   
##  Median : 44.95   Median : 46.12   Median :5.275e+07   Median :  0.00000   
##  Mean   : 52.73   Mean   : 53.83   Mean   :1.156e+08   Mean   :  0.03014   
##  3rd Qu.: 71.11   3rd Qu.: 72.77   3rd Qu.:1.312e+08   3rd Qu.:  1.63087   
##  Max.   :167.82   Max.   :170.58   Max.   :1.054e+09   Max.   :  9.88223   
##                                                                            
##  percent_change_volume_over_last_wk previous_weeks_volume next_weeks_open 
##  Min.   :-61.4332                   Min.   :9.719e+06     Min.   : 10.52  
##  1st Qu.:-19.8043                   1st Qu.:3.068e+07     1st Qu.: 30.39  
##  Median :  0.5126                   Median :5.295e+07     Median : 46.04  
##  Mean   :  5.5936                   Mean   :1.174e+08     Mean   : 53.81  
##  3rd Qu.: 21.8006                   3rd Qu.:1.333e+08     3rd Qu.: 72.81  
##  Max.   :327.4089                   Max.   :1.453e+09     Max.   :172.11  
##                                                                           
##  next_weeks_close percent_change_next_weeks_price days_to_next_dividend
##  Min.   : 10.52   Min.   :-15.4229                Min.   :  0.00       
##  1st Qu.: 30.51   1st Qu.: -1.2524                1st Qu.: 24.00       
##  Median : 45.93   Median :  0.0360                Median : 47.00       
##  Mean   : 53.97   Mean   :  0.1933                Mean   : 52.26       
##  3rd Qu.: 73.00   3rd Qu.:  1.7927                3rd Qu.: 69.25       
##  Max.   :174.54   Max.   :  9.8822                Max.   :329.00       
##                                                                        
##  percent_return_next_dividend
##  Min.   :0.06557             
##  1st Qu.:0.53409             
##  Median :0.68014             
##  Mean   :0.69156             
##  3rd Qu.:0.85402             
##  Max.   :1.56421             
## 
summary(df_NA_inputted)
##  quarter     stock          date                 open             high       
##  1:360   AA     : 25   Min.   :2011-01-07   Min.   : 10.59   Min.   : 10.94  
##  2:390   AXP    : 25   1st Qu.:2011-02-18   1st Qu.: 29.83   1st Qu.: 30.63  
##          BA     : 25   Median :2011-04-01   Median : 45.97   Median : 46.88  
##          BAC    : 25   Mean   :2011-03-31   Mean   : 53.65   Mean   : 54.67  
##          CAT    : 25   3rd Qu.:2011-05-13   3rd Qu.: 72.72   3rd Qu.: 74.29  
##          CSCO   : 25   Max.   :2011-06-24   Max.   :172.11   Max.   :173.54  
##          (Other):600                                                         
##       low             close            volume          percent_change_price
##  Min.   : 10.40   Min.   : 10.52   Min.   :9.719e+06   Min.   :-15.42290   
##  1st Qu.: 28.72   1st Qu.: 30.36   1st Qu.:3.087e+07   1st Qu.: -1.28805   
##  Median : 44.80   Median : 45.93   Median :5.306e+07   Median :  0.00000   
##  Mean   : 52.64   Mean   : 53.73   Mean   :1.175e+08   Mean   :  0.05026   
##  3rd Qu.: 71.04   3rd Qu.: 72.67   3rd Qu.:1.327e+08   3rd Qu.:  1.65089   
##  Max.   :167.82   Max.   :170.58   Max.   :1.453e+09   Max.   :  9.88223   
##                                                                            
##  percent_change_volume_over_last_wk previous_weeks_volume next_weeks_open 
##  Min.   :-61.43                     Min.   :0.000e+00     Min.   : 10.52  
##  1st Qu.:-18.89                     1st Qu.:2.812e+07     1st Qu.: 30.32  
##  Median :  0.00                     Median :5.069e+07     Median : 46.02  
##  Mean   :  5.37                     Mean   :1.127e+08     Mean   : 53.70  
##  3rd Qu.: 19.98                     3rd Qu.:1.296e+08     3rd Qu.: 72.72  
##  Max.   :327.41                     Max.   :1.453e+09     Max.   :172.11  
##                                                                           
##  next_weeks_close percent_change_next_weeks_price days_to_next_dividend
##  Min.   : 10.52   Min.   :-15.4229                Min.   :  0.00       
##  1st Qu.: 30.46   1st Qu.: -1.2221                1st Qu.: 24.00       
##  Median : 46.12   Median :  0.1012                Median : 47.00       
##  Mean   : 53.89   Mean   :  0.2385                Mean   : 52.53       
##  3rd Qu.: 72.92   3rd Qu.:  1.8456                3rd Qu.: 69.00       
##  Max.   :174.54   Max.   :  9.8822                Max.   :336.00       
##                                                                        
##  percent_return_next_dividend
##  Min.   :0.06557             
##  1st Qu.:0.53455             
##  Median :0.68107             
##  Mean   :0.69183             
##  3rd Qu.:0.85429             
##  Max.   :1.56421             
## 

Only those 2 variables had changes in their summary statistics. (previous_week_volume, percent_change_volume_over_last_wk)

Dropping variables that will lead to data leakage

Variables that give us information about next week are important to remove. In reality, when predicting next week, we are technically not able to know all those things about the future.

#inputted dataframe
df_NA_inputted <- df_NA_inputted |>
  select(-next_weeks_close, -next_weeks_open, 
         -days_to_next_dividend)

#Dropped NA's dataframe
df_NA_removed <- df_NA_removed |>
  select(-next_weeks_close, -next_weeks_open, 
         -days_to_next_dividend)

Modeling

———————————————————————————-

LM

Create lagged variables

# Create lagged variables
df_NA_removed_lagged <- df_NA_removed |>
  arrange(stock) |>  # Make sure the data is sorted by stock
  mutate(
    lagged_open = lag(open, 1),  # Lagged open price (previous week's open price)
    lagged_high = lag(high, 1),  # Lagged high price
    lagged_low = lag(low, 1),    # Lagged low price
    lagged_close = lag(close, 1),  # Lagged close price
    lagged_volume = lag(volume, 1),  # Lagged volume (previous week's volume)
    lagged_percent_change_price = lag(percent_change_price, 1), # Lagged percent change in price
    lagged_percent_change_volume = lag(percent_change_volume_over_last_wk, 1),# Lagged percent change in volume
    lagged_previous_weeks_volume = lag(previous_weeks_volume, 1)
  )


df_NA_removed_lagged <- df_NA_removed_lagged |>
  mutate(
    lagged_open_2 = lag(open, 2),  # Lagged open price (two weeks ago)
    lagged_high_2 = lag(high, 2),
    lagged_low_2 = lag(low, 2),
    lagged_close_2 = lag(close, 2),
    lagged_volume_2 = lag(volume, 2),
    lagged_percent_change_price_2 = lag(percent_change_price, 2),
    lagged_percent_change_volume_2 = lag(percent_change_volume_over_last_wk, 2), 
    lagged_previous_weeks_volume_2 = lag(previous_weeks_volume, 2)
  )

#Dataset with removed NA's
df_NA_removed_lagged <- na.omit(df_NA_removed_lagged)


# -------------------------------------------------------------------------------------
# Create lagged variables
df_NA_inputted_lagged <- df_NA_inputted |>
  arrange(stock) |>  # Make sure the data is sorted by stock
  mutate(
    lagged_open = lag(open, 1),  # Lagged open price (previous week's open price)
    lagged_high = lag(high, 1),  # Lagged high price
    lagged_low = lag(low, 1),    # Lagged low price
    lagged_close = lag(close, 1),  # Lagged close price
    lagged_volume = lag(volume, 1),  # Lagged volume (previous week's volume)
    lagged_percent_change_price = lag(percent_change_price, 1), # Lagged percent change in price
    lagged_percent_change_volume = lag(percent_change_volume_over_last_wk, 1),# Lagged percent change in volume
    lagged_previous_weeks_volume = lag(previous_weeks_volume, 1)
  )


df_NA_inputted_lagged <- df_NA_inputted_lagged |>
  mutate(
    lagged_open_2 = lag(open, 2),  # Lagged open price (two weeks ago)
    lagged_high_2 = lag(high, 2),
    lagged_low_2 = lag(low, 2),
    lagged_close_2 = lag(close, 2),
    lagged_volume_2 = lag(volume, 2),
    lagged_percent_change_price_2 = lag(percent_change_price, 2),
    lagged_percent_change_volume_2 = lag(percent_change_volume_over_last_wk, 2), 
    lagged_previous_weeks_volume_2 = lag(previous_weeks_volume, 2)
  )

#Dataset with inputted 0's instead of NA's
df_NA_inputted_lagged <- df_NA_inputted_lagged
df_NA_inputted_lagged[is.na(df_NA_inputted_lagged)] <- 0
cor(df_NA_removed_lagged[, sapply(df_NA_removed_lagged, is.numeric)])
##                                           open         high         low
## open                                1.00000000  0.999611385  0.99933736
## high                                0.99961139  1.000000000  0.99941765
## low                                 0.99933736  0.999417648  1.00000000
## close                               0.99904623  0.999553822  0.99954695
## volume                             -0.52209679 -0.520950925 -0.52385759
## percent_change_price                0.07024937  0.087061325  0.09197482
## percent_change_volume_over_last_wk -0.01062046 -0.008627264 -0.01969032
## previous_weeks_volume              -0.51079386 -0.510222211 -0.51041632
## percent_change_next_weeks_price     0.06616456  0.069057497  0.06670463
## percent_return_next_dividend       -0.15338047 -0.155602054 -0.15092841
## lagged_open                         0.94992226  0.949491516  0.94853525
## lagged_high                         0.95000426  0.949654918  0.94865023
## lagged_low                          0.95010148  0.949587563  0.94882517
## lagged_close                        0.95093171  0.950515352  0.94962995
## lagged_volume                      -0.49355500 -0.493169280 -0.49289465
## lagged_percent_change_price         0.11011153  0.110007214  0.11110462
## lagged_percent_change_volume       -0.01882060 -0.018207471 -0.01840773
## lagged_previous_weeks_volume       -0.47640137 -0.476185656 -0.47595670
## lagged_open_2                       0.89875253  0.897698066  0.89672957
## lagged_high_2                       0.89950796  0.898401651  0.89741422
## lagged_low_2                        0.89874914  0.897598088  0.89679245
## lagged_close_2                      0.90000820  0.898801509  0.89786205
## lagged_volume_2                    -0.45729558 -0.456825006 -0.45631728
## lagged_percent_change_price_2       0.10790929  0.104851789  0.10523920
## lagged_percent_change_volume_2     -0.01858192 -0.016801425 -0.01802300
## lagged_previous_weeks_volume_2     -0.44419533 -0.443882454 -0.44327649
##                                          close      volume percent_change_price
## open                                0.99904623 -0.52209679           0.07024937
## high                                0.99955382 -0.52095093           0.08706133
## low                                 0.99954695 -0.52385759           0.09197482
## close                               1.00000000 -0.52358494           0.10657543
## volume                             -0.52358494  1.00000000          -0.17154294
## percent_change_price                0.10657543 -0.17154294           1.00000000
## percent_change_volume_over_last_wk -0.01863109  0.18232179          -0.23735502
## previous_weeks_volume              -0.51022874  0.88776696          -0.06591756
## percent_change_next_weeks_price     0.06724401 -0.09018005           0.01898666
## percent_return_next_dividend       -0.15336623 -0.26867837           0.04386879
## lagged_open                         0.94809539 -0.48788679           0.06083957
## lagged_high                         0.94828405 -0.48685618           0.06256990
## lagged_low                          0.94826037 -0.48867370           0.06026194
## lagged_close                        0.94920382 -0.48852929           0.06190385
## lagged_volume                      -0.49296477  0.85517252          -0.09948906
## lagged_percent_change_price         0.11124746 -0.10217854           0.01586226
## lagged_percent_change_volume       -0.01862318  0.03661341           0.02628712
## lagged_previous_weeks_volume       -0.47608534  0.81924674          -0.11416825
## lagged_open_2                       0.89620722 -0.45191718           0.04061948
## lagged_high_2                       0.89684916 -0.45160298           0.03868478
## lagged_low_2                        0.89613025 -0.45234911           0.03924222
## lagged_close_2                      0.89720403 -0.45260228           0.03686459
## lagged_volume_2                    -0.45673849  0.76287060          -0.08868801
## lagged_percent_change_price_2       0.10367740 -0.09670732          -0.05868190
## lagged_percent_change_volume_2     -0.01583372  0.01456590           0.03258479
## lagged_previous_weeks_volume_2     -0.44413581  0.75584470          -0.10363913
##                                    percent_change_volume_over_last_wk
## open                                                     -0.010620460
## high                                                     -0.008627264
## low                                                      -0.019690317
## close                                                    -0.018631086
## volume                                                    0.182321786
## percent_change_price                                     -0.237355021
## percent_change_volume_over_last_wk                        1.000000000
## previous_weeks_volume                                    -0.128326210
## percent_change_next_weeks_price                           0.013967095
## percent_return_next_dividend                             -0.020573308
## lagged_open                                              -0.006175426
## lagged_high                                              -0.007556415
## lagged_low                                               -0.003163766
## lagged_close                                             -0.003400151
## lagged_volume                                            -0.117911110
## lagged_percent_change_price                               0.087643041
## lagged_percent_change_volume                             -0.307370587
## lagged_previous_weeks_volume                             -0.035910745
## lagged_open_2                                            -0.027590464
## lagged_high_2                                            -0.028477492
## lagged_low_2                                             -0.026803630
## lagged_close_2                                           -0.026783489
## lagged_volume_2                                          -0.038944093
## lagged_percent_change_price_2                             0.031885748
## lagged_percent_change_volume_2                           -0.094053579
## lagged_previous_weeks_volume_2                           -0.003574668
##                                    previous_weeks_volume
## open                                         -0.51079386
## high                                         -0.51022221
## low                                          -0.51041632
## close                                        -0.51022874
## volume                                        0.88776696
## percent_change_price                         -0.06591756
## percent_change_volume_over_last_wk           -0.12832621
## previous_weeks_volume                         1.00000000
## percent_change_next_weeks_price              -0.10211187
## percent_return_next_dividend                 -0.26793188
## lagged_open                                  -0.45959234
## lagged_high                                  -0.45843756
## lagged_low                                   -0.46191203
## lagged_close                                 -0.46191819
## lagged_volume                                 0.91183604
## lagged_percent_change_price                  -0.17810986
## lagged_percent_change_volume                  0.17972928
## lagged_previous_weeks_volume                  0.79806858
## lagged_open_2                                -0.42731752
## lagged_high_2                                -0.42640569
## lagged_low_2                                 -0.42765881
## lagged_close_2                               -0.42764476
## lagged_volume_2                               0.76468762
## lagged_percent_change_price_2                -0.08401117
## lagged_percent_change_volume_2                0.04736541
## lagged_previous_weeks_volume_2                0.73309926
##                                    percent_change_next_weeks_price
## open                                                    0.06616456
## high                                                    0.06905750
## low                                                     0.06670463
## close                                                   0.06724401
## volume                                                 -0.09018005
## percent_change_price                                    0.01898666
## percent_change_volume_over_last_wk                      0.01396710
## previous_weeks_volume                                  -0.10211187
## percent_change_next_weeks_price                         1.00000000
## percent_return_next_dividend                            0.10243026
## lagged_open                                             0.04745815
## lagged_high                                             0.04642812
## lagged_low                                              0.04643954
## lagged_close                                            0.04391979
## lagged_volume                                          -0.07733783
## lagged_percent_change_price                            -0.05414914
## lagged_percent_change_volume                            0.04169501
## lagged_previous_weeks_volume                           -0.09624255
## lagged_open_2                                           0.04620233
## lagged_high_2                                           0.04535166
## lagged_low_2                                            0.04761554
## lagged_close_2                                          0.04454016
## lagged_volume_2                                        -0.07713344
## lagged_percent_change_price_2                          -0.02608339
## lagged_percent_change_volume_2                          0.02221783
## lagged_previous_weeks_volume_2                         -0.08782722
##                                    percent_return_next_dividend  lagged_open
## open                                                -0.15338047  0.949922257
## high                                                -0.15560205  0.949491516
## low                                                 -0.15092841  0.948535252
## close                                               -0.15336623  0.948095393
## volume                                              -0.26867837 -0.487886788
## percent_change_price                                 0.04386879  0.060839572
## percent_change_volume_over_last_wk                  -0.02057331 -0.006175426
## previous_weeks_volume                               -0.26793188 -0.459592340
## percent_change_next_weeks_price                      0.10243026  0.047458155
## percent_return_next_dividend                         1.00000000 -0.137853997
## lagged_open                                         -0.13785400  1.000000000
## lagged_high                                         -0.14005190  0.999612396
## lagged_low                                          -0.13497878  0.999338174
## lagged_close                                        -0.13783538  0.999050001
## lagged_volume                                       -0.25975805 -0.522082802
## lagged_percent_change_price                          0.04819822  0.072838335
## lagged_percent_change_volume                        -0.02455874 -0.009010208
## lagged_previous_weeks_volume                        -0.25732162 -0.511464726
## lagged_open_2                                       -0.12191685  0.949971352
## lagged_high_2                                       -0.12395720  0.950054464
## lagged_low_2                                        -0.11896245  0.950152236
## lagged_close_2                                      -0.12169322  0.950982254
## lagged_volume_2                                     -0.24852715 -0.494297247
## lagged_percent_change_price_2                        0.05014172  0.113215717
## lagged_percent_change_volume_2                      -0.02894675 -0.018729515
## lagged_previous_weeks_volume_2                      -0.24608821 -0.477029802
##                                     lagged_high   lagged_low lagged_close
## open                                0.950004255  0.950101476  0.950931710
## high                                0.949654918  0.949587563  0.950515352
## low                                 0.948650231  0.948825167  0.949629951
## close                               0.948284049  0.948260365  0.949203824
## volume                             -0.486856179 -0.488673698 -0.488529291
## percent_change_price                0.062569896  0.060261939  0.061903855
## percent_change_volume_over_last_wk -0.007556415 -0.003163766 -0.003400151
## previous_weeks_volume              -0.458437557 -0.461912030 -0.461918194
## percent_change_next_weeks_price     0.046428123  0.046439540  0.043919787
## percent_return_next_dividend       -0.140051903 -0.134978781 -0.137835384
## lagged_open                         0.999612396  0.999338174  0.999050001
## lagged_high                         1.000000000  0.999419603  0.999560325
## lagged_low                          0.999419603  1.000000000  0.999549129
## lagged_close                        0.999560325  0.999549129  1.000000000
## lagged_volume                      -0.520952595 -0.523831162 -0.523529221
## lagged_percent_change_price         0.089689711  0.094538016  0.109063056
## lagged_percent_change_volume       -0.007025568 -0.018051389 -0.016965888
## lagged_previous_weeks_volume       -0.510907058 -0.511083629 -0.510879255
## lagged_open_2                       0.949538420  0.948587696  0.948156577
## lagged_high_2                       0.949703781  0.948703494  0.948344664
## lagged_low_2                        0.949636656  0.948879227  0.948322916
## lagged_close_2                      0.950565383  0.949683270  0.949263466
## lagged_volume_2                    -0.493925450 -0.493634292 -0.493688156
## lagged_percent_change_price_2       0.113146630  0.114207328  0.114318059
## lagged_percent_change_volume_2     -0.018119874 -0.018314096 -0.018522460
## lagged_previous_weeks_volume_2     -0.476824532 -0.476583284 -0.476700159
##                                    lagged_volume lagged_percent_change_price
## open                                 -0.49355500                  0.11011153
## high                                 -0.49316928                  0.11000721
## low                                  -0.49289465                  0.11110462
## close                                -0.49296477                  0.11124746
## volume                                0.85517252                 -0.10217854
## percent_change_price                 -0.09948906                  0.01586226
## percent_change_volume_over_last_wk   -0.11791111                  0.08764304
## previous_weeks_volume                 0.91183604                 -0.17810986
## percent_change_next_weeks_price      -0.07733783                 -0.05414914
## percent_return_next_dividend         -0.25975805                  0.04819822
## lagged_open                          -0.52208280                  0.07283834
## lagged_high                          -0.52095259                  0.08968971
## lagged_low                           -0.52383116                  0.09453802
## lagged_close                         -0.52352922                  0.10906306
## lagged_volume                         1.00000000                 -0.17171421
## lagged_percent_change_price          -0.17171421                  1.00000000
## lagged_percent_change_volume          0.18188628                 -0.23503010
## lagged_previous_weeks_volume          0.88754357                 -0.06713164
## lagged_open_2                        -0.48792623                  0.06347063
## lagged_high_2                        -0.48687888                  0.06518509
## lagged_low_2                         -0.48869738                  0.06290927
## lagged_close_2                       -0.48853711                  0.06450827
## lagged_volume_2                       0.85492670                 -0.10074022
## lagged_percent_change_price_2        -0.10232930                  0.01764445
## lagged_percent_change_volume_2        0.03658853                  0.02653782
## lagged_previous_weeks_volume_2        0.81907820                 -0.11540234
##                                    lagged_percent_change_volume
## open                                               -0.018820598
## high                                               -0.018207471
## low                                                -0.018407727
## close                                              -0.018623181
## volume                                              0.036613406
## percent_change_price                                0.026287119
## percent_change_volume_over_last_wk                 -0.307370587
## previous_weeks_volume                               0.179729278
## percent_change_next_weeks_price                     0.041695015
## percent_return_next_dividend                       -0.024558739
## lagged_open                                        -0.009010208
## lagged_high                                        -0.007025568
## lagged_low                                         -0.018051389
## lagged_close                                       -0.016965888
## lagged_volume                                       0.181886279
## lagged_percent_change_price                        -0.235030105
## lagged_percent_change_volume                        1.000000000
## lagged_previous_weeks_volume                       -0.129426103
## lagged_open_2                                      -0.004622536
## lagged_high_2                                      -0.005975442
## lagged_low_2                                       -0.001578447
## lagged_close_2                                     -0.001803112
## lagged_volume_2                                    -0.119100461
## lagged_percent_change_price_2                       0.090575810
## lagged_percent_change_volume_2                     -0.306943116
## lagged_previous_weeks_volume_2                     -0.037078170
##                                    lagged_previous_weeks_volume lagged_open_2
## open                                                -0.47640137   0.898752526
## high                                                -0.47618566   0.897698066
## low                                                 -0.47595670   0.896729566
## close                                               -0.47608534   0.896207218
## volume                                               0.81924674  -0.451917178
## percent_change_price                                -0.11416825   0.040619478
## percent_change_volume_over_last_wk                  -0.03591074  -0.027590464
## previous_weeks_volume                                0.79806858  -0.427317519
## percent_change_next_weeks_price                     -0.09624255   0.046202333
## percent_return_next_dividend                        -0.25732162  -0.121916851
## lagged_open                                         -0.51146473   0.949971352
## lagged_high                                         -0.51090706   0.949538420
## lagged_low                                          -0.51108363   0.948587696
## lagged_close                                        -0.51087926   0.948156577
## lagged_volume                                        0.88754357  -0.487926225
## lagged_percent_change_price                         -0.06713164   0.063470628
## lagged_percent_change_volume                        -0.12942610  -0.004622536
## lagged_previous_weeks_volume                         1.00000000  -0.460319971
## lagged_open_2                                       -0.46031997   1.000000000
## lagged_high_2                                       -0.45916283   0.999612959
## lagged_low_2                                        -0.46264236   0.999338540
## lagged_close_2                                      -0.46263789   0.999051557
## lagged_volume_2                                      0.91191460  -0.522808818
## lagged_percent_change_price_2                       -0.17970581   0.076026713
## lagged_percent_change_volume_2                       0.17954975  -0.008932521
## lagged_previous_weeks_volume_2                       0.79823577  -0.512071004
##                                    lagged_high_2 lagged_low_2 lagged_close_2
## open                                 0.899507963  0.898749140    0.900008205
## high                                 0.898401651  0.897598088    0.898801509
## low                                  0.897414225  0.896792447    0.897862053
## close                                0.896849158  0.896130248    0.897204030
## volume                              -0.451602976 -0.452349108   -0.452602279
## percent_change_price                 0.038684777  0.039242225    0.036864590
## percent_change_volume_over_last_wk  -0.028477492 -0.026803630   -0.026783489
## previous_weeks_volume               -0.426405693 -0.427658806   -0.427644761
## percent_change_next_weeks_price      0.045351660  0.047615541    0.044540162
## percent_return_next_dividend        -0.123957197 -0.118962452   -0.121693221
## lagged_open                          0.950054464  0.950152236    0.950982254
## lagged_high                          0.949703781  0.949636656    0.950565383
## lagged_low                           0.948703494  0.948879227    0.949683270
## lagged_close                         0.948344664  0.948322916    0.949263466
## lagged_volume                       -0.486878884 -0.488697380   -0.488537105
## lagged_percent_change_price          0.065185086  0.062909267    0.064508267
## lagged_percent_change_volume        -0.005975442 -0.001578447   -0.001803112
## lagged_previous_weeks_volume        -0.459162831 -0.462642361   -0.462637890
## lagged_open_2                        0.999612959  0.999338540    0.999051557
## lagged_high_2                        1.000000000  0.999420212    0.999560885
## lagged_low_2                         0.999420212  1.000000000    0.999550165
## lagged_close_2                       0.999560885  0.999550165    1.000000000
## lagged_volume_2                     -0.521674406 -0.524559086   -0.524244645
## lagged_percent_change_price_2        0.092844465  0.097713521    0.112176520
## lagged_percent_change_volume_2      -0.006944765 -0.017965299   -0.016876672
## lagged_previous_weeks_volume_2      -0.511509561 -0.511692883   -0.511478704
##                                    lagged_volume_2
## open                                   -0.45729558
## high                                   -0.45682501
## low                                    -0.45631728
## close                                  -0.45673849
## volume                                  0.76287060
## percent_change_price                   -0.08868801
## percent_change_volume_over_last_wk     -0.03894409
## previous_weeks_volume                   0.76468762
## percent_change_next_weeks_price        -0.07713344
## percent_return_next_dividend           -0.24852715
## lagged_open                            -0.49429725
## lagged_high                            -0.49392545
## lagged_low                             -0.49363429
## lagged_close                           -0.49368816
## lagged_volume                           0.85492670
## lagged_percent_change_price            -0.10074022
## lagged_percent_change_volume           -0.11910046
## lagged_previous_weeks_volume            0.91191460
## lagged_open_2                          -0.52280882
## lagged_high_2                          -0.52167441
## lagged_low_2                           -0.52455909
## lagged_close_2                         -0.52424465
## lagged_volume_2                         1.00000000
## lagged_percent_change_price_2          -0.17343031
## lagged_percent_change_volume_2          0.18168842
## lagged_previous_weeks_volume_2          0.88764091
##                                    lagged_percent_change_price_2
## open                                                  0.10790929
## high                                                  0.10485179
## low                                                   0.10523920
## close                                                 0.10367740
## volume                                               -0.09670732
## percent_change_price                                 -0.05868190
## percent_change_volume_over_last_wk                    0.03188575
## previous_weeks_volume                                -0.08401117
## percent_change_next_weeks_price                      -0.02608339
## percent_return_next_dividend                          0.05014172
## lagged_open                                           0.11321572
## lagged_high                                           0.11314663
## lagged_low                                            0.11420733
## lagged_close                                          0.11431806
## lagged_volume                                        -0.10232930
## lagged_percent_change_price                           0.01764445
## lagged_percent_change_volume                          0.09057581
## lagged_previous_weeks_volume                         -0.17970581
## lagged_open_2                                         0.07602671
## lagged_high_2                                         0.09284446
## lagged_low_2                                          0.09771352
## lagged_close_2                                        0.11217652
## lagged_volume_2                                      -0.17343031
## lagged_percent_change_price_2                         1.00000000
## lagged_percent_change_volume_2                       -0.23423721
## lagged_previous_weeks_volume_2                       -0.06898473
##                                    lagged_percent_change_volume_2
## open                                                 -0.018581918
## high                                                 -0.016801425
## low                                                  -0.018023002
## close                                                -0.015833720
## volume                                                0.014565902
## percent_change_price                                  0.032584790
## percent_change_volume_over_last_wk                   -0.094053579
## previous_weeks_volume                                 0.047365406
## percent_change_next_weeks_price                       0.022217830
## percent_return_next_dividend                         -0.028946753
## lagged_open                                          -0.018729515
## lagged_high                                          -0.018119874
## lagged_low                                           -0.018314096
## lagged_close                                         -0.018522460
## lagged_volume                                         0.036588531
## lagged_percent_change_price                           0.026537817
## lagged_percent_change_volume                         -0.306943116
## lagged_previous_weeks_volume                          0.179549751
## lagged_open_2                                        -0.008932521
## lagged_high_2                                        -0.006944765
## lagged_low_2                                         -0.017965299
## lagged_close_2                                       -0.016876672
## lagged_volume_2                                       0.181688418
## lagged_percent_change_price_2                        -0.234237214
## lagged_percent_change_volume_2                        1.000000000
## lagged_previous_weeks_volume_2                       -0.129468003
##                                    lagged_previous_weeks_volume_2
## open                                                 -0.444195334
## high                                                 -0.443882454
## low                                                  -0.443276492
## close                                                -0.444135813
## volume                                                0.755844701
## percent_change_price                                 -0.103639132
## percent_change_volume_over_last_wk                   -0.003574668
## previous_weeks_volume                                 0.733099256
## percent_change_next_weeks_price                      -0.087827222
## percent_return_next_dividend                         -0.246088210
## lagged_open                                          -0.477029802
## lagged_high                                          -0.476824532
## lagged_low                                           -0.476583284
## lagged_close                                         -0.476700159
## lagged_volume                                         0.819078202
## lagged_percent_change_price                          -0.115402337
## lagged_percent_change_volume                         -0.037078170
## lagged_previous_weeks_volume                          0.798235770
## lagged_open_2                                        -0.512071004
## lagged_high_2                                        -0.511509561
## lagged_low_2                                         -0.511692883
## lagged_close_2                                       -0.511478704
## lagged_volume_2                                       0.887640905
## lagged_percent_change_price_2                        -0.068984727
## lagged_percent_change_volume_2                       -0.129468003
## lagged_previous_weeks_volume_2                        1.000000000

After analyzing the correlation, our lags do not seem to be correlated with our predictor variable. This means that adding our lags to the model will more than likely not add anything or improve the model.

Train-test split

We are going to use 1st quarter as training and 2nd quarter as testing.

train_df_removed <- subset(df_NA_removed, quarter == 1)
test_df_removed <- subset(df_NA_removed, quarter == 2)

train_df_inputted <- subset(df_NA_inputted, quarter == 1)
test_df_inputted <- subset(df_NA_inputted, quarter == 2)

Linear Model (Removed)

lm_model_full_r <- lm(percent_change_next_weeks_price ~ stock + date + open +
                        high + low + close + volume +
                        percent_change_price + percent_change_volume_over_last_wk +
                        previous_weeks_volume + percent_return_next_dividend,
                      data = train_df_removed)

summary(lm_model_full_r)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + date + 
##     open + high + low + close + volume + percent_change_price + 
##     percent_change_volume_over_last_wk + previous_weeks_volume + 
##     percent_return_next_dividend, data = train_df_removed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0884  -1.2144   0.0623   1.2512   8.1557 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                         4.056e+00  9.599e+01   0.042  0.96632   
## stockAXP                            7.109e+00  3.726e+00   1.908  0.05736 . 
## stockBA                             1.480e+01  6.893e+00   2.147  0.03265 * 
## stockBAC                           -2.513e+00  2.398e+00  -1.048  0.29554   
## stockCAT                            2.525e+01  8.626e+00   2.928  0.00368 **
## stockCSCO                          -2.576e+00  1.535e+00  -1.678  0.09447 . 
## stockCVX                            2.338e+01  9.958e+00   2.348  0.01956 * 
## stockDD                             8.622e+00  6.505e+00   1.325  0.18608   
## stockDIS                            4.391e+00  6.820e+00   0.644  0.52019   
## stockGE                            -1.554e+00  3.716e+00  -0.418  0.67603   
## stockHD                             3.474e+00  4.718e+00   0.736  0.46213   
## stockHPQ                            7.331e+00  2.779e+00   2.638  0.00880 **
## stockIBM                            4.332e+01  1.379e+01   3.142  0.00185 **
## stockINTC                          -3.103e+00  4.753e+00  -0.653  0.51443   
## stockJNJ                            9.393e+00  7.862e+00   1.195  0.23316   
## stockJPM                            7.038e+00  4.570e+00   1.540  0.12465   
## stockKO                             1.219e+01  7.098e+00   1.717  0.08708 . 
## stockKRFT                           6.193e-01  5.917e+00   0.105  0.91672   
## stockMCD                            1.474e+01  8.425e+00   1.749  0.08130 . 
## stockMMM                            2.056e+01  8.574e+00   2.398  0.01711 * 
## stockMRK                           -6.000e-01  7.452e+00  -0.081  0.93589   
## stockMSFT                          -6.131e-01  3.513e+00  -0.175  0.86158   
## stockPFE                           -3.085e+00  5.915e+00  -0.522  0.60235   
## stockPG                             1.048e+01  7.599e+00   1.380  0.16874   
## stockT                             -3.275e+00  9.559e+00  -0.343  0.73212   
## stockTRV                            1.099e+01  6.121e+00   1.795  0.07370 . 
## stockUTX                            1.873e+01  7.614e+00   2.459  0.01450 * 
## stockVZ                             2.532e-01  8.985e+00   0.028  0.97754   
## stockWMT                            8.616e+00  6.032e+00   1.428  0.15425   
## stockXOM                            1.833e+01  7.558e+00   2.426  0.01589 * 
## date                                2.621e-05  6.436e-03   0.004  0.99675   
## open                               -4.017e-01  2.282e-01  -1.761  0.07937 . 
## high                                4.289e-01  2.556e-01   1.678  0.09439 . 
## low                                -1.648e-01  2.489e-01  -0.662  0.50843   
## close                              -1.823e-01  3.013e-01  -0.605  0.54565   
## volume                              1.829e-09  2.952e-09   0.619  0.53609   
## percent_change_price               -1.051e-01  1.072e-01  -0.981  0.32740   
## percent_change_volume_over_last_wk -3.569e-03  4.537e-03  -0.787  0.43206   
## previous_weeks_volume              -7.041e-10  2.176e-09  -0.324  0.74646   
## percent_return_next_dividend        5.175e+00  6.597e+00   0.785  0.43338   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.341 on 290 degrees of freedom
## Multiple R-squared:  0.1911, Adjusted R-squared:  0.08232 
## F-statistic: 1.757 on 39 and 290 DF,  p-value: 0.005192

Multicollinearity

vif(lm_model_full_r)
##                                            GVIF Df GVIF^(1/(2*Df))
## stock                              9.780008e+05 29        1.268474
## date                               1.222277e+00  1        1.105567
## open                               3.118355e+03  1       55.842233
## high                               4.094441e+03  1       63.987820
## low                                3.603692e+03  1       60.030755
## close                              5.549627e+03  1       74.495818
## volume                             1.405339e+01  1        3.748785
## percent_change_price               4.457538e+00  1        2.111288
## percent_change_volume_over_last_wk 2.488086e+00  1        1.577367
## previous_weeks_volume              8.992118e+00  1        2.998686
## percent_return_next_dividend       2.685693e+02  1       16.388085

In this block of code we ran and re-ran until we dropped all multicollinear variables in order to not have a bunch of useless models.

Had to remove close, high, low, open, and xpercent_return_next_dividend

lm_model_r <- lm(percent_change_next_weeks_price ~ stock + date +
                        volume + percent_change_price + percent_change_volume_over_last_wk +
                        previous_weeks_volume,
                      data = train_df_removed)

vif(lm_model_r)
##                                         GVIF Df GVIF^(1/(2*Df))
## stock                              14.775943 29        1.047526
## date                                1.028527  1        1.014163
## volume                             12.675917  1        3.560325
## percent_change_price                1.328936  1        1.152795
## percent_change_volume_over_last_wk  1.927818  1        1.388459
## previous_weeks_volume               8.352334  1        2.890041
summary(lm_model_r)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + date + 
##     volume + percent_change_price + percent_change_volume_over_last_wk + 
##     previous_weeks_volume, data = train_df_removed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.8389  -1.3704   0.1016   1.4537   7.5444 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                         7.833e+01  9.247e+01   0.847   0.3976  
## stockAXP                           -3.657e-01  1.084e+00  -0.337   0.7361  
## stockBA                             9.668e-02  1.096e+00   0.088   0.9297  
## stockBAC                           -3.171e+00  2.241e+00  -1.415   0.1581  
## stockCAT                            1.011e+00  1.094e+00   0.924   0.3564  
## stockCSCO                          -3.083e+00  1.215e+00  -2.537   0.0117 *
## stockCVX                            1.142e+00  1.085e+00   1.052   0.2937  
## stockDD                             7.381e-01  1.091e+00   0.677   0.4992  
## stockDIS                            7.476e-01  1.077e+00   0.694   0.4883  
## stockGE                            -3.150e-01  1.156e+00  -0.272   0.7855  
## stockHD                            -1.903e-01  1.077e+00  -0.177   0.8599  
## stockHPQ                           -1.262e+00  1.054e+00  -1.197   0.2321  
## stockIBM                            7.227e-01  1.099e+00   0.658   0.5114  
## stockINTC                          -1.325e+00  1.118e+00  -1.186   0.2366  
## stockJNJ                           -7.195e-01  1.073e+00  -0.670   0.5032  
## stockJPM                           -7.886e-02  1.043e+00  -0.076   0.9398  
## stockKO                             4.312e-01  1.081e+00   0.399   0.6904  
## stockKRFT                          -1.099e-01  1.080e+00  -0.102   0.9191  
## stockMCD                           -1.226e-01  1.083e+00  -0.113   0.9100  
## stockMMM                            2.902e-02  1.107e+00   0.026   0.9791  
## stockMRK                           -7.421e-01  1.053e+00  -0.704   0.4817  
## stockMSFT                          -1.897e+00  1.132e+00  -1.675   0.0949 .
## stockPFE                            3.443e-01  1.095e+00   0.315   0.7533  
## stockPG                            -8.504e-01  1.074e+00  -0.792   0.4291  
## stockT                             -9.040e-02  1.041e+00  -0.087   0.9308  
## stockTRV                            6.792e-01  1.103e+00   0.616   0.5385  
## stockUTX                            3.117e-01  1.104e+00   0.282   0.7780  
## stockVZ                             2.580e-01  1.050e+00   0.246   0.8061  
## stockWMT                           -4.268e-01  1.066e+00  -0.400   0.6891  
## stockXOM                            1.110e-01  1.047e+00   0.106   0.9157  
## date                               -5.201e-03  6.154e-03  -0.845   0.3987  
## volume                              2.185e-09  2.923e-09   0.747   0.4554  
## percent_change_price               -5.785e-02  6.099e-02  -0.949   0.3436  
## percent_change_volume_over_last_wk  2.495e-04  4.162e-03   0.060   0.9522  
## previous_weeks_volume               8.847e-11  2.186e-09   0.040   0.9677  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.44 on 295 degrees of freedom
## Multiple R-squared:  0.106,  Adjusted R-squared:  0.002954 
## F-statistic: 1.029 on 34 and 295 DF,  p-value: 0.4292

Stepwise Linear Model

step_lm_model_r <- step(lm_model_r, direction = 'both', trace = FALSE)
summary(step_lm_model_r)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ previous_weeks_volume, 
##     data = train_df_removed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.3217  -1.2889   0.0459   1.4861   6.0230 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)   
## (Intercept)            4.637e-01  1.637e-01   2.832  0.00492 **
## previous_weeks_volume -2.126e-09  7.494e-10  -2.837  0.00483 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.418 on 328 degrees of freedom
## Multiple R-squared:  0.02395,    Adjusted R-squared:  0.02098 
## F-statistic:  8.05 on 1 and 328 DF,  p-value: 0.004834

We ended up a single variable. It is also statistically significant.

All other variables are either multicollinear, or they were dropped in stepwise selection.

Our stepwise linear model on NA removed dataset explains about 2.049% of the variance in percent_change_next_weeks_price.

Assumption Check

par(mfrow = c(2, 2))
plot(step_lm_model_r)

par(mfrow = c(1, 1))

Our Diagnostic plots are not concerning. Residuals vs Fitted show a random pattern which suggest our residuals don’t have non-linear patterns.

Our Q-Q plot suggest normal residuals.

Our Scale-Location plot suggests that our residuals are not spread equally along the predictors. Assumption of equal variance might be rejected.

Our residuals vs leverage show a few potential high leverage points.

summary(influence.measures(step_lm_model_r))
## Potentially influential observations of
##   lm(formula = percent_change_next_weeks_price ~ previous_weeks_volume,      data = train_df_removed) :
## 
##     dfb.1_ dfb.pr__ dffit   cov.r   cook.d hat    
## 4    0.10   0.01     0.13    0.97_*  0.01   0.00  
## 17   0.15  -0.06     0.15    0.97_*  0.01   0.00  
## 38   0.28  -0.59    -0.60_*  1.20_*  0.18   0.17_*
## 39   0.10  -0.29    -0.30_*  1.04_*  0.04   0.04_*
## 40  -0.29   0.70     0.71_*  1.04_*  0.25   0.07_*
## 41  -0.20   0.46     0.47_*  1.08_*  0.11   0.09_*
## 42  -0.03   0.09     0.09    1.05_*  0.00   0.04_*
## 43   0.00  -0.01    -0.01    1.05_*  0.00   0.04_*
## 44   0.00  -0.02    -0.03    1.02_*  0.00   0.01  
## 45  -0.06   0.19     0.20    1.04_*  0.02   0.04_*
## 46   0.01  -0.04    -0.04    1.04_*  0.00   0.03_*
## 47   0.16  -0.44    -0.46_*  1.03_*  0.10   0.05_*
## 48  -0.03   0.08     0.08    1.05_*  0.00   0.05_*
## 64   0.07   0.05     0.13    0.98_*  0.01   0.00  
## 65  -0.13  -0.29    -0.47_*  0.78_*  0.10   0.00  
## 67  -0.06   0.14     0.15    1.09_*  0.01   0.08_*
## 101  0.15  -0.07     0.15    0.97_*  0.01   0.00  
## 127 -0.15   0.07    -0.16    0.97_*  0.01   0.00  
## 139 -0.24   0.06    -0.26_*  0.89_*  0.03   0.00  
## 268  0.10   0.03     0.14    0.97_*  0.01   0.00  
## 276  0.00   0.01     0.02    1.02_*  0.00   0.01  
## 300  0.09   0.02     0.12    0.98_*  0.01   0.00

A lot of our influential points seem to come from BAC stock which could mean we are removing meaningful behavior.

Instead of deleting these points, we will try transformations which we kind of already knew were going to be needed due to the skewness of our data

LM model NO INF OBS

lm_model_r_ni <- lm(percent_change_next_weeks_price ~ previous_weeks_volume, 
    data = train_df_removed[-c(4, 17, 101, 127, 139, 232, 
                                                 268, 276, 300), ])
summary(lm_model_r_ni)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ previous_weeks_volume, 
##     data = train_df_removed[-c(4, 17, 101, 127, 139, 232, 268, 
##         276, 300), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.3407  -1.3413   0.0056   1.4529   5.9841 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)   
## (Intercept)            5.063e-01  1.619e-01   3.127  0.00193 **
## previous_weeks_volume -2.215e-09  7.361e-10  -3.009  0.00283 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.365 on 319 degrees of freedom
## Multiple R-squared:  0.0276, Adjusted R-squared:  0.02455 
## F-statistic: 9.054 on 1 and 319 DF,  p-value: 0.00283
par(mfrow = c(2, 2))
plot(lm_model_r_ni)

par(mfrow = c(1, 1))

RMSE

# Calculate RMSE
predictions <- predict(lm_model_r_ni, newdata = test_df_removed)  # Get predictions
actuals <- test_df_removed$percent_change_next_weeks_price  # Actual values
rmse <- sqrt(mean((actuals - predictions)^2))  # Compute RMSE
print(rmse)
## [1] 2.836362

Linear Model (Inputted)

lm_model_full_i <- lm(percent_change_next_weeks_price ~ stock + date + open + 
                        high + low + close + volume + 
                        percent_change_price + percent_change_volume_over_last_wk + 
                        previous_weeks_volume, 
                      data = train_df_inputted)

summary(lm_model_full_i)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + date + 
##     open + high + low + close + volume + percent_change_price + 
##     percent_change_volume_over_last_wk + previous_weeks_volume, 
##     data = train_df_inputted)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.5451  -1.2325   0.0115   1.3152   7.6999 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -1.054e+01  8.503e+01  -0.124 0.901457    
## stockAXP                            1.005e+01  2.030e+00   4.949 1.20e-06 ***
## stockBA                             1.919e+01  3.562e+00   5.389 1.38e-07 ***
## stockBAC                           -2.559e+00  2.112e+00  -1.212 0.226520    
## stockCAT                            2.982e+01  5.321e+00   5.605 4.49e-08 ***
## stockCSCO                          -1.066e+00  1.131e+00  -0.942 0.346941    
## stockCVX                            2.945e+01  5.209e+00   5.654 3.47e-08 ***
## stockDD                             1.359e+01  2.451e+00   5.543 6.19e-08 ***
## stockDIS                            9.653e+00  1.845e+00   5.233 3.02e-07 ***
## stockGE                             1.574e+00  1.086e+00   1.450 0.148126    
## stockHD                             7.549e+00  1.613e+00   4.678 4.27e-06 ***
## stockHPQ                            9.188e+00  2.033e+00   4.519 8.75e-06 ***
## stockIBM                            4.992e+01  9.005e+00   5.544 6.17e-08 ***
## stockINTC                           9.970e-01  1.079e+00   0.924 0.356253    
## stockJNJ                            1.528e+01  2.982e+00   5.124 5.17e-07 ***
## stockJPM                            1.051e+01  2.058e+00   5.107 5.61e-07 ***
## stockKO                             1.712e+01  3.142e+00   5.449 1.01e-07 ***
## stockKRFT                           5.612e+00  1.368e+00   4.103 5.18e-05 ***
## stockMCD                            2.039e+01  3.818e+00   5.342 1.75e-07 ***
## stockMMM                            2.592e+01  4.728e+00   5.482 8.50e-08 ***
## stockMRK                            4.948e+00  1.452e+00   3.407 0.000741 ***
## stockMSFT                           2.399e+00  1.253e+00   1.915 0.056374 .  
## stockPFE                            1.774e+00  1.034e+00   1.716 0.087094 .  
## stockPG                             1.616e+01  3.133e+00   5.156 4.41e-07 ***
## stockT                              4.428e+00  1.224e+00   3.617 0.000346 ***
## stockTRV                            1.532e+01  2.771e+00   5.527 6.74e-08 ***
## stockUTX                            2.341e+01  4.240e+00   5.522 6.93e-08 ***
## stockVZ                             7.119e+00  1.567e+00   4.544 7.82e-06 ***
## stockWMT                            1.316e+01  2.572e+00   5.117 5.37e-07 ***
## stockXOM                            2.299e+01  4.187e+00   5.491 8.13e-08 ***
## date                                1.062e-03  5.678e-03   0.187 0.851692    
## open                               -4.001e-01  2.191e-01  -1.826 0.068745 .  
## high                                4.892e-01  2.418e-01   2.023 0.043898 *  
## low                                -1.355e-01  2.400e-01  -0.565 0.572666    
## close                              -3.074e-01  2.799e-01  -1.099 0.272808    
## volume                              4.622e-09  2.153e-09   2.146 0.032586 *  
## percent_change_price               -8.520e-02  9.834e-02  -0.866 0.386905    
## percent_change_volume_over_last_wk -7.305e-03  3.885e-03  -1.880 0.060959 .  
## previous_weeks_volume              -3.061e-09  1.514e-09  -2.022 0.044001 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.36 on 321 degrees of freedom
## Multiple R-squared:  0.2013, Adjusted R-squared:  0.1067 
## F-statistic: 2.129 on 38 and 321 DF,  p-value: 0.0002404

Multicollinearity (Recheck)

Here we redo vif() just to make sure the variables removed in our first model are the same.

vif(lm_model_full_i)
##                                           GVIF Df GVIF^(1/(2*Df))
## stock                              3927.030715 29        1.153365
## date                                  1.216898  1        1.103131
## open                               3052.736796  1       55.251577
## high                               3880.393244  1       62.292803
## low                                3559.092096  1       59.658127
## close                              5072.945663  1       71.224614
## volume                                9.148146  1        3.024590
## percent_change_price                  4.095707  1        2.023785
## percent_change_volume_over_last_wk    1.796414  1        1.340304
## previous_weeks_volume                 4.467358  1        2.113613
lm_model_i <- lm(percent_change_next_weeks_price ~ stock + date + 
                        volume + percent_change_price + percent_change_volume_over_last_wk + 
                        previous_weeks_volume, 
                      data = train_df_inputted)

vif(lm_model_i)
##                                         GVIF Df GVIF^(1/(2*Df))
## stock                              14.574951 29        1.047279
## date                                1.058982  1        1.029069
## volume                              8.928571  1        2.988071
## percent_change_price                1.253402  1        1.119554
## percent_change_volume_over_last_wk  1.466034  1        1.210799
## previous_weeks_volume               4.422164  1        2.102894

close, high, low, and open were removed the same as last model.

Stepwise Linear Model

step_lm_model_i <- step(lm_model_i, direction = 'both', trace = FALSE)
summary(step_lm_model_i)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ volume + previous_weeks_volume, 
##     data = train_df_inputted)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.350  -1.275   0.040   1.529   5.983 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            4.856e-01  1.615e-01   3.007 0.002829 ** 
## volume                 2.162e-09  1.111e-09   1.946 0.052473 .  
## previous_weeks_volume -4.024e-09  1.118e-09  -3.600 0.000364 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.456 on 357 degrees of freedom
## Multiple R-squared:  0.0382, Adjusted R-squared:  0.03282 
## F-statistic:  7.09 on 2 and 357 DF,  p-value: 0.0009557

We ended up with 2 variables instead of just the 1 in the same model with different dataset.

volume is barely non-significant.

All other variables are either multicollinear, or they were dropped in stepwise selection.

Our stepwise linear model on NA inputted dataset explains about 3.282% of the variance in percent_change_next_weeks_price.

Better than model on df_NA_removed

Assumption Check

par(mfrow = c(2, 2))
plot(step_lm_model_i)

par(mfrow = c(1, 1))

Almost the same results as our model on the Removed NA’s dataset.

Residuals vs fitted does seem to be more along the 0.

Scale-Location also seems to be more along the 0 and curves at the end.

summary(influence.measures(step_lm_model_i))
## Potentially influential observations of
##   lm(formula = percent_change_next_weeks_price ~ volume + previous_weeks_volume,      data = train_df_inputted) :
## 
##     dfb.1_ dfb.volm dfb.pr__ dffit   cov.r   cook.d  hat    
## 4    0.09   0.01     0.00     0.12    0.97_*  0.01    0.00  
## 15  -0.13   0.02     0.03    -0.13    0.97_*  0.01    0.00  
## 17   0.14  -0.04     0.00     0.14    0.96_*  0.01    0.00  
## 37  -0.29   1.79_*  -1.40_*   1.80_*  1.66_*  1.07_*  0.41_*
## 38   0.12   0.15    -0.36    -0.40_*  1.25_*  0.05    0.19_*
## 39   0.19  -0.31    -0.01    -0.48_*  1.06_*  0.08    0.07_*
## 40  -0.32   0.31     0.20     0.72_*  1.06_*  0.17    0.09_*
## 41  -0.18  -0.06     0.37     0.51_*  1.08_*  0.09    0.08_*
## 42  -0.03   0.02     0.03     0.07    1.05_*  0.00    0.04_*
## 43  -0.01  -0.01     0.02     0.02    1.05_*  0.00    0.04_*
## 44   0.03  -0.08     0.02    -0.10    1.04_*  0.00    0.04_*
## 45  -0.07   0.06     0.07     0.19    1.04_*  0.01    0.04_*
## 46   0.03  -0.04    -0.01    -0.09    1.06_*  0.00    0.05_*
## 47   0.19  -0.15    -0.19    -0.49_*  1.02    0.08    0.05_*
## 48  -0.03   0.01     0.04     0.07    1.06_*  0.00    0.05_*
## 61   0.00   0.01    -0.01     0.01    1.03_*  0.00    0.02  
## 65  -0.12   0.00    -0.20    -0.46_*  0.71_*  0.06    0.00  
## 66   0.05  -0.27     0.19    -0.28_*  1.15_*  0.03    0.13_*
## 67  -0.06  -0.18     0.32     0.33_*  1.12_*  0.04    0.11_*
## 101  0.14  -0.02    -0.02     0.14    0.96_*  0.01    0.00  
## 119  0.00   0.00     0.00     0.00    1.03_*  0.00    0.02  
## 120  0.00  -0.14     0.24     0.26    1.01    0.02    0.03_*
## 127 -0.15   0.04     0.01    -0.15    0.96_*  0.01    0.00  
## 139 -0.24   0.10    -0.05    -0.26    0.86_*  0.02    0.00  
## 157  0.00   0.03    -0.03     0.03    1.04_*  0.00    0.03_*
## 241 -0.21  -0.06     0.13    -0.24    0.91_*  0.02    0.00  
## 253 -0.01  -0.05     0.04    -0.05    1.03_*  0.00    0.02  
## 256  0.00  -0.03     0.02    -0.03    1.03_*  0.00    0.02  
## 265 -0.01  -0.04     0.04    -0.05    1.04_*  0.00    0.03_*
## 268  0.05   0.13    -0.07     0.18    0.97_*  0.01    0.01  
## 275  0.00  -0.03     0.02    -0.03    1.03_*  0.00    0.02  
## 276  0.00  -0.02     0.04     0.04    1.03_*  0.00    0.02  
## 300  0.07   0.02     0.00     0.12    0.97_*  0.00    0.00

Almost the same as the model on Removed NA’s dataset.

Hopefully transformations will help make these points less influential since we would be removing an entire stock.

LM Model NO INF OBS

lm_model_i_ni <- lm(percent_change_next_weeks_price ~ volume + previous_weeks_volume, 
    data = train_df_inputted[-c(4, 17, 101, 127, 139, 232, 
                                                 268, 276, 300), ])
summary(lm_model_i_ni)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ volume + previous_weeks_volume, 
##     data = train_df_inputted[-c(4, 17, 101, 127, 139, 232, 268, 
##         276, 300), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.304  -1.249   0.075   1.514   5.584 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            4.674e-01  1.528e-01   3.059 0.002390 ** 
## volume                 1.929e-09  1.046e-09   1.844 0.065970 .  
## previous_weeks_volume -3.924e-09  1.056e-09  -3.716 0.000236 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.3 on 348 degrees of freedom
## Multiple R-squared:  0.04332,    Adjusted R-squared:  0.03782 
## F-statistic: 7.878 on 2 and 348 DF,  p-value: 0.0004505
par(mfrow = c(2, 2))
plot(lm_model_i_ni)

par(mfrow = c(1, 1))

RMSE

# Calculate RMSE
predictions <- predict(lm_model_i_ni, newdata = test_df_inputted)  # Get predictions
actuals <- test_df_inputted$percent_change_next_weeks_price  # Actual values
rmse <- sqrt(mean((actuals - predictions)^2))  # Compute RMSE
print(rmse)
## [1] 2.839142

Modeling with Transformations

We are going to run skewness() function from the e1071 package to test for skewness.

Based on this and our previous histograms we are going to decide if and which variables to apply transformations to.

Checking skewness for both Inputted and Removed Datasets

Interpreting the results:

  • Skewness > 0 → Right-skewed (positive skew)

  • Skewness < 0 → Left-skewed (negative skew)

  • Skewness ≈ 0 → Approximately symmetric

#Skewness for all numeric variables in our df_NA_removed
knitr::kable(sapply(df_NA_removed[, sapply(df_NA_removed, is.numeric)], skewness))
x
open 1.2663722
high 1.2663619
low 1.2759798
close 1.2702365
volume 2.8568194
percent_change_price -0.4122834
percent_change_volume_over_last_wk 2.5334104
previous_weeks_volume 3.2429757
percent_change_next_weeks_price -0.1366288
percent_return_next_dividend 0.3967358
#Skewness for all numeric variables in our df_NA_inputted
knitr::kable(sapply(df_NA_inputted[, sapply(df_NA_inputted, is.numeric)], skewness))
x
open 1.2643025
high 1.2643332
low 1.2747240
close 1.2692821
volume 3.2105227
percent_change_price -0.3793271
percent_change_volume_over_last_wk 2.5996699
previous_weeks_volume 3.2776377
percent_change_next_weeks_price -0.1716834
percent_return_next_dividend 0.3927708

For both datasets we get pretty much the same results with a little variation in the numbers.

percent_change_next_weeks_price is the only normally distributed variable.

Left skewed variables:
percent_change_price

Right skewed variables:

open, high, low, close, volume, percent_change_volume_over_last_wk, and previous_weeks_volume.

previous_weeks_volume and percent_change_volume_over_last_wk are Highly right skewed

Transformed Linear Model (Removed)

skewness(log(df_NA_removed$open + 1))
## [1] -0.1056558
skewness(log(df_NA_removed$high + 1))
## [1] -0.1028366
skewness(log(df_NA_removed$low + 1))
## [1] -0.1130192
skewness(log(df_NA_removed$close + 1))
## [1] -0.1119091
skewness(log(df_NA_removed$volume + 1)) 
## [1] 0.5366983
skewness(sign(df_NA_removed$percent_change_price) * abs(df_NA_removed$percent_change_price)^(1/3)) #cube root transformation
## [1] -0.01831936
skewness(sign(df_NA_removed$percent_change_volume_over_last_wk) * abs(df_NA_removed$percent_change_volume_over_last_wk)^(1/3)) #cube root transformation
## [1] 0.09067206
skewness(log(df_NA_removed$previous_weeks_volume + 1)) 
## [1] 0.5419658

Since percent_change_price won’t allow us to do it’s transformation inside the formula, we create a new variable to use the transformation in our model.

train_df_removed$percent_change_price_transformed <- sign(train_df_removed$percent_change_price) * abs(train_df_removed$percent_change_price)^(1/3)

This full model is now using the optimal transformations for each variable.

t_lm_model_full_r <- lm(percent_change_next_weeks_price ~ 
                          stock + date + 
                          log(open + 1) + 
                          log(high + 1) + 
                          log(low + 1) + 
                          log(close + 1) + 
                          log(volume + 1) + 
                          percent_change_price_transformed + 
                          percent_change_volume_over_last_wk + 
                          log(previous_weeks_volume + 1), 
                      data = train_df_removed)

summary(t_lm_model_full_r)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + date + 
##     log(open + 1) + log(high + 1) + log(low + 1) + log(close + 
##     1) + log(volume + 1) + percent_change_price_transformed + 
##     percent_change_volume_over_last_wk + log(previous_weeks_volume + 
##     1), data = train_df_removed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1822  -1.1612   0.0794   1.3467   8.7922 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         5.592e+01  8.999e+01   0.621  0.53479    
## stockAXP                            2.261e+01  4.013e+00   5.635 4.13e-08 ***
## stockBA                             3.387e+01  5.821e+00   5.819 1.56e-08 ***
## stockBAC                           -7.082e+00  1.640e+00  -4.319 2.15e-05 ***
## stockCAT                            4.226e+01  7.168e+00   5.896 1.03e-08 ***
## stockCSCO                          -4.784e-01  1.396e+00  -0.343  0.73200    
## stockCVX                            4.139e+01  7.104e+00   5.826 1.51e-08 ***
## stockDD                             2.750e+01  4.614e+00   5.961 7.26e-09 ***
## stockDIS                            2.171e+01  3.719e+00   5.838 1.41e-08 ***
## stockGE                             3.304e+00  1.453e+00   2.274  0.02368 *  
## stockHD                             1.817e+01  3.280e+00   5.539 6.81e-08 ***
## stockHPQ                            2.067e+01  4.043e+00   5.112 5.79e-07 ***
## stockIBM                            5.240e+01  9.035e+00   5.800 1.73e-08 ***
## stockINTC                           3.312e+00  1.576e+00   2.101  0.03653 *  
## stockJNJ                            2.846e+01  5.223e+00   5.449 1.08e-07 ***
## stockJPM                            2.180e+01  4.163e+00   5.235 3.17e-07 ***
## stockKO                             3.091e+01  5.393e+00   5.732 2.48e-08 ***
## stockKRFT                           1.465e+01  2.681e+00   5.465 9.98e-08 ***
## stockMCD                            3.416e+01  6.031e+00   5.664 3.55e-08 ***
## stockMMM                            3.947e+01  6.759e+00   5.839 1.40e-08 ***
## stockMRK                            1.453e+01  2.902e+00   5.006 9.65e-07 ***
## stockMSFT                           7.722e+00  2.352e+00   3.283  0.00115 ** 
## stockPFE                            2.780e+00  1.288e+00   2.159  0.03169 *  
## stockPG                             2.948e+01  5.392e+00   5.468 9.78e-08 ***
## stockT                              1.142e+01  2.356e+00   4.845 2.06e-06 ***
## stockTRV                            2.994e+01  5.023e+00   5.962 7.21e-09 ***
## stockUTX                            3.763e+01  6.391e+00   5.888 1.08e-08 ***
## stockVZ                             1.746e+01  3.205e+00   5.447 1.09e-07 ***
## stockWMT                            2.590e+01  4.766e+00   5.434 1.17e-07 ***
## stockXOM                            3.531e+01  6.440e+00   5.484 9.05e-08 ***
## date                               -9.309e-04  6.011e-03  -0.155  0.87703    
## log(open + 1)                      -9.821e+00  1.745e+01  -0.563  0.57411    
## log(high + 1)                       1.746e+01  1.735e+01   1.006  0.31515    
## log(low + 1)                        1.951e+01  1.513e+01   1.289  0.19838    
## log(close + 1)                     -4.974e+01  1.712e+01  -2.906  0.00394 ** 
## log(volume + 1)                     1.099e+00  1.314e+00   0.837  0.40346    
## percent_change_price_transformed    2.735e-01  2.756e-01   0.992  0.32185    
## percent_change_volume_over_last_wk -2.857e-03  1.099e-02  -0.260  0.79516    
## log(previous_weeks_volume + 1)      1.586e-01  1.224e+00   0.130  0.89698    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.293 on 291 degrees of freedom
## Multiple R-squared:  0.2214, Adjusted R-squared:  0.1197 
## F-statistic: 2.177 on 38 and 291 DF,  p-value: 0.0001769

We can already see adjusted R-squared increased from 0.08353 to 0.1197

Multicollinearity

vif(t_lm_model_full_r)
##                                           GVIF Df GVIF^(1/(2*Df))
## stock                              7352.325784 29        1.165903
## date                                  1.111689  1        1.054367
## log(open + 1)                      6488.596534  1       80.551825
## log(high + 1)                      6427.826500  1       80.173727
## log(low + 1)                       4927.769768  1       70.198075
## log(close + 1)                     6311.229996  1       79.443250
## log(volume + 1)                     108.909771  1       10.435984
## percent_change_price_transformed      6.662224  1        2.581128
## percent_change_volume_over_last_wk   15.235487  1        3.903266
## log(previous_weeks_volume + 1)       96.801153  1        9.838758

had to remove open, high, low, and close.

t_lm_model_r <- lm(percent_change_next_weeks_price ~ 
                          stock + date + 
                          log(volume + 1) + 
                          percent_change_price_transformed + 
                          percent_change_volume_over_last_wk + 
                          log(previous_weeks_volume + 1), 
                      data = train_df_removed)

vif(t_lm_model_r)
##                                         GVIF Df GVIF^(1/(2*Df))
## stock                              32.055606 29        1.061607
## date                                1.050758  1        1.025065
## log(volume + 1)                    89.729022  1        9.472540
## percent_change_price_transformed    1.221011  1        1.104994
## percent_change_volume_over_last_wk 10.206338  1        3.194736
## log(previous_weeks_volume + 1)     75.215769  1        8.672702

Transformed Stepwise lm R

t_step_lm_model_r <- step(t_lm_model_r, direction = 'both', trace = FALSE)
summary(t_step_lm_model_r)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ log(previous_weeks_volume + 
##     1), data = train_df_removed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1788  -1.3173   0.0646   1.5207   6.1766 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                      6.2236     2.3822   2.613   0.0094 **
## log(previous_weeks_volume + 1)  -0.3334     0.1315  -2.535   0.0117 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.424 on 328 degrees of freedom
## Multiple R-squared:  0.01922,    Adjusted R-squared:  0.01623 
## F-statistic: 6.429 on 1 and 328 DF,  p-value: 0.01169

Model ends up using the same 1 variable as non-transformed model. log(previous_weeks_volume)

Also performs slightly worse. Adj. R-squared dropped from 0.02098 to 0.01623

Assumption Check

par(mfrow = c(2, 2))
plot(t_step_lm_model_r)

par(mfrow = c(1, 1))

All plots seem to be behaving a lot normally than the models with no transformations.

summary(influence.measures(t_step_lm_model_r))
## Potentially influential observations of
##   lm(formula = percent_change_next_weeks_price ~ log(previous_weeks_volume +      1), data = train_df_removed) :
## 
##     dfb.1_ dfb.l(+1 dffit   cov.r   cook.d hat    
## 4   -0.08   0.09     0.16    0.97_*  0.01   0.00  
## 17   0.05  -0.05     0.15    0.97_*  0.01   0.00  
## 38   0.32  -0.33    -0.35_*  1.01    0.06   0.03_*
## 39   0.21  -0.21    -0.23    1.01    0.03   0.02_*
## 40  -0.29   0.30     0.32_*  1.00    0.05   0.02_*
## 41  -0.15   0.15     0.16    1.02_*  0.01   0.02_*
## 42  -0.02   0.02     0.03    1.03_*  0.00   0.02_*
## 43   0.03  -0.04    -0.04    1.03_*  0.00   0.02_*
## 44   0.03  -0.03    -0.03    1.02_*  0.00   0.01  
## 45  -0.10   0.10     0.11    1.02_*  0.01   0.02_*
## 46   0.05  -0.05    -0.06    1.02_*  0.00   0.02_*
## 47   0.30  -0.30    -0.33_*  0.99    0.05   0.02_*
## 48  -0.01   0.01     0.01    1.03_*  0.00   0.02_*
## 64  -0.12   0.12     0.18    0.98_*  0.02   0.01  
## 65   0.46  -0.48    -0.60_*  0.78_*  0.16   0.01  
## 67  -0.01   0.01     0.01    1.03_*  0.00   0.02_*
## 101  0.08  -0.07     0.16    0.97_*  0.01   0.00  
## 127 -0.10   0.09    -0.17    0.97_*  0.01   0.00  
## 139  0.02  -0.04    -0.25_*  0.90_*  0.03   0.00  
## 232 -0.02   0.02    -0.02    1.02_*  0.00   0.01  
## 268 -0.10   0.11     0.18    0.97_*  0.02   0.00  
## 276 -0.01   0.01     0.01    1.02_*  0.00   0.01  
## 300 -0.09   0.10     0.16    0.98_*  0.01   0.00

RMSE

# Calculate RMSE
predictions <- predict(t_step_lm_model_r, newdata = test_df_removed)  # Get predictions
actuals <- test_df_removed$percent_change_next_weeks_price  # Actual values
rmse <- sqrt(mean((actuals - predictions)^2))  # Compute RMSE
print(rmse)
## [1] 2.843812

Transformed Linear Model (Inputted)

#To be able to use transformation of percent_change_price 
train_df_inputted$percent_change_price_transformed <- sign(train_df_inputted$percent_change_price) * abs(train_df_inputted$percent_change_price)^(1/3)
t_lm_model_full_i <- lm(percent_change_next_weeks_price ~ 
                          stock + date + 
                          log(open + 1) + 
                          log(high + 1) + 
                          log(low + 1) + 
                          log(close + 1) + 
                          log(volume + 1) + 
                          percent_change_price_transformed + 
                          percent_change_volume_over_last_wk + 
                          log(previous_weeks_volume + 1), 
                      data = train_df_inputted)

summary(t_lm_model_full_i)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + date + 
##     log(open + 1) + log(high + 1) + log(low + 1) + log(close + 
##     1) + log(volume + 1) + percent_change_price_transformed + 
##     percent_change_volume_over_last_wk + log(previous_weeks_volume + 
##     1), data = train_df_inputted)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5666  -1.2241   0.1678   1.3135   8.2590 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         4.417e+01  9.106e+01   0.485 0.627967    
## stockAXP                            2.277e+01  3.790e+00   6.007 5.12e-09 ***
## stockBA                             3.336e+01  5.475e+00   6.093 3.16e-09 ***
## stockBAC                           -5.704e+00  1.577e+00  -3.616 0.000347 ***
## stockCAT                            4.153e+01  6.729e+00   6.171 2.04e-09 ***
## stockCSCO                           4.296e-01  1.325e+00   0.324 0.746029    
## stockCVX                            4.072e+01  6.665e+00   6.109 2.89e-09 ***
## stockDD                             2.734e+01  4.348e+00   6.289 1.04e-09 ***
## stockDIS                            2.143e+01  3.495e+00   6.131 2.56e-09 ***
## stockGE                             3.666e+00  1.359e+00   2.698 0.007336 ** 
## stockHD                             1.839e+01  3.091e+00   5.949 7.07e-09 ***
## stockHPQ                            2.094e+01  3.798e+00   5.512 7.29e-08 ***
## stockIBM                            5.142e+01  8.481e+00   6.063 3.75e-09 ***
## stockINTC                           3.862e+00  1.498e+00   2.578 0.010390 *  
## stockJNJ                            2.820e+01  4.937e+00   5.713 2.54e-08 ***
## stockJPM                            2.186e+01  3.902e+00   5.604 4.52e-08 ***
## stockKO                             3.048e+01  5.089e+00   5.991 5.61e-09 ***
## stockKRFT                           1.476e+01  2.557e+00   5.775 1.82e-08 ***
## stockMCD                            3.358e+01  5.688e+00   5.903 9.09e-09 ***
## stockMMM                            3.895e+01  6.368e+00   6.116 2.78e-09 ***
## stockMRK                            1.418e+01  2.775e+00   5.112 5.50e-07 ***
## stockMSFT                           8.215e+00  2.231e+00   3.682 0.000271 ***
## stockPFE                            3.061e+00  1.221e+00   2.508 0.012651 *  
## stockPG                             2.932e+01  5.090e+00   5.761 1.96e-08 ***
## stockT                              1.154e+01  2.240e+00   5.154 4.47e-07 ***
## stockTRV                            2.958e+01  4.736e+00   6.245 1.34e-09 ***
## stockUTX                            3.701e+01  6.023e+00   6.146 2.36e-09 ***
## stockVZ                             1.709e+01  3.027e+00   5.645 3.63e-08 ***
## stockWMT                            2.582e+01  4.496e+00   5.744 2.15e-08 ***
## stockXOM                            3.499e+01  6.023e+00   5.810 1.51e-08 ***
## date                               -1.581e-04  6.097e-03  -0.026 0.979330    
## log(open + 1)                      -8.782e+00  1.610e+01  -0.546 0.585762    
## log(high + 1)                       1.507e+01  1.657e+01   0.910 0.363538    
## log(low + 1)                        2.349e+01  1.474e+01   1.593 0.112032    
## log(close + 1)                     -5.183e+01  1.586e+01  -3.267 0.001203 ** 
## log(volume + 1)                     1.183e+00  6.834e-01   1.730 0.084535 .  
## percent_change_price_transformed    3.269e-01  2.473e-01   1.322 0.187163    
## percent_change_volume_over_last_wk -3.255e-03  4.291e-03  -0.759 0.448702    
## log(previous_weeks_volume + 1)     -2.188e-02  2.932e-02  -0.746 0.456123    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.336 on 321 degrees of freedom
## Multiple R-squared:  0.2174, Adjusted R-squared:  0.1247 
## F-statistic: 2.346 on 38 and 321 DF,  p-value: 3.436e-05

Multicollinearity

vif(t_lm_model_full_i)
##                                           GVIF Df GVIF^(1/(2*Df))
## stock                              5600.343135 29        1.160445
## date                                  1.431738  1        1.196552
## log(open + 1)                      5783.706012  1       76.050681
## log(high + 1)                      6125.080931  1       78.262896
## log(low + 1)                       4894.526710  1       69.960894
## log(close + 1)                     5665.392775  1       75.268803
## log(volume + 1)                      31.517979  1        5.614088
## percent_change_price_transformed      5.684867  1        2.384296
## percent_change_volume_over_last_wk    2.237066  1        1.495682
## log(previous_weeks_volume + 1)        1.470997  1        1.212847

removed same 4 variables as previous model.

t_lm_model_i <- lm(percent_change_next_weeks_price ~ 
                          stock + date + 
                          log(volume + 1) + 
                          percent_change_price_transformed + 
                          percent_change_volume_over_last_wk + 
                          log(previous_weeks_volume + 1), 
                      data = train_df_inputted)

vif(t_lm_model_i)
##                                         GVIF Df GVIF^(1/(2*Df))
## stock                              27.279018 29        1.058658
## date                                1.356846  1        1.164837
## log(volume + 1)                    25.616579  1        5.061282
## percent_change_price_transformed    1.168733  1        1.081079
## percent_change_volume_over_last_wk  1.813435  1        1.346638
## log(previous_weeks_volume + 1)      1.456620  1        1.206905

Transformed Stepwise lm I

t_step_lm_model_i <- step(t_lm_model_i, direction = 'both', trace = FALSE)
summary(t_step_lm_model_i)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ log(volume + 1) + 
##     log(previous_weeks_volume + 1), data = train_df_inputted)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.269  -1.229   0.056   1.546   6.847 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                     5.16570    2.33246   2.215   0.0274 * 
## log(volume + 1)                -0.20813    0.12952  -1.607   0.1089   
## log(previous_weeks_volume + 1) -0.06736    0.02572  -2.619   0.0092 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.468 on 357 degrees of freedom
## Multiple R-squared:  0.02882,    Adjusted R-squared:  0.02338 
## F-statistic: 5.298 on 2 and 357 DF,  p-value: 0.005404

Adjusted R-squared dropped from .032 to .02338.

Assumption Check

par(mfrow = c(2, 2))
plot(t_step_lm_model_i)

par(mfrow = c(1, 1))

Once again, all of our plots seem to behave a lot better than the models with no transformations.

summary(influence.measures(t_step_lm_model_i))
## Potentially influential observations of
##   lm(formula = percent_change_next_weeks_price ~ log(volume + 1) +      log(previous_weeks_volume + 1), data = train_df_inputted) :
## 
##     dfb.1_ dfb.l(+1) dfb.l(_+1 dffit   cov.r   cook.d hat    
## 1    0.12  -0.20      0.43     -0.47_*  1.00    0.07   0.04_*
## 4   -0.09   0.09      0.04      0.17    0.96_*  0.01   0.00  
## 13   0.05   0.00     -0.22      0.24    1.03_*  0.02   0.03_*
## 17   0.08  -0.08      0.04      0.15    0.96_*  0.01   0.00  
## 25  -0.01   0.00      0.04     -0.05    1.04_*  0.00   0.03_*
## 37  -0.43   0.54     -0.57      0.76_*  1.01    0.19   0.06_*
## 38   0.25  -0.25     -0.07     -0.29_*  0.99    0.03   0.02  
## 40  -0.28   0.27      0.05      0.30_*  1.00    0.03   0.02  
## 42  -0.02   0.02      0.00      0.02    1.03_*  0.00   0.02  
## 44   0.03  -0.03     -0.01     -0.03    1.03_*  0.00   0.02  
## 46   0.06  -0.05     -0.01     -0.06    1.03_*  0.00   0.02  
## 47   0.29  -0.28     -0.06     -0.32_*  0.98    0.03   0.02  
## 48   0.00   0.00      0.00      0.00    1.03_*  0.00   0.02  
## 49  -0.02   0.01      0.05     -0.06    1.04_*  0.00   0.03_*
## 61   0.00   0.01     -0.01      0.02    1.05_*  0.00   0.04_*
## 64  -0.15   0.15      0.04      0.20    0.97_*  0.01   0.01  
## 65   0.40  -0.39     -0.14     -0.56_*  0.72_*  0.09   0.01  
## 66  -0.03   0.03      0.00      0.03    1.03_*  0.00   0.02  
## 73   0.01   0.00     -0.04      0.04    1.04_*  0.00   0.03_*
## 85   0.04  -0.02     -0.11      0.11    1.04_*  0.00   0.03_*
## 97   0.00  -0.01      0.05     -0.05    1.04_*  0.00   0.03_*
## 101  0.04  -0.04      0.03      0.14    0.96_*  0.01   0.00  
## 109  0.00   0.00      0.00      0.00    1.05_*  0.00   0.04_*
## 121  0.04   0.01     -0.26      0.27    1.02    0.02   0.03_*
## 127 -0.11   0.11     -0.03     -0.17    0.96_*  0.01   0.00  
## 133  0.00   0.03     -0.13      0.14    1.04_*  0.01   0.03_*
## 139 -0.09   0.09     -0.09     -0.26    0.86_*  0.02   0.00  
## 145  0.01  -0.01     -0.03      0.03    1.04_*  0.00   0.03_*
## 157 -0.02   0.03     -0.06      0.07    1.05_*  0.00   0.04_*
## 169 -0.01   0.00      0.07     -0.08    1.04_*  0.00   0.03_*
## 181 -0.05   0.09     -0.20      0.22    1.04_*  0.02   0.04_*
## 193  0.00   0.00      0.01     -0.01    1.04_*  0.00   0.03_*
## 205 -0.01   0.00      0.05     -0.06    1.04_*  0.00   0.03_*
## 217  0.00  -0.02      0.12     -0.12    1.04_*  0.01   0.03_*
## 229  0.04  -0.03     -0.07      0.09    1.04_*  0.00   0.04_*
## 241 -0.05  -0.09      0.71     -0.74_*  0.91_*  0.17   0.03_*
## 253  0.02  -0.03      0.06     -0.06    1.05_*  0.00   0.04_*
## 265  0.01  -0.02      0.03     -0.03    1.05_*  0.00   0.04_*
## 268 -0.18   0.18      0.04      0.23    0.96_*  0.02   0.01  
## 277  0.00   0.00     -0.02      0.02    1.04_*  0.00   0.03_*
## 289  0.02  -0.04      0.12     -0.13    1.04_*  0.01   0.04_*
## 300 -0.10   0.10      0.04      0.16    0.97_*  0.01   0.01  
## 301  0.04  -0.02     -0.08      0.09    1.04_*  0.00   0.03_*
## 313 -0.03   0.02      0.05     -0.05    1.04_*  0.00   0.04_*
## 325  0.04  -0.10      0.32     -0.34_*  1.02    0.04   0.04_*
## 337  0.01   0.00     -0.05      0.06    1.04_*  0.00   0.03_*
## 349  0.00   0.04     -0.17      0.18    1.04_*  0.01   0.03_*

RMSE

# Calculate RMSE
predictions <- predict(t_step_lm_model_i, newdata = test_df_inputted)  # Get predictions
actuals <- test_df_inputted$percent_change_next_weeks_price  # Actual values
rmse <- sqrt(mean((actuals - predictions)^2))  # Compute RMSE
print(rmse)
## [1] 2.839231

Transformed Linear Models NO INF OBS———

TLM (removed)

t_lm_model_full_r_ni <- lm(percent_change_next_weeks_price ~ 
                          stock + date + 
                          log(open + 1) + 
                          log(high + 1) + 
                          log(low + 1) + 
                          log(close + 1) + 
                          log(volume + 1) + 
                          percent_change_price_transformed + 
                          percent_change_volume_over_last_wk + 
                          log(previous_weeks_volume + 1), 
                      data = train_df_removed[-c(4, 17, 101, 127, 139, 232, 
                                                 268, 276, 300), ])

vif(t_lm_model_full_r_ni)
##                                           GVIF Df GVIF^(1/(2*Df))
## stock                              7765.907359 29        1.167004
## date                                  1.112477  1        1.054740
## log(open + 1)                      6612.568873  1       81.317703
## log(high + 1)                      6520.929541  1       80.752273
## log(low + 1)                       5089.765047  1       71.342589
## log(close + 1)                     6455.649689  1       80.347058
## log(volume + 1)                     106.992529  1       10.343719
## percent_change_price_transformed      6.673583  1        2.583328
## percent_change_volume_over_last_wk   14.944679  1        3.865835
## log(previous_weeks_volume + 1)       94.902425  1        9.741788

Removed open, high, close, low .

t_lm_model_r_ni <- lm(percent_change_next_weeks_price ~ 
                          stock + date + 
                          log(volume + 1) + 
                          percent_change_price_transformed + 
                          percent_change_volume_over_last_wk + 
                          log(previous_weeks_volume + 1), 
                      data = train_df_removed[-c(4, 17, 101, 127, 139, 232, 
                                                 268, 276, 300), ])

vif(t_lm_model_r_ni)
##                                         GVIF Df GVIF^(1/(2*Df))
## stock                              32.499758 29        1.061859
## date                                1.046881  1        1.023172
## log(volume + 1)                    88.631048  1        9.414406
## percent_change_price_transformed    1.237671  1        1.112506
## percent_change_volume_over_last_wk  9.979608  1        3.159052
## log(previous_weeks_volume + 1)     73.906037  1        8.596862
t_step_lm_model_r_ni <- step(t_lm_model_r_ni, direction = 'both', trace = FALSE)
summary(t_step_lm_model_r_ni)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ log(volume + 1), 
##     data = train_df_removed[-c(4, 17, 101, 127, 139, 232, 268, 
##         276, 300), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2348  -1.3293   0.0021   1.4818   6.3440 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       6.3016     2.3975   2.628  0.00899 **
## log(volume + 1)  -0.3366     0.1326  -2.539  0.01160 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.375 on 319 degrees of freedom
## Multiple R-squared:  0.0198, Adjusted R-squared:  0.01673 
## F-statistic: 6.445 on 1 and 319 DF,  p-value: 0.0116

Also performs slightly worse. Adj. R-squared dropped from 0.02098 to 0.01623, back up to 0.01673.

RMSE

# Calculate RMSE
predictions <- predict(t_step_lm_model_r_ni, newdata = test_df_removed)  # Get predictions
actuals <- test_df_removed$percent_change_next_weeks_price  # Actual values
rmse <- sqrt(mean((actuals - predictions)^2))  # Compute RMSE
print(rmse)
## [1] 2.844462

TLM(inputted)

t_lm_model_full_i_ni <- lm(percent_change_next_weeks_price ~ 
                          stock + date + 
                          log(open + 1) + 
                          log(high + 1) + 
                          log(low + 1) + 
                          log(close + 1) + 
                          log(volume + 1) + 
                          percent_change_price_transformed + 
                          percent_change_volume_over_last_wk + 
                          log(previous_weeks_volume + 1), 
                      data = train_df_inputted[-c(4, 17, 101, 127, 139, 232, 
                                                 268, 276, 300), ])

vif(t_lm_model_full_i_ni)
##                                           GVIF Df GVIF^(1/(2*Df))
## stock                              5841.723739 29        1.161289
## date                                  1.429843  1        1.195761
## log(open + 1)                      5874.638856  1       76.646193
## log(high + 1)                      6100.789330  1       78.107550
## log(low + 1)                       4998.728237  1       70.701685
## log(close + 1)                     5647.001938  1       75.146536
## log(volume + 1)                      31.713684  1        5.631490
## percent_change_price_transformed      5.740249  1        2.395882
## percent_change_volume_over_last_wk    2.263377  1        1.504452
## log(previous_weeks_volume + 1)        1.484811  1        1.218528
t_lm_model_i_ni <- lm(percent_change_next_weeks_price ~ 
                          stock + date + 
                          log(volume + 1) + 
                          percent_change_price_transformed + 
                          percent_change_volume_over_last_wk + 
                          log(previous_weeks_volume + 1), 
                      data = train_df_inputted[-c(4, 17, 101, 127, 139, 232, 
                                                 268, 276, 300), ])

vif(t_lm_model_i_ni)
##                                         GVIF Df GVIF^(1/(2*Df))
## stock                              27.636845 29        1.058896
## date                                1.369514  1        1.170262
## log(volume + 1)                    25.787878  1        5.078177
## percent_change_price_transformed    1.171139  1        1.082192
## percent_change_volume_over_last_wk  1.794374  1        1.339542
## log(previous_weeks_volume + 1)      1.464272  1        1.210071
t_step_lm_model_i_ni <- step(t_lm_model_i_ni, direction = 'both', trace = FALSE)
summary(t_step_lm_model_i_ni)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ log(volume + 1) + 
##     log(previous_weeks_volume + 1), data = train_df_inputted[-c(4, 
##     17, 101, 127, 139, 232, 268, 276, 300), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1547  -1.1956   0.0968   1.4837   7.0296 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                     6.22365    2.19659   2.833  0.00488 **
## log(volume + 1)                -0.26693    0.12193  -2.189  0.02924 * 
## log(previous_weeks_volume + 1) -0.06935    0.02404  -2.885  0.00416 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.303 on 348 degrees of freedom
## Multiple R-squared:  0.04074,    Adjusted R-squared:  0.03523 
## F-statistic:  7.39 on 2 and 348 DF,  p-value: 0.0007195

Adjusted R-squared dropped from .032 to .02338.

With this model Adjusted R-squared went up to 0.03523

RMSE

# Calculate RMSE
predictions <- predict(t_step_lm_model_i_ni, newdata = test_df_inputted)  # Get predictions
actuals <- test_df_inputted$percent_change_next_weeks_price  # Actual values
rmse <- sqrt(mean((actuals - predictions)^2))  # Compute RMSE
print(rmse)
## [1] 2.843356

———————————————————————————-

Linear Models Lagged Variables

Since we found out after that our lagged variables aren’t good to introduce to our models, we will comment this part which now becomes useless.

LM on Removed

# lagged_lm_model_full_r <- lm(percent_change_next_weeks_price ~ stock + date + open +
#                         high + low + close + volume +
#                         percent_change_price + percent_change_volume_over_last_wk +
#                         previous_weeks_volume +
#                         lagged_open + lagged_high + lagged_low + lagged_close + lagged_volume +
#                         lagged_percent_change_price + lagged_percent_change_volume +
#                         lagged_open_2 + lagged_high_2 + lagged_low_2 + lagged_close_2 +
#                         lagged_volume_2 +
#                         lagged_percent_change_price_2 + lagged_percent_change_volume_2 +
#                         lagged_previous_weeks_volume + lagged_previous_weeks_volume_2,
#                       data = train_df_removed)
# 
# 
# vif(lagged_lm_model_full_r)

we remove the same 4 variables and their lags. open, close, high, low.

# lagged_lm_model_r <- lm(percent_change_next_weeks_price ~ stock + date + volume +
#                         percent_change_price + percent_change_volume_over_last_wk +
#                         previous_weeks_volume +
#                         lagged_volume +
#                         lagged_percent_change_price + lagged_percent_change_volume +
#                         lagged_volume_2 +
#                         lagged_percent_change_price_2 + lagged_percent_change_volume_2,
#                       data = train_df_removed)
# 
# vif(lagged_lm_model_r)
# lagged_step_lm_model_r <- step(lagged_lm_model_r, direction = 'both', trace = FALSE)
# summary(lagged_step_lm_model_r)

We end up with 2 variables instead of just 1. They are not statistically significant, but they explain a higher variance of 2.587%.

RMSE*** BEST RMSE SO FAR

# predictions <- predict(lagged_step_lm_model_r, newdata = test_df_removed)  # Get predictions
# actuals <- test_df_removed$percent_change_next_weeks_price  # Actual values
# rmse <- sqrt(mean((actuals - predictions)^2))  # Compute RMSE
# print(rmse)

LM on Inputted

# lagged_lm_model_full_i <- lm(percent_change_next_weeks_price ~ stock + date + open +
#                         high + low + close + volume +
#                         percent_change_price + percent_change_volume_over_last_wk +
#                         previous_weeks_volume +
#                         lagged_open + lagged_high + lagged_low + lagged_close + lagged_volume +
#                         lagged_percent_change_price + lagged_percent_change_volume +
#                         lagged_open_2 + lagged_high_2 + lagged_low_2 + lagged_close_2 +
#                         lagged_volume_2 +
#                         lagged_percent_change_price_2 + lagged_percent_change_volume_2,
#                       data = train_df_inputted)
# 
# 
# vif(lagged_lm_model_full_i)

Remove same 4 and their lags.

# lagged_lm_model_i <- lm(percent_change_next_weeks_price ~ stock + date + volume +
#                         percent_change_price + percent_change_volume_over_last_wk +
#                         previous_weeks_volume +
#                         lagged_volume +
#                         lagged_percent_change_price + lagged_percent_change_volume +
#                         lagged_volume_2 +
#                         lagged_percent_change_price_2 + lagged_percent_change_volume_2,
#                       data = train_df_inputted)
# 
# vif(lagged_lm_model_i)
# lagged_step_lm_model_i <- step(lagged_lm_model_i, direction = 'both', trace = FALSE)
# summary(lagged_step_lm_model_i)

Increase in Variance explained by the model.

Also uses more variables.

RMSE

# predictions <- predict(lagged_step_lm_model_i, newdata = test_df_inputted)  # Get predictions
# actuals <- test_df_inputted$percent_change_next_weeks_price  # Actual values
# rmse <- sqrt(mean((actuals - predictions)^2))  # Compute RMSE
# print(rmse)

Transformed LM on Removed

# lagged_t_lm_model_full_r <- lm(lm(percent_change_next_weeks_price ~ 
#     stock + date + 
#     log(open + 1) + 
#     log(high + 1) + 
#     log(low + 1) + 
#     log(close + 1) + 
#     log(volume + 1) + 
#     percent_change_price_transformed + 
#     percent_change_volume_over_last_wk + 
#     log(previous_weeks_volume + 1) + 
#     
#     # Lagged variables with corresponding transformations
#     log(lagged_open + 1) + 
#     log(lagged_high + 1) + 
#     log(lagged_low + 1) + 
#     log(lagged_close + 1) + 
#     log(lagged_volume + 1) +
#     log(lagged_open_2 + 1) + 
#     log(lagged_high_2 + 1) + 
#     log(lagged_low_2 + 1) + 
#     log(lagged_close_2 + 1) + 
#     log(lagged_volume_2 + 1),
#     data = train_df_removed)
# )
# 
# summary(lagged_t_lm_model_full_r)
# vif(lagged_t_lm_model_full_r)
# lagged_t_lm_model_r <- lm(percent_change_next_weeks_price ~ 
#     stock + date + 
#     log(volume + 1) + 
#     percent_change_price_transformed + 
#     percent_change_volume_over_last_wk + 
#     log(previous_weeks_volume + 1) + 
#     
#     # Lagged variables with corresponding transformations
#     log(lagged_volume + 1) +
#     log(lagged_volume_2 + 1),
#     data = train_df_removed)
# 
# vif(lagged_t_lm_model_r)
# lagged_t_step_lm_model_r <- step(lagged_t_lm_model_r, direction = 'both', trace = FALSE)
# summary(lagged_t_step_lm_model_r)

RMSE

# predictions <- predict(lagged_t_step_lm_model_r, newdata = test_df_removed)  # Get predictions
# actuals <- test_df_removed$percent_change_next_weeks_price  # Actual values
# rmse <- sqrt(mean((actuals - predictions)^2))  # Compute RMSE
# print(rmse)

———————————————————————————

Decision Trees

DT on Removed

can’t use stock or date because they are not numeric

tree_model_r <- tree(percent_change_next_weeks_price ~ open +
                        high + low + close + volume +
                        percent_change_price + percent_change_volume_over_last_wk +
                        previous_weeks_volume + percent_return_next_dividend,
                      data = train_df_removed)
summary(tree_model_r)
## 
## Regression tree:
## tree(formula = percent_change_next_weeks_price ~ open + high + 
##     low + close + volume + percent_change_price + percent_change_volume_over_last_wk + 
##     previous_weeks_volume + percent_return_next_dividend, data = train_df_removed)
## Variables actually used in tree construction:
## [1] "percent_change_price"               "percent_return_next_dividend"      
## [3] "open"                               "volume"                            
## [5] "percent_change_volume_over_last_wk" "close"                             
## Number of terminal nodes:  18 
## Residual mean deviance:  4.233 = 1321 / 312 
## Distribution of residuals:
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -11.420000  -1.155000  -0.005521   0.000000   1.227000   4.891000
plot(tree_model_r)
text(tree_model_r)

RMSE

predictions <- predict(tree_model_r, newdata = test_df_removed)

actuals <- test_df_removed$percent_change_next_weeks_price

sqrt(mean((actuals - predictions)^2))
## [1] 3.292289

Pruning

set.seed(42)
cv_tree_model_r <- cv.tree(tree_model_r)
best_size <- cv_tree_model_r$size[which.min(cv_tree_model_r$dev)]
best_k <- cv_tree_model_r$k[which.min(cv_tree_model_r$dev)]

#plot the pruned tree
plot(cv_tree_model_r$size, cv_tree_model_r$dev, type = 'b')

plot(cv_tree_model_r$k, cv_tree_model_r$dev, type = 'b')

best size = 1

pruning at around 70 - 110 shows the most improvement in performance

Our plots show that the lower the size, the better our model. Also the higher the k(more pruning) complexity decreases and performance is better(lower error).

Pruning won’t work because our size is too small and based on what we know about our variables, it won’t be enough to actually make meaningful predictions

# p_tree_model_r <- prune.tree(tree_model_r, best = best_size)
# predictions <- predict(p_tree_model_r, newdata = test_df_removed)

DT on Inputted

tree_model_i <- tree(percent_change_next_weeks_price ~ open +
                        high + low + close + volume +
                        percent_change_price + percent_change_volume_over_last_wk +
                        previous_weeks_volume + percent_return_next_dividend,
                      data = train_df_inputted)
summary(tree_model_i)
## 
## Regression tree:
## tree(formula = percent_change_next_weeks_price ~ open + high + 
##     low + close + volume + percent_change_price + percent_change_volume_over_last_wk + 
##     previous_weeks_volume + percent_return_next_dividend, data = train_df_inputted)
## Variables actually used in tree construction:
## [1] "previous_weeks_volume"              "percent_change_price"              
## [3] "volume"                             "percent_change_volume_over_last_wk"
## [5] "percent_return_next_dividend"       "open"                              
## [7] "close"                             
## Number of terminal nodes:  14 
## Residual mean deviance:  4.712 = 1630 / 346 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -11.4100  -1.1790   0.2242   0.0000   1.2350   6.3080
plot(tree_model_i)
text(tree_model_i)

RMSE

predictions <- predict(tree_model_i, newdata = test_df_inputted)

actuals <- test_df_inputted$percent_change_next_weeks_price

sqrt(mean((actuals - predictions)^2))
## [1] 3.103051

Pruning

set.seed(42)
cv_tree_model_i <- cv.tree(tree_model_i)
best_size <- cv_tree_model_i$size[which.min(cv_tree_model_i$dev)]
best_k <- cv_tree_model_i$k[which.min(cv_tree_model_i$dev)]

#plot the pruned tree
plot(cv_tree_model_i$size, cv_tree_model_i$dev, type = 'b')

plot(cv_tree_model_i$k, cv_tree_model_i$dev, type = 'b')

Same as dataset with Removed NA’s.

Pruning results in size of 1 which will make our model have insignificant predictions.

———————————————————————————-

After trying to figure out multiple different variables in different models for our whole dataset, we are going to start training the models on each separate stock.

First of all, we found out that ‘removed’ dataset worked better than inputted.

Second, most of our linear models seem to only like to use previous_weeks_volume and it’s the only variable we ever get that is significant enough.

For LOOPS LM

lm_models = list()
lm_results = data.frame(stock = character(), RMSE = numeric(), stringsAsFactors = FALSE)


for (stock in unique(df_NA_removed$stock)) {
  subset_data <- df_NA_removed[df_NA_removed$stock == stock, ]
  train_data <- subset_data[subset_data$quarter == 1, ]
  test_data <- subset_data[subset_data$quarter == 2, ]
  
  model <- lm(percent_change_next_weeks_price ~ previous_weeks_volume, 
              data = train_data)
  
  lm_models[[stock]] <- model
  
  predictions <- predict(model, newdata = test_data)
  actuals <- test_data$percent_change_next_weeks_price
  rmse <- sqrt(mean((actuals - predictions)^2))
  
  lm_results <- rbind(lm_results, data.frame(stock = stock, RMSE = rmse))
  
  
}

RMSE

mean(lm_results$RMSE)
## [1] 2.893729

For LOOPS Decision Trees

dt_models = list()
dt_results = data.frame(stock = character(), RMSE = numeric(), stringsAsFactors = FALSE)

for (stock in unique(df_NA_removed$stock)) {
  subset_data <- df_NA_removed[df_NA_removed$stock == stock, ]
  train_data <- df_NA_removed[df_NA_removed$quarter == 1, ]
  test_data <- df_NA_removed[df_NA_removed$quarter == 2, ]
  
  model <- tree(percent_change_next_weeks_price ~ previous_weeks_volume, 
                 data = train_data)
  
  dt_models[[stock]] <- model
  
  predictions <- predict(model, newdata = test_data)
  actuals <- test_data$percent_change_next_weeks_price
  
  rmse <- sqrt(mean((actuals - predictions)^2))
  dt_results <- rbind(dt_results, data.frame(stock = stock, RMSE = rmse))
  
}

RMSE

mean(dt_results$RMSE)
## [1] 2.855011

For LOOPS SVR

svr_model <- svm(percent_change_next_weeks_price ~ previous_weeks_volume, 
                 data = train_data, kernel = 'radial')

svr_models <- list()
svr_results <- data.frame(stock = character(), RMSE = numeric(), stringsAsFactors = FALSE)

for (stock in unique(df_NA_removed$stock)) {
  subset_data <- df_NA_removed[df_NA_removed$stock == stock, ]
  train_data <- df_NA_removed[df_NA_removed$quarter == 1, ]
  test_data <- df_NA_removed[df_NA_removed$quarter == 2, ]
  
  model <- svm(percent_change_next_weeks_price ~ previous_weeks_volume, 
                 data = train_data, kernel = 'radial')
  
  svr_models[[stock]] <- model
  
  predictions <- predict(model, newdata = test_data)
  actuals <- test_data$percent_change_next_weeks_price
  
  rmse <- sqrt(mean((actuals - predictions)^2))
  svr_results <- rbind(svr_results, data.frame(stock = stock, RMSE = rmse))
  
}

RMSE

mean(svr_results$RMSE)
## [1] 2.843871

———————————————————————————-

CAPM

Download Yahoo finance for S&P500

library(tidyquant)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.26     ✔ xts                  0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                   masks base::as.Date()
## ✖ zoo::as.Date.numeric()           masks base::as.Date.numeric()
## ✖ dplyr::filter()                  masks stats::filter()
## ✖ xts::first()                     masks dplyr::first()
## ✖ PerformanceAnalytics::kurtosis() masks e1071::kurtosis()
## ✖ dplyr::lag()                     masks stats::lag()
## ✖ xts::last()                      masks dplyr::last()
## ✖ PerformanceAnalytics::legend()   masks graphics::legend()
## ✖ car::recode()                    masks dplyr::recode()
## ✖ PerformanceAnalytics::skewness() masks e1071::skewness()
## ✖ quantmod::summary()              masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Get daily historical data for S&P 500 (^GSPC)
sp500_data = tq_get("^GSPC", from = "2011-01-14", to = "2011-06-24", get = "stock.prices", periodicity = "weekly")
# View first few rows
head(sp500_data)
## # A tibble: 6 × 8
##   symbol date        open  high   low close      volume adjusted
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>       <dbl>    <dbl>
## 1 ^GSPC  2011-01-10 1286. 1293. 1280. 1293.  8972430000    1293.
## 2 ^GSPC  2011-01-17 1293. 1296. 1271. 1283. 19899340000    1283.
## 3 ^GSPC  2011-01-24 1283. 1303. 1275. 1276. 23156650000    1276.
## 4 ^GSPC  2011-01-31 1276. 1311  1276. 1311. 21726860000    1311.
## 5 ^GSPC  2011-02-07 1312. 1331. 1312. 1329. 20109950000    1329.
## 6 ^GSPC  2011-02-14 1329. 1344. 1325. 1343. 12589110000    1343.
# Save data to CSV
write.csv(sp500_data, "SP500_data.csv", row.names = FALSE)

Compute the returns and remove missing values

#SP500
returnSP500 <- na.omit(Delt(sp500_data$close))

# loop for stocks in df_NA_removed
stock_returns <- list()

for (stock in unique(df_NA_removed$stock)) {
  stock_returns[[stock]] <- na.omit(Delt(df_NA_removed$close[df_NA_removed$stock == stock]))
}

stock_returns_df <- do.call(cbind, lapply(stock_returns, function(x) x[, 1]))

colnames(stock_returns_df) <- names(stock_returns)

stock_returns_df <- as.data.frame(stock_returns_df)

#add SP500 to dataframe
returnSP500_vector <- as.vector(returnSP500)

stock_returns_df$SP500 <- returnSP500_vector

See how the data looks

boxplot(stock_returns_df, main = 'Expected Return', xlab = 'Stock', ylab = 'Return')

Means and SD

DataMean <- apply(stock_returns_df, 2, mean)
DataSD <- apply(stock_returns_df, 2, sd)

cbind(DataMean, DataSD)
##            DataMean     DataSD
## AA    -0.0014758553 0.03486277
## AXP    0.0022992844 0.02809141
## BA     0.0011022005 0.02795967
## BAC   -0.0156049349 0.02911342
## CAT    0.0032480226 0.03411175
## CSCO  -0.0143473271 0.03971904
## CVX    0.0026118394 0.02482674
## DD     0.0022017080 0.02723925
## DIS   -0.0015588482 0.02808084
## GE    -0.0016902251 0.02574530
## HD    -0.0007599981 0.02198521
## HPQ   -0.0113893855 0.03927282
## IBM    0.0043318296 0.01836236
## INTC   0.0007425153 0.03258276
## JNJ    0.0019245403 0.02114399
## JPM   -0.0053481151 0.02181402
## KRFT   0.0044681344 0.01816930
## KO     0.0013550453 0.01664583
## MCD    0.0045364377 0.01954457
## MMM    0.0015614511 0.02073424
## MRK    0.0006072965 0.02069553
## MSFT  -0.0064307236 0.01898709
## PFE    0.0042807309 0.02655822
## PG    -0.0018427834 0.01779427
## T      0.0031507287 0.01934575
## TRV    0.0017787893 0.01920565
## UTX    0.0030140170 0.02130799
## VZ     0.0008114707 0.01799993
## WMT   -0.0017666125 0.01910305
## XOM   -0.0002927384 0.02512918
## SP500 -0.0002213339 0.01499232

Beta for each stock

beta_results <- data.frame(stock = character(), beta = numeric())

for (stock in unique(df_NA_removed$stock)) {
  lm_capm <- lm(stock_returns_df[[stock]] ~ stock_returns_df$SP500, data = stock_returns_df)
  
  beta <- coef(lm_capm)[2]
  
  beta_results <- rbind(beta_results, data.frame(stock = stock, beta = beta))
  
}
print(beta_results)
##                          stock      beta
## stock_returns_df$SP500      AA 1.5139715
## stock_returns_df$SP5001    AXP 0.8964116
## stock_returns_df$SP5002     BA 1.3487835
## stock_returns_df$SP5003    BAC 0.7006978
## stock_returns_df$SP5004    CAT 1.6273586
## stock_returns_df$SP5005   CSCO 0.7112362
## stock_returns_df$SP5006    CVX 0.8324819
## stock_returns_df$SP5007     DD 1.3509575
## stock_returns_df$SP5008    DIS 1.4134352
## stock_returns_df$SP5009     GE 1.0565302
## stock_returns_df$SP50010    HD 0.8387551
## stock_returns_df$SP50011   HPQ 1.1244543
## stock_returns_df$SP50012   IBM 0.7905043
## stock_returns_df$SP50013  INTC 1.1496233
## stock_returns_df$SP50014   JNJ 0.6804785
## stock_returns_df$SP50015   JPM 0.6300871
## stock_returns_df$SP50016  KRFT 0.2189592
## stock_returns_df$SP50017    KO 0.6385589
## stock_returns_df$SP50018   MCD 0.6457669
## stock_returns_df$SP50019   MMM 1.0278481
## stock_returns_df$SP50020   MRK 0.5219022
## stock_returns_df$SP50021  MSFT 0.6762358
## stock_returns_df$SP50022   PFE 0.6965205
## stock_returns_df$SP50023    PG 0.2733775
## stock_returns_df$SP50024     T 0.7897055
## stock_returns_df$SP50025   TRV 0.8666328
## stock_returns_df$SP50026   UTX 1.0654107
## stock_returns_df$SP50027    VZ 0.8204556
## stock_returns_df$SP50028   WMT 0.4493817
## stock_returns_df$SP50029   XOM 1.1084912