Load Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library(tidyr)
library(corrplot)
## corrplot 0.95 loaded
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(e1071)
library(tree)
library(caret)
## Loading required package: lattice
library(rpart)
df <- read.csv('/Users/ponce/Desktop/DA-6813/Case Study 3/dow_jones_index.data')
glimpse(df)
## Rows: 750
## Columns: 16
## $ quarter <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ stock <chr> "AA", "AA", "AA", "AA", "AA", "AA",…
## $ date <chr> "1/7/2011", "1/14/2011", "1/21/2011…
## $ open <chr> "$15.82", "$16.71", "$16.19", "$15.…
## $ high <chr> "$16.72", "$16.71", "$16.38", "$16.…
## $ low <chr> "$15.78", "$15.64", "$15.60", "$15.…
## $ close <chr> "$16.42", "$15.97", "$15.79", "$16.…
## $ volume <int> 239655616, 242963398, 138428495, 15…
## $ percent_change_price <dbl> 3.7926700, -4.4284900, -2.4706600, …
## $ percent_change_volume_over_last_wk <dbl> NA, 1.380223, -43.024959, 9.355500,…
## $ previous_weeks_volume <int> NA, 239655616, 242963398, 138428495…
## $ next_weeks_open <chr> "$16.71", "$16.19", "$15.87", "$16.…
## $ next_weeks_close <chr> "$15.97", "$15.79", "$16.13", "$17.…
## $ percent_change_next_weeks_price <dbl> -4.4284900, -2.4706600, 1.6383100, …
## $ days_to_next_dividend <int> 26, 19, 12, 5, 97, 90, 83, 76, 69, …
## $ percent_return_next_dividend <dbl> 0.182704, 0.187852, 0.189994, 0.185…
Need to deal with ‘$’ to make data into numeric instead of character.
df <- df |>
mutate(open = as.numeric(str_remove_all(open, '\\$')),
high = as.numeric(str_remove_all(high, '\\$')),
low = as.numeric(str_remove_all(low, '\\$')),
close = as.numeric(str_remove_all(close, '\\$')),
next_weeks_open = as.numeric(str_remove_all(next_weeks_open, '\\$')),
next_weeks_close = as.numeric(str_remove_all(next_weeks_close, '\\$')),
date = as.Date(date, format= '%m/%d/%Y'))
# Convert categorical into factor
df$quarter <- as.factor(df$quarter)
df$stock <- as.factor(df$stock)
glimpse(df)
## Rows: 750
## Columns: 16
## $ quarter <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ stock <fct> AA, AA, AA, AA, AA, AA, AA, AA, AA,…
## $ date <date> 2011-01-07, 2011-01-14, 2011-01-21…
## $ open <dbl> 15.82, 16.71, 16.19, 15.87, 16.18, …
## $ high <dbl> 16.72, 16.71, 16.38, 16.63, 17.39, …
## $ low <dbl> 15.78, 15.64, 15.60, 15.82, 16.18, …
## $ close <dbl> 16.42, 15.97, 15.79, 16.13, 17.14, …
## $ volume <int> 239655616, 242963398, 138428495, 15…
## $ percent_change_price <dbl> 3.7926700, -4.4284900, -2.4706600, …
## $ percent_change_volume_over_last_wk <dbl> NA, 1.380223, -43.024959, 9.355500,…
## $ previous_weeks_volume <int> NA, 239655616, 242963398, 138428495…
## $ next_weeks_open <dbl> 16.71, 16.19, 15.87, 16.18, 17.33, …
## $ next_weeks_close <dbl> 15.97, 15.79, 16.13, 17.14, 17.37, …
## $ percent_change_next_weeks_price <dbl> -4.4284900, -2.4706600, 1.6383100, …
## $ days_to_next_dividend <int> 26, 19, 12, 5, 97, 90, 83, 76, 69, …
## $ percent_return_next_dividend <dbl> 0.182704, 0.187852, 0.189994, 0.185…
count(df, stock)
## stock n
## 1 AA 25
## 2 AXP 25
## 3 BA 25
## 4 BAC 25
## 5 CAT 25
## 6 CSCO 25
## 7 CVX 25
## 8 DD 25
## 9 DIS 25
## 10 GE 25
## 11 HD 25
## 12 HPQ 25
## 13 IBM 25
## 14 INTC 25
## 15 JNJ 25
## 16 JPM 25
## 17 KO 25
## 18 KRFT 25
## 19 MCD 25
## 20 MMM 25
## 21 MRK 25
## 22 MSFT 25
## 23 PFE 25
## 24 PG 25
## 25 T 25
## 26 TRV 25
## 27 UTX 25
## 28 VZ 25
## 29 WMT 25
## 30 XOM 25
25 observations for each stock
count(df, quarter)
## quarter n
## 1 1 360
## 2 2 390
360 observations in q1 vs 390 in q2.
q2 has one more day if it’s a leap year, hence the 30 more observations
cor_matrix <- cor(df[, sapply(df, is.numeric)], use = 'complete.obs')
#Correlation plot
corrplot(cor_matrix, method = 'circle', type = 'lower', order = 'hclust', tl.col = 'black',
tl.srt = 45)
A lot of variables are correlated to each other as we expected. This could potentially lead to multicollinearity issues.
volume has a negative correlation with price variables
(open, high, low,
close) which indicates that when stock prices go
up, trading volume tends to go down and vice versa.
next_weeks_open and next_weeks_close have
moderate positive correlations with the current price variables as
expected. This suggests that prices are relatively consistent from week
to week.
sum(is.na(df))
## [1] 60
60 missing values, all come from
percent_change_volume_over_last_wk and
previous_week_volume.
For each of our stocks, these variables have no data on the exact first observation based on the date.
This makes sense because we have no data of the previous week.
What should be done about these missing values?
Exploratory Analysis continued
summary(df)
## quarter stock date open high
## 1:360 AA : 25 Min. :2011-01-07 Min. : 10.59 Min. : 10.94
## 2:390 AXP : 25 1st Qu.:2011-02-18 1st Qu.: 29.83 1st Qu.: 30.63
## BA : 25 Median :2011-04-01 Median : 45.97 Median : 46.88
## BAC : 25 Mean :2011-03-31 Mean : 53.65 Mean : 54.67
## CAT : 25 3rd Qu.:2011-05-13 3rd Qu.: 72.72 3rd Qu.: 74.29
## CSCO : 25 Max. :2011-06-24 Max. :172.11 Max. :173.54
## (Other):600
## low close volume percent_change_price
## Min. : 10.40 Min. : 10.52 Min. :9.719e+06 Min. :-15.42290
## 1st Qu.: 28.72 1st Qu.: 30.36 1st Qu.:3.087e+07 1st Qu.: -1.28805
## Median : 44.80 Median : 45.93 Median :5.306e+07 Median : 0.00000
## Mean : 52.64 Mean : 53.73 Mean :1.175e+08 Mean : 0.05026
## 3rd Qu.: 71.04 3rd Qu.: 72.67 3rd Qu.:1.327e+08 3rd Qu.: 1.65089
## Max. :167.82 Max. :170.58 Max. :1.453e+09 Max. : 9.88223
##
## percent_change_volume_over_last_wk previous_weeks_volume next_weeks_open
## Min. :-61.4332 Min. :9.719e+06 Min. : 10.52
## 1st Qu.:-19.8043 1st Qu.:3.068e+07 1st Qu.: 30.32
## Median : 0.5126 Median :5.295e+07 Median : 46.02
## Mean : 5.5936 Mean :1.174e+08 Mean : 53.70
## 3rd Qu.: 21.8006 3rd Qu.:1.333e+08 3rd Qu.: 72.72
## Max. :327.4089 Max. :1.453e+09 Max. :172.11
## NA's :30 NA's :30
## next_weeks_close percent_change_next_weeks_price days_to_next_dividend
## Min. : 10.52 Min. :-15.4229 Min. : 0.00
## 1st Qu.: 30.46 1st Qu.: -1.2221 1st Qu.: 24.00
## Median : 46.12 Median : 0.1012 Median : 47.00
## Mean : 53.89 Mean : 0.2385 Mean : 52.53
## 3rd Qu.: 72.92 3rd Qu.: 1.8456 3rd Qu.: 69.00
## Max. :174.54 Max. : 9.8822 Max. :336.00
##
## percent_return_next_dividend
## Min. :0.06557
## 1st Qu.:0.53455
## Median :0.68107
## Mean :0.69183
## 3rd Qu.:0.85429
## Max. :1.56421
##
#We use pivot_longer to reshape data into long format to be able to use facet_wrap
#Allows us to plot all histograms in one plot since we have all variables in one variable.
df_long <- df |>
pivot_longer(cols = c(open, high, low, close, volume, percent_change_price),
names_to = "variable", values_to = "value")
ggplot(df_long, aes(x = value)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.6) +
facet_wrap(~ variable, scales = "free") +
theme_minimal() +
labs(title = "Distributions of Numeric Variables")
Some of these aren’t too insightful since we are using all stocks to create the histograms, and not all stocks trade at the same prices or volumes.
We can see how close, open,
high and low have this up and down behavior
which shows the different prices of stocks.
percent_change_price is very interesting because
it looks like an almost perfect normal distribution. This alludes to the
50/50 behavior of the stock market. Normal distribution suggests
price changes behave symmetrically around the mean (0.05).
df_long2 <- df |>
pivot_longer(cols = c(percent_change_volume_over_last_wk, previous_weeks_volume,
next_weeks_open, next_weeks_close, percent_change_next_weeks_price,
days_to_next_dividend, percent_return_next_dividend),
names_to = "variable", values_to = "value")
ggplot(df_long2, aes(x = value)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.6) +
facet_wrap(~ variable, scales = "free") +
theme_minimal() +
labs(title = "Distributions of Numeric Variables")
## Warning: Removed 60 rows containing non-finite outside the scale range
## (`stat_bin()`).
next_weeks_close displays a decreasing behavior. This
could be due to the fact that most stocks aren’t as expensive to
purchase, so as price goes up, frequency goes down. This doesn’t tell us
much about the real behavior anyways as we don’t know if it’s showing an
increase or decrease throughout time.
percent_change_next_weeks_price follows same normal
distribution around the mean (0.23) just like
percent_change_price.
All other plots show interesting but not ultra insightful results
again, since it’s not specific to a stock but it does speak about the
overall behavior in some way.
# Summarize each stock's data with summary statistics
df_long_stocks <- df %>%
group_by(stock) %>%
summarise(
count = n(),
mean_open = mean(open, na.rm = TRUE),
mean_high = mean(high, na.rm = TRUE),
mean_low = mean(low, na.rm = TRUE),
mean_close = mean(close, na.rm = TRUE),
mean_volume = mean(volume, na.rm = TRUE),
mean_percent_change_price = mean(percent_change_price, na.rm = TRUE),
mean_percent_change_volume_over_last_wk = mean(percent_change_volume_over_last_wk, na.rm = TRUE),
mean_previous_weeks_volume = mean(previous_weeks_volume, na.rm = TRUE),
mean_next_weeks_open = mean(next_weeks_open, na.rm = TRUE),
mean_next_weeks_close = mean(next_weeks_close, na.rm = TRUE),
mean_percent_change_next_weeks_price = mean(percent_change_next_weeks_price, na.rm = TRUE),
mean_days_to_next_dividend = mean(days_to_next_dividend, na.rm = TRUE),
mean_percent_return_next_dividend = mean(percent_return_next_dividend, na.rm = TRUE),
median_open = median(open, na.rm = TRUE),
median_high = median(high, na.rm = TRUE),
median_low = median(low, na.rm = TRUE),
median_close = median(close, na.rm = TRUE),
median_volume = median(volume, na.rm = TRUE),
median_percent_change_price = median(percent_change_price, na.rm = TRUE),
median_percent_change_volume_over_last_wk = median(percent_change_volume_over_last_wk, na.rm = TRUE),
median_previous_weeks_volume = median(previous_weeks_volume, na.rm = TRUE),
median_next_weeks_open = median(next_weeks_open, na.rm = TRUE),
median_next_weeks_close = median(next_weeks_close, na.rm = TRUE),
median_percent_change_next_weeks_price = median(percent_change_next_weeks_price, na.rm = TRUE),
median_days_to_next_dividend = median(days_to_next_dividend, na.rm = TRUE),
median_percent_return_next_dividend = median(percent_return_next_dividend, na.rm = TRUE)
)
# View the summarized data for each stock
head(df_long_stocks)
## # A tibble: 6 × 28
## stock count mean_open mean_high mean_low mean_close mean_volume
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AA 25 16.6 17.0 16.1 16.5 129638810.
## 2 AXP 25 46.5 47.6 45.6 46.7 35208482.
## 3 BA 25 73.3 74.8 71.6 73.4 23781421.
## 4 BAC 25 13.2 13.5 12.8 13.1 722999136.
## 5 CAT 25 103. 106. 100. 103. 33731116.
## 6 CSCO 25 18.1 18.4 17.6 17.9 358661588.
## # ℹ 21 more variables: mean_percent_change_price <dbl>,
## # mean_percent_change_volume_over_last_wk <dbl>,
## # mean_previous_weeks_volume <dbl>, mean_next_weeks_open <dbl>,
## # mean_next_weeks_close <dbl>, mean_percent_change_next_weeks_price <dbl>,
## # mean_days_to_next_dividend <dbl>, mean_percent_return_next_dividend <dbl>,
## # median_open <dbl>, median_high <dbl>, median_low <dbl>, median_close <dbl>,
## # median_volume <int>, median_percent_change_price <dbl>, …
This gives us an idea about the average’s of each stock.
We can see that most of these stocks show volatility.
The highest average percent price change is 0.59% and the lowest is -1.17%.
This would equal to those stocks going up 0.59% and -1.17% each week.
To back this up we will create a plot to show their volatility.
ggplot(df, aes(x = date, y = percent_change_price, color = stock,
group = stock)) +
geom_line() +
labs(title = 'Stock Percent Change in Price Over Time',
x = 'Date',
y = 'Percent Change in Price') +
theme_minimal() +
theme(legend.position = 'none')
In the plot we see that from week to week, some stocks are very volatile.
Since dropping 30 out of our 750 observations would be quite a bit since we are taking away 4% of each stock’s observations, and we have very limited data, we will create 2 dataframes.
One with those observations removed, and one with those observations showing a 0 to assume no change in the first week.
#Dataset with removed NA's
df_NA_removed <- na.omit(df)
#Dataset with inputted 0's instead of NA's
df_NA_inputted <- df
df_NA_inputted[is.na(df_NA_inputted)] <- 0
summary(df_NA_removed)
## quarter stock date open high
## 1:330 AA : 24 Min. :2011-01-14 Min. : 10.59 Min. : 10.94
## 2:390 AXP : 24 1st Qu.:2011-02-23 1st Qu.: 29.96 1st Qu.: 30.66
## BA : 24 Median :2011-04-04 Median : 46.02 Median : 46.91
## BAC : 24 Mean :2011-04-04 Mean : 53.75 Mean : 54.76
## CAT : 24 3rd Qu.:2011-05-14 3rd Qu.: 72.81 3rd Qu.: 74.34
## CSCO : 24 Max. :2011-06-24 Max. :172.11 Max. :173.54
## (Other):576
## low close volume percent_change_price
## Min. : 10.40 Min. : 10.52 Min. :9.719e+06 Min. :-15.42290
## 1st Qu.: 28.71 1st Qu.: 30.41 1st Qu.:3.069e+07 1st Qu.: -1.29736
## Median : 44.95 Median : 46.12 Median :5.275e+07 Median : 0.00000
## Mean : 52.73 Mean : 53.83 Mean :1.156e+08 Mean : 0.03014
## 3rd Qu.: 71.11 3rd Qu.: 72.77 3rd Qu.:1.312e+08 3rd Qu.: 1.63087
## Max. :167.82 Max. :170.58 Max. :1.054e+09 Max. : 9.88223
##
## percent_change_volume_over_last_wk previous_weeks_volume next_weeks_open
## Min. :-61.4332 Min. :9.719e+06 Min. : 10.52
## 1st Qu.:-19.8043 1st Qu.:3.068e+07 1st Qu.: 30.39
## Median : 0.5126 Median :5.295e+07 Median : 46.04
## Mean : 5.5936 Mean :1.174e+08 Mean : 53.81
## 3rd Qu.: 21.8006 3rd Qu.:1.333e+08 3rd Qu.: 72.81
## Max. :327.4089 Max. :1.453e+09 Max. :172.11
##
## next_weeks_close percent_change_next_weeks_price days_to_next_dividend
## Min. : 10.52 Min. :-15.4229 Min. : 0.00
## 1st Qu.: 30.51 1st Qu.: -1.2524 1st Qu.: 24.00
## Median : 45.93 Median : 0.0360 Median : 47.00
## Mean : 53.97 Mean : 0.1933 Mean : 52.26
## 3rd Qu.: 73.00 3rd Qu.: 1.7927 3rd Qu.: 69.25
## Max. :174.54 Max. : 9.8822 Max. :329.00
##
## percent_return_next_dividend
## Min. :0.06557
## 1st Qu.:0.53409
## Median :0.68014
## Mean :0.69156
## 3rd Qu.:0.85402
## Max. :1.56421
##
summary(df_NA_inputted)
## quarter stock date open high
## 1:360 AA : 25 Min. :2011-01-07 Min. : 10.59 Min. : 10.94
## 2:390 AXP : 25 1st Qu.:2011-02-18 1st Qu.: 29.83 1st Qu.: 30.63
## BA : 25 Median :2011-04-01 Median : 45.97 Median : 46.88
## BAC : 25 Mean :2011-03-31 Mean : 53.65 Mean : 54.67
## CAT : 25 3rd Qu.:2011-05-13 3rd Qu.: 72.72 3rd Qu.: 74.29
## CSCO : 25 Max. :2011-06-24 Max. :172.11 Max. :173.54
## (Other):600
## low close volume percent_change_price
## Min. : 10.40 Min. : 10.52 Min. :9.719e+06 Min. :-15.42290
## 1st Qu.: 28.72 1st Qu.: 30.36 1st Qu.:3.087e+07 1st Qu.: -1.28805
## Median : 44.80 Median : 45.93 Median :5.306e+07 Median : 0.00000
## Mean : 52.64 Mean : 53.73 Mean :1.175e+08 Mean : 0.05026
## 3rd Qu.: 71.04 3rd Qu.: 72.67 3rd Qu.:1.327e+08 3rd Qu.: 1.65089
## Max. :167.82 Max. :170.58 Max. :1.453e+09 Max. : 9.88223
##
## percent_change_volume_over_last_wk previous_weeks_volume next_weeks_open
## Min. :-61.43 Min. :0.000e+00 Min. : 10.52
## 1st Qu.:-18.89 1st Qu.:2.812e+07 1st Qu.: 30.32
## Median : 0.00 Median :5.069e+07 Median : 46.02
## Mean : 5.37 Mean :1.127e+08 Mean : 53.70
## 3rd Qu.: 19.98 3rd Qu.:1.296e+08 3rd Qu.: 72.72
## Max. :327.41 Max. :1.453e+09 Max. :172.11
##
## next_weeks_close percent_change_next_weeks_price days_to_next_dividend
## Min. : 10.52 Min. :-15.4229 Min. : 0.00
## 1st Qu.: 30.46 1st Qu.: -1.2221 1st Qu.: 24.00
## Median : 46.12 Median : 0.1012 Median : 47.00
## Mean : 53.89 Mean : 0.2385 Mean : 52.53
## 3rd Qu.: 72.92 3rd Qu.: 1.8456 3rd Qu.: 69.00
## Max. :174.54 Max. : 9.8822 Max. :336.00
##
## percent_return_next_dividend
## Min. :0.06557
## 1st Qu.:0.53455
## Median :0.68107
## Mean :0.69183
## 3rd Qu.:0.85429
## Max. :1.56421
##
Only those 2 variables had changes in their summary statistics.
(previous_week_volume,
percent_change_volume_over_last_wk)
Variables that give us information about next week are important to remove. In reality, when predicting next week, we are technically not able to know all those things about the future.
#inputted dataframe
df_NA_inputted <- df_NA_inputted |>
select(-next_weeks_close, -next_weeks_open,
-days_to_next_dividend)
#Dropped NA's dataframe
df_NA_removed <- df_NA_removed |>
select(-next_weeks_close, -next_weeks_open,
-days_to_next_dividend)
# Create lagged variables
df_NA_removed_lagged <- df_NA_removed |>
arrange(stock) |> # Make sure the data is sorted by stock
mutate(
lagged_open = lag(open, 1), # Lagged open price (previous week's open price)
lagged_high = lag(high, 1), # Lagged high price
lagged_low = lag(low, 1), # Lagged low price
lagged_close = lag(close, 1), # Lagged close price
lagged_volume = lag(volume, 1), # Lagged volume (previous week's volume)
lagged_percent_change_price = lag(percent_change_price, 1), # Lagged percent change in price
lagged_percent_change_volume = lag(percent_change_volume_over_last_wk, 1),# Lagged percent change in volume
lagged_previous_weeks_volume = lag(previous_weeks_volume, 1)
)
df_NA_removed_lagged <- df_NA_removed_lagged |>
mutate(
lagged_open_2 = lag(open, 2), # Lagged open price (two weeks ago)
lagged_high_2 = lag(high, 2),
lagged_low_2 = lag(low, 2),
lagged_close_2 = lag(close, 2),
lagged_volume_2 = lag(volume, 2),
lagged_percent_change_price_2 = lag(percent_change_price, 2),
lagged_percent_change_volume_2 = lag(percent_change_volume_over_last_wk, 2),
lagged_previous_weeks_volume_2 = lag(previous_weeks_volume, 2)
)
#Dataset with removed NA's
df_NA_removed_lagged <- na.omit(df_NA_removed_lagged)
# -------------------------------------------------------------------------------------
# Create lagged variables
df_NA_inputted_lagged <- df_NA_inputted |>
arrange(stock) |> # Make sure the data is sorted by stock
mutate(
lagged_open = lag(open, 1), # Lagged open price (previous week's open price)
lagged_high = lag(high, 1), # Lagged high price
lagged_low = lag(low, 1), # Lagged low price
lagged_close = lag(close, 1), # Lagged close price
lagged_volume = lag(volume, 1), # Lagged volume (previous week's volume)
lagged_percent_change_price = lag(percent_change_price, 1), # Lagged percent change in price
lagged_percent_change_volume = lag(percent_change_volume_over_last_wk, 1),# Lagged percent change in volume
lagged_previous_weeks_volume = lag(previous_weeks_volume, 1)
)
df_NA_inputted_lagged <- df_NA_inputted_lagged |>
mutate(
lagged_open_2 = lag(open, 2), # Lagged open price (two weeks ago)
lagged_high_2 = lag(high, 2),
lagged_low_2 = lag(low, 2),
lagged_close_2 = lag(close, 2),
lagged_volume_2 = lag(volume, 2),
lagged_percent_change_price_2 = lag(percent_change_price, 2),
lagged_percent_change_volume_2 = lag(percent_change_volume_over_last_wk, 2),
lagged_previous_weeks_volume_2 = lag(previous_weeks_volume, 2)
)
#Dataset with inputted 0's instead of NA's
df_NA_inputted_lagged <- df_NA_inputted_lagged
df_NA_inputted_lagged[is.na(df_NA_inputted_lagged)] <- 0
cor(df_NA_removed_lagged[, sapply(df_NA_removed_lagged, is.numeric)])
## open high low
## open 1.00000000 0.999611385 0.99933736
## high 0.99961139 1.000000000 0.99941765
## low 0.99933736 0.999417648 1.00000000
## close 0.99904623 0.999553822 0.99954695
## volume -0.52209679 -0.520950925 -0.52385759
## percent_change_price 0.07024937 0.087061325 0.09197482
## percent_change_volume_over_last_wk -0.01062046 -0.008627264 -0.01969032
## previous_weeks_volume -0.51079386 -0.510222211 -0.51041632
## percent_change_next_weeks_price 0.06616456 0.069057497 0.06670463
## percent_return_next_dividend -0.15338047 -0.155602054 -0.15092841
## lagged_open 0.94992226 0.949491516 0.94853525
## lagged_high 0.95000426 0.949654918 0.94865023
## lagged_low 0.95010148 0.949587563 0.94882517
## lagged_close 0.95093171 0.950515352 0.94962995
## lagged_volume -0.49355500 -0.493169280 -0.49289465
## lagged_percent_change_price 0.11011153 0.110007214 0.11110462
## lagged_percent_change_volume -0.01882060 -0.018207471 -0.01840773
## lagged_previous_weeks_volume -0.47640137 -0.476185656 -0.47595670
## lagged_open_2 0.89875253 0.897698066 0.89672957
## lagged_high_2 0.89950796 0.898401651 0.89741422
## lagged_low_2 0.89874914 0.897598088 0.89679245
## lagged_close_2 0.90000820 0.898801509 0.89786205
## lagged_volume_2 -0.45729558 -0.456825006 -0.45631728
## lagged_percent_change_price_2 0.10790929 0.104851789 0.10523920
## lagged_percent_change_volume_2 -0.01858192 -0.016801425 -0.01802300
## lagged_previous_weeks_volume_2 -0.44419533 -0.443882454 -0.44327649
## close volume percent_change_price
## open 0.99904623 -0.52209679 0.07024937
## high 0.99955382 -0.52095093 0.08706133
## low 0.99954695 -0.52385759 0.09197482
## close 1.00000000 -0.52358494 0.10657543
## volume -0.52358494 1.00000000 -0.17154294
## percent_change_price 0.10657543 -0.17154294 1.00000000
## percent_change_volume_over_last_wk -0.01863109 0.18232179 -0.23735502
## previous_weeks_volume -0.51022874 0.88776696 -0.06591756
## percent_change_next_weeks_price 0.06724401 -0.09018005 0.01898666
## percent_return_next_dividend -0.15336623 -0.26867837 0.04386879
## lagged_open 0.94809539 -0.48788679 0.06083957
## lagged_high 0.94828405 -0.48685618 0.06256990
## lagged_low 0.94826037 -0.48867370 0.06026194
## lagged_close 0.94920382 -0.48852929 0.06190385
## lagged_volume -0.49296477 0.85517252 -0.09948906
## lagged_percent_change_price 0.11124746 -0.10217854 0.01586226
## lagged_percent_change_volume -0.01862318 0.03661341 0.02628712
## lagged_previous_weeks_volume -0.47608534 0.81924674 -0.11416825
## lagged_open_2 0.89620722 -0.45191718 0.04061948
## lagged_high_2 0.89684916 -0.45160298 0.03868478
## lagged_low_2 0.89613025 -0.45234911 0.03924222
## lagged_close_2 0.89720403 -0.45260228 0.03686459
## lagged_volume_2 -0.45673849 0.76287060 -0.08868801
## lagged_percent_change_price_2 0.10367740 -0.09670732 -0.05868190
## lagged_percent_change_volume_2 -0.01583372 0.01456590 0.03258479
## lagged_previous_weeks_volume_2 -0.44413581 0.75584470 -0.10363913
## percent_change_volume_over_last_wk
## open -0.010620460
## high -0.008627264
## low -0.019690317
## close -0.018631086
## volume 0.182321786
## percent_change_price -0.237355021
## percent_change_volume_over_last_wk 1.000000000
## previous_weeks_volume -0.128326210
## percent_change_next_weeks_price 0.013967095
## percent_return_next_dividend -0.020573308
## lagged_open -0.006175426
## lagged_high -0.007556415
## lagged_low -0.003163766
## lagged_close -0.003400151
## lagged_volume -0.117911110
## lagged_percent_change_price 0.087643041
## lagged_percent_change_volume -0.307370587
## lagged_previous_weeks_volume -0.035910745
## lagged_open_2 -0.027590464
## lagged_high_2 -0.028477492
## lagged_low_2 -0.026803630
## lagged_close_2 -0.026783489
## lagged_volume_2 -0.038944093
## lagged_percent_change_price_2 0.031885748
## lagged_percent_change_volume_2 -0.094053579
## lagged_previous_weeks_volume_2 -0.003574668
## previous_weeks_volume
## open -0.51079386
## high -0.51022221
## low -0.51041632
## close -0.51022874
## volume 0.88776696
## percent_change_price -0.06591756
## percent_change_volume_over_last_wk -0.12832621
## previous_weeks_volume 1.00000000
## percent_change_next_weeks_price -0.10211187
## percent_return_next_dividend -0.26793188
## lagged_open -0.45959234
## lagged_high -0.45843756
## lagged_low -0.46191203
## lagged_close -0.46191819
## lagged_volume 0.91183604
## lagged_percent_change_price -0.17810986
## lagged_percent_change_volume 0.17972928
## lagged_previous_weeks_volume 0.79806858
## lagged_open_2 -0.42731752
## lagged_high_2 -0.42640569
## lagged_low_2 -0.42765881
## lagged_close_2 -0.42764476
## lagged_volume_2 0.76468762
## lagged_percent_change_price_2 -0.08401117
## lagged_percent_change_volume_2 0.04736541
## lagged_previous_weeks_volume_2 0.73309926
## percent_change_next_weeks_price
## open 0.06616456
## high 0.06905750
## low 0.06670463
## close 0.06724401
## volume -0.09018005
## percent_change_price 0.01898666
## percent_change_volume_over_last_wk 0.01396710
## previous_weeks_volume -0.10211187
## percent_change_next_weeks_price 1.00000000
## percent_return_next_dividend 0.10243026
## lagged_open 0.04745815
## lagged_high 0.04642812
## lagged_low 0.04643954
## lagged_close 0.04391979
## lagged_volume -0.07733783
## lagged_percent_change_price -0.05414914
## lagged_percent_change_volume 0.04169501
## lagged_previous_weeks_volume -0.09624255
## lagged_open_2 0.04620233
## lagged_high_2 0.04535166
## lagged_low_2 0.04761554
## lagged_close_2 0.04454016
## lagged_volume_2 -0.07713344
## lagged_percent_change_price_2 -0.02608339
## lagged_percent_change_volume_2 0.02221783
## lagged_previous_weeks_volume_2 -0.08782722
## percent_return_next_dividend lagged_open
## open -0.15338047 0.949922257
## high -0.15560205 0.949491516
## low -0.15092841 0.948535252
## close -0.15336623 0.948095393
## volume -0.26867837 -0.487886788
## percent_change_price 0.04386879 0.060839572
## percent_change_volume_over_last_wk -0.02057331 -0.006175426
## previous_weeks_volume -0.26793188 -0.459592340
## percent_change_next_weeks_price 0.10243026 0.047458155
## percent_return_next_dividend 1.00000000 -0.137853997
## lagged_open -0.13785400 1.000000000
## lagged_high -0.14005190 0.999612396
## lagged_low -0.13497878 0.999338174
## lagged_close -0.13783538 0.999050001
## lagged_volume -0.25975805 -0.522082802
## lagged_percent_change_price 0.04819822 0.072838335
## lagged_percent_change_volume -0.02455874 -0.009010208
## lagged_previous_weeks_volume -0.25732162 -0.511464726
## lagged_open_2 -0.12191685 0.949971352
## lagged_high_2 -0.12395720 0.950054464
## lagged_low_2 -0.11896245 0.950152236
## lagged_close_2 -0.12169322 0.950982254
## lagged_volume_2 -0.24852715 -0.494297247
## lagged_percent_change_price_2 0.05014172 0.113215717
## lagged_percent_change_volume_2 -0.02894675 -0.018729515
## lagged_previous_weeks_volume_2 -0.24608821 -0.477029802
## lagged_high lagged_low lagged_close
## open 0.950004255 0.950101476 0.950931710
## high 0.949654918 0.949587563 0.950515352
## low 0.948650231 0.948825167 0.949629951
## close 0.948284049 0.948260365 0.949203824
## volume -0.486856179 -0.488673698 -0.488529291
## percent_change_price 0.062569896 0.060261939 0.061903855
## percent_change_volume_over_last_wk -0.007556415 -0.003163766 -0.003400151
## previous_weeks_volume -0.458437557 -0.461912030 -0.461918194
## percent_change_next_weeks_price 0.046428123 0.046439540 0.043919787
## percent_return_next_dividend -0.140051903 -0.134978781 -0.137835384
## lagged_open 0.999612396 0.999338174 0.999050001
## lagged_high 1.000000000 0.999419603 0.999560325
## lagged_low 0.999419603 1.000000000 0.999549129
## lagged_close 0.999560325 0.999549129 1.000000000
## lagged_volume -0.520952595 -0.523831162 -0.523529221
## lagged_percent_change_price 0.089689711 0.094538016 0.109063056
## lagged_percent_change_volume -0.007025568 -0.018051389 -0.016965888
## lagged_previous_weeks_volume -0.510907058 -0.511083629 -0.510879255
## lagged_open_2 0.949538420 0.948587696 0.948156577
## lagged_high_2 0.949703781 0.948703494 0.948344664
## lagged_low_2 0.949636656 0.948879227 0.948322916
## lagged_close_2 0.950565383 0.949683270 0.949263466
## lagged_volume_2 -0.493925450 -0.493634292 -0.493688156
## lagged_percent_change_price_2 0.113146630 0.114207328 0.114318059
## lagged_percent_change_volume_2 -0.018119874 -0.018314096 -0.018522460
## lagged_previous_weeks_volume_2 -0.476824532 -0.476583284 -0.476700159
## lagged_volume lagged_percent_change_price
## open -0.49355500 0.11011153
## high -0.49316928 0.11000721
## low -0.49289465 0.11110462
## close -0.49296477 0.11124746
## volume 0.85517252 -0.10217854
## percent_change_price -0.09948906 0.01586226
## percent_change_volume_over_last_wk -0.11791111 0.08764304
## previous_weeks_volume 0.91183604 -0.17810986
## percent_change_next_weeks_price -0.07733783 -0.05414914
## percent_return_next_dividend -0.25975805 0.04819822
## lagged_open -0.52208280 0.07283834
## lagged_high -0.52095259 0.08968971
## lagged_low -0.52383116 0.09453802
## lagged_close -0.52352922 0.10906306
## lagged_volume 1.00000000 -0.17171421
## lagged_percent_change_price -0.17171421 1.00000000
## lagged_percent_change_volume 0.18188628 -0.23503010
## lagged_previous_weeks_volume 0.88754357 -0.06713164
## lagged_open_2 -0.48792623 0.06347063
## lagged_high_2 -0.48687888 0.06518509
## lagged_low_2 -0.48869738 0.06290927
## lagged_close_2 -0.48853711 0.06450827
## lagged_volume_2 0.85492670 -0.10074022
## lagged_percent_change_price_2 -0.10232930 0.01764445
## lagged_percent_change_volume_2 0.03658853 0.02653782
## lagged_previous_weeks_volume_2 0.81907820 -0.11540234
## lagged_percent_change_volume
## open -0.018820598
## high -0.018207471
## low -0.018407727
## close -0.018623181
## volume 0.036613406
## percent_change_price 0.026287119
## percent_change_volume_over_last_wk -0.307370587
## previous_weeks_volume 0.179729278
## percent_change_next_weeks_price 0.041695015
## percent_return_next_dividend -0.024558739
## lagged_open -0.009010208
## lagged_high -0.007025568
## lagged_low -0.018051389
## lagged_close -0.016965888
## lagged_volume 0.181886279
## lagged_percent_change_price -0.235030105
## lagged_percent_change_volume 1.000000000
## lagged_previous_weeks_volume -0.129426103
## lagged_open_2 -0.004622536
## lagged_high_2 -0.005975442
## lagged_low_2 -0.001578447
## lagged_close_2 -0.001803112
## lagged_volume_2 -0.119100461
## lagged_percent_change_price_2 0.090575810
## lagged_percent_change_volume_2 -0.306943116
## lagged_previous_weeks_volume_2 -0.037078170
## lagged_previous_weeks_volume lagged_open_2
## open -0.47640137 0.898752526
## high -0.47618566 0.897698066
## low -0.47595670 0.896729566
## close -0.47608534 0.896207218
## volume 0.81924674 -0.451917178
## percent_change_price -0.11416825 0.040619478
## percent_change_volume_over_last_wk -0.03591074 -0.027590464
## previous_weeks_volume 0.79806858 -0.427317519
## percent_change_next_weeks_price -0.09624255 0.046202333
## percent_return_next_dividend -0.25732162 -0.121916851
## lagged_open -0.51146473 0.949971352
## lagged_high -0.51090706 0.949538420
## lagged_low -0.51108363 0.948587696
## lagged_close -0.51087926 0.948156577
## lagged_volume 0.88754357 -0.487926225
## lagged_percent_change_price -0.06713164 0.063470628
## lagged_percent_change_volume -0.12942610 -0.004622536
## lagged_previous_weeks_volume 1.00000000 -0.460319971
## lagged_open_2 -0.46031997 1.000000000
## lagged_high_2 -0.45916283 0.999612959
## lagged_low_2 -0.46264236 0.999338540
## lagged_close_2 -0.46263789 0.999051557
## lagged_volume_2 0.91191460 -0.522808818
## lagged_percent_change_price_2 -0.17970581 0.076026713
## lagged_percent_change_volume_2 0.17954975 -0.008932521
## lagged_previous_weeks_volume_2 0.79823577 -0.512071004
## lagged_high_2 lagged_low_2 lagged_close_2
## open 0.899507963 0.898749140 0.900008205
## high 0.898401651 0.897598088 0.898801509
## low 0.897414225 0.896792447 0.897862053
## close 0.896849158 0.896130248 0.897204030
## volume -0.451602976 -0.452349108 -0.452602279
## percent_change_price 0.038684777 0.039242225 0.036864590
## percent_change_volume_over_last_wk -0.028477492 -0.026803630 -0.026783489
## previous_weeks_volume -0.426405693 -0.427658806 -0.427644761
## percent_change_next_weeks_price 0.045351660 0.047615541 0.044540162
## percent_return_next_dividend -0.123957197 -0.118962452 -0.121693221
## lagged_open 0.950054464 0.950152236 0.950982254
## lagged_high 0.949703781 0.949636656 0.950565383
## lagged_low 0.948703494 0.948879227 0.949683270
## lagged_close 0.948344664 0.948322916 0.949263466
## lagged_volume -0.486878884 -0.488697380 -0.488537105
## lagged_percent_change_price 0.065185086 0.062909267 0.064508267
## lagged_percent_change_volume -0.005975442 -0.001578447 -0.001803112
## lagged_previous_weeks_volume -0.459162831 -0.462642361 -0.462637890
## lagged_open_2 0.999612959 0.999338540 0.999051557
## lagged_high_2 1.000000000 0.999420212 0.999560885
## lagged_low_2 0.999420212 1.000000000 0.999550165
## lagged_close_2 0.999560885 0.999550165 1.000000000
## lagged_volume_2 -0.521674406 -0.524559086 -0.524244645
## lagged_percent_change_price_2 0.092844465 0.097713521 0.112176520
## lagged_percent_change_volume_2 -0.006944765 -0.017965299 -0.016876672
## lagged_previous_weeks_volume_2 -0.511509561 -0.511692883 -0.511478704
## lagged_volume_2
## open -0.45729558
## high -0.45682501
## low -0.45631728
## close -0.45673849
## volume 0.76287060
## percent_change_price -0.08868801
## percent_change_volume_over_last_wk -0.03894409
## previous_weeks_volume 0.76468762
## percent_change_next_weeks_price -0.07713344
## percent_return_next_dividend -0.24852715
## lagged_open -0.49429725
## lagged_high -0.49392545
## lagged_low -0.49363429
## lagged_close -0.49368816
## lagged_volume 0.85492670
## lagged_percent_change_price -0.10074022
## lagged_percent_change_volume -0.11910046
## lagged_previous_weeks_volume 0.91191460
## lagged_open_2 -0.52280882
## lagged_high_2 -0.52167441
## lagged_low_2 -0.52455909
## lagged_close_2 -0.52424465
## lagged_volume_2 1.00000000
## lagged_percent_change_price_2 -0.17343031
## lagged_percent_change_volume_2 0.18168842
## lagged_previous_weeks_volume_2 0.88764091
## lagged_percent_change_price_2
## open 0.10790929
## high 0.10485179
## low 0.10523920
## close 0.10367740
## volume -0.09670732
## percent_change_price -0.05868190
## percent_change_volume_over_last_wk 0.03188575
## previous_weeks_volume -0.08401117
## percent_change_next_weeks_price -0.02608339
## percent_return_next_dividend 0.05014172
## lagged_open 0.11321572
## lagged_high 0.11314663
## lagged_low 0.11420733
## lagged_close 0.11431806
## lagged_volume -0.10232930
## lagged_percent_change_price 0.01764445
## lagged_percent_change_volume 0.09057581
## lagged_previous_weeks_volume -0.17970581
## lagged_open_2 0.07602671
## lagged_high_2 0.09284446
## lagged_low_2 0.09771352
## lagged_close_2 0.11217652
## lagged_volume_2 -0.17343031
## lagged_percent_change_price_2 1.00000000
## lagged_percent_change_volume_2 -0.23423721
## lagged_previous_weeks_volume_2 -0.06898473
## lagged_percent_change_volume_2
## open -0.018581918
## high -0.016801425
## low -0.018023002
## close -0.015833720
## volume 0.014565902
## percent_change_price 0.032584790
## percent_change_volume_over_last_wk -0.094053579
## previous_weeks_volume 0.047365406
## percent_change_next_weeks_price 0.022217830
## percent_return_next_dividend -0.028946753
## lagged_open -0.018729515
## lagged_high -0.018119874
## lagged_low -0.018314096
## lagged_close -0.018522460
## lagged_volume 0.036588531
## lagged_percent_change_price 0.026537817
## lagged_percent_change_volume -0.306943116
## lagged_previous_weeks_volume 0.179549751
## lagged_open_2 -0.008932521
## lagged_high_2 -0.006944765
## lagged_low_2 -0.017965299
## lagged_close_2 -0.016876672
## lagged_volume_2 0.181688418
## lagged_percent_change_price_2 -0.234237214
## lagged_percent_change_volume_2 1.000000000
## lagged_previous_weeks_volume_2 -0.129468003
## lagged_previous_weeks_volume_2
## open -0.444195334
## high -0.443882454
## low -0.443276492
## close -0.444135813
## volume 0.755844701
## percent_change_price -0.103639132
## percent_change_volume_over_last_wk -0.003574668
## previous_weeks_volume 0.733099256
## percent_change_next_weeks_price -0.087827222
## percent_return_next_dividend -0.246088210
## lagged_open -0.477029802
## lagged_high -0.476824532
## lagged_low -0.476583284
## lagged_close -0.476700159
## lagged_volume 0.819078202
## lagged_percent_change_price -0.115402337
## lagged_percent_change_volume -0.037078170
## lagged_previous_weeks_volume 0.798235770
## lagged_open_2 -0.512071004
## lagged_high_2 -0.511509561
## lagged_low_2 -0.511692883
## lagged_close_2 -0.511478704
## lagged_volume_2 0.887640905
## lagged_percent_change_price_2 -0.068984727
## lagged_percent_change_volume_2 -0.129468003
## lagged_previous_weeks_volume_2 1.000000000
After analyzing the correlation, our lags do not seem to be correlated with our predictor variable. This means that adding our lags to the model will more than likely not add anything or improve the model.
We are going to use 1st quarter as training and 2nd quarter as testing.
train_df_removed <- subset(df_NA_removed, quarter == 1)
test_df_removed <- subset(df_NA_removed, quarter == 2)
train_df_inputted <- subset(df_NA_inputted, quarter == 1)
test_df_inputted <- subset(df_NA_inputted, quarter == 2)
lm_model_full_r <- lm(percent_change_next_weeks_price ~ stock + date + open +
high + low + close + volume +
percent_change_price + percent_change_volume_over_last_wk +
previous_weeks_volume + percent_return_next_dividend,
data = train_df_removed)
summary(lm_model_full_r)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + date +
## open + high + low + close + volume + percent_change_price +
## percent_change_volume_over_last_wk + previous_weeks_volume +
## percent_return_next_dividend, data = train_df_removed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.0884 -1.2144 0.0623 1.2512 8.1557
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.056e+00 9.599e+01 0.042 0.96632
## stockAXP 7.109e+00 3.726e+00 1.908 0.05736 .
## stockBA 1.480e+01 6.893e+00 2.147 0.03265 *
## stockBAC -2.513e+00 2.398e+00 -1.048 0.29554
## stockCAT 2.525e+01 8.626e+00 2.928 0.00368 **
## stockCSCO -2.576e+00 1.535e+00 -1.678 0.09447 .
## stockCVX 2.338e+01 9.958e+00 2.348 0.01956 *
## stockDD 8.622e+00 6.505e+00 1.325 0.18608
## stockDIS 4.391e+00 6.820e+00 0.644 0.52019
## stockGE -1.554e+00 3.716e+00 -0.418 0.67603
## stockHD 3.474e+00 4.718e+00 0.736 0.46213
## stockHPQ 7.331e+00 2.779e+00 2.638 0.00880 **
## stockIBM 4.332e+01 1.379e+01 3.142 0.00185 **
## stockINTC -3.103e+00 4.753e+00 -0.653 0.51443
## stockJNJ 9.393e+00 7.862e+00 1.195 0.23316
## stockJPM 7.038e+00 4.570e+00 1.540 0.12465
## stockKO 1.219e+01 7.098e+00 1.717 0.08708 .
## stockKRFT 6.193e-01 5.917e+00 0.105 0.91672
## stockMCD 1.474e+01 8.425e+00 1.749 0.08130 .
## stockMMM 2.056e+01 8.574e+00 2.398 0.01711 *
## stockMRK -6.000e-01 7.452e+00 -0.081 0.93589
## stockMSFT -6.131e-01 3.513e+00 -0.175 0.86158
## stockPFE -3.085e+00 5.915e+00 -0.522 0.60235
## stockPG 1.048e+01 7.599e+00 1.380 0.16874
## stockT -3.275e+00 9.559e+00 -0.343 0.73212
## stockTRV 1.099e+01 6.121e+00 1.795 0.07370 .
## stockUTX 1.873e+01 7.614e+00 2.459 0.01450 *
## stockVZ 2.532e-01 8.985e+00 0.028 0.97754
## stockWMT 8.616e+00 6.032e+00 1.428 0.15425
## stockXOM 1.833e+01 7.558e+00 2.426 0.01589 *
## date 2.621e-05 6.436e-03 0.004 0.99675
## open -4.017e-01 2.282e-01 -1.761 0.07937 .
## high 4.289e-01 2.556e-01 1.678 0.09439 .
## low -1.648e-01 2.489e-01 -0.662 0.50843
## close -1.823e-01 3.013e-01 -0.605 0.54565
## volume 1.829e-09 2.952e-09 0.619 0.53609
## percent_change_price -1.051e-01 1.072e-01 -0.981 0.32740
## percent_change_volume_over_last_wk -3.569e-03 4.537e-03 -0.787 0.43206
## previous_weeks_volume -7.041e-10 2.176e-09 -0.324 0.74646
## percent_return_next_dividend 5.175e+00 6.597e+00 0.785 0.43338
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.341 on 290 degrees of freedom
## Multiple R-squared: 0.1911, Adjusted R-squared: 0.08232
## F-statistic: 1.757 on 39 and 290 DF, p-value: 0.005192
vif(lm_model_full_r)
## GVIF Df GVIF^(1/(2*Df))
## stock 9.780008e+05 29 1.268474
## date 1.222277e+00 1 1.105567
## open 3.118355e+03 1 55.842233
## high 4.094441e+03 1 63.987820
## low 3.603692e+03 1 60.030755
## close 5.549627e+03 1 74.495818
## volume 1.405339e+01 1 3.748785
## percent_change_price 4.457538e+00 1 2.111288
## percent_change_volume_over_last_wk 2.488086e+00 1 1.577367
## previous_weeks_volume 8.992118e+00 1 2.998686
## percent_return_next_dividend 2.685693e+02 1 16.388085
In this block of code we ran and re-ran until we dropped all multicollinear variables in order to not have a bunch of useless models.
Had to remove close, high,
low, open, and
xpercent_return_next_dividend
lm_model_r <- lm(percent_change_next_weeks_price ~ stock + date +
volume + percent_change_price + percent_change_volume_over_last_wk +
previous_weeks_volume,
data = train_df_removed)
vif(lm_model_r)
## GVIF Df GVIF^(1/(2*Df))
## stock 14.775943 29 1.047526
## date 1.028527 1 1.014163
## volume 12.675917 1 3.560325
## percent_change_price 1.328936 1 1.152795
## percent_change_volume_over_last_wk 1.927818 1 1.388459
## previous_weeks_volume 8.352334 1 2.890041
summary(lm_model_r)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + date +
## volume + percent_change_price + percent_change_volume_over_last_wk +
## previous_weeks_volume, data = train_df_removed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.8389 -1.3704 0.1016 1.4537 7.5444
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.833e+01 9.247e+01 0.847 0.3976
## stockAXP -3.657e-01 1.084e+00 -0.337 0.7361
## stockBA 9.668e-02 1.096e+00 0.088 0.9297
## stockBAC -3.171e+00 2.241e+00 -1.415 0.1581
## stockCAT 1.011e+00 1.094e+00 0.924 0.3564
## stockCSCO -3.083e+00 1.215e+00 -2.537 0.0117 *
## stockCVX 1.142e+00 1.085e+00 1.052 0.2937
## stockDD 7.381e-01 1.091e+00 0.677 0.4992
## stockDIS 7.476e-01 1.077e+00 0.694 0.4883
## stockGE -3.150e-01 1.156e+00 -0.272 0.7855
## stockHD -1.903e-01 1.077e+00 -0.177 0.8599
## stockHPQ -1.262e+00 1.054e+00 -1.197 0.2321
## stockIBM 7.227e-01 1.099e+00 0.658 0.5114
## stockINTC -1.325e+00 1.118e+00 -1.186 0.2366
## stockJNJ -7.195e-01 1.073e+00 -0.670 0.5032
## stockJPM -7.886e-02 1.043e+00 -0.076 0.9398
## stockKO 4.312e-01 1.081e+00 0.399 0.6904
## stockKRFT -1.099e-01 1.080e+00 -0.102 0.9191
## stockMCD -1.226e-01 1.083e+00 -0.113 0.9100
## stockMMM 2.902e-02 1.107e+00 0.026 0.9791
## stockMRK -7.421e-01 1.053e+00 -0.704 0.4817
## stockMSFT -1.897e+00 1.132e+00 -1.675 0.0949 .
## stockPFE 3.443e-01 1.095e+00 0.315 0.7533
## stockPG -8.504e-01 1.074e+00 -0.792 0.4291
## stockT -9.040e-02 1.041e+00 -0.087 0.9308
## stockTRV 6.792e-01 1.103e+00 0.616 0.5385
## stockUTX 3.117e-01 1.104e+00 0.282 0.7780
## stockVZ 2.580e-01 1.050e+00 0.246 0.8061
## stockWMT -4.268e-01 1.066e+00 -0.400 0.6891
## stockXOM 1.110e-01 1.047e+00 0.106 0.9157
## date -5.201e-03 6.154e-03 -0.845 0.3987
## volume 2.185e-09 2.923e-09 0.747 0.4554
## percent_change_price -5.785e-02 6.099e-02 -0.949 0.3436
## percent_change_volume_over_last_wk 2.495e-04 4.162e-03 0.060 0.9522
## previous_weeks_volume 8.847e-11 2.186e-09 0.040 0.9677
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.44 on 295 degrees of freedom
## Multiple R-squared: 0.106, Adjusted R-squared: 0.002954
## F-statistic: 1.029 on 34 and 295 DF, p-value: 0.4292
step_lm_model_r <- step(lm_model_r, direction = 'both', trace = FALSE)
summary(step_lm_model_r)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ previous_weeks_volume,
## data = train_df_removed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.3217 -1.2889 0.0459 1.4861 6.0230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.637e-01 1.637e-01 2.832 0.00492 **
## previous_weeks_volume -2.126e-09 7.494e-10 -2.837 0.00483 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.418 on 328 degrees of freedom
## Multiple R-squared: 0.02395, Adjusted R-squared: 0.02098
## F-statistic: 8.05 on 1 and 328 DF, p-value: 0.004834
We ended up a single variable. It is also statistically significant.
All other variables are either multicollinear, or they were dropped in stepwise selection.
Our stepwise linear model on NA removed dataset explains about 2.049% of the variance in percent_change_next_weeks_price.
par(mfrow = c(2, 2))
plot(step_lm_model_r)
par(mfrow = c(1, 1))
Our Diagnostic plots are not concerning. Residuals vs Fitted show a random pattern which suggest our residuals don’t have non-linear patterns.
Our Q-Q plot suggest normal residuals.
Our Scale-Location plot suggests that our residuals are not spread equally along the predictors. Assumption of equal variance might be rejected.
Our residuals vs leverage show a few potential high leverage points.
summary(influence.measures(step_lm_model_r))
## Potentially influential observations of
## lm(formula = percent_change_next_weeks_price ~ previous_weeks_volume, data = train_df_removed) :
##
## dfb.1_ dfb.pr__ dffit cov.r cook.d hat
## 4 0.10 0.01 0.13 0.97_* 0.01 0.00
## 17 0.15 -0.06 0.15 0.97_* 0.01 0.00
## 38 0.28 -0.59 -0.60_* 1.20_* 0.18 0.17_*
## 39 0.10 -0.29 -0.30_* 1.04_* 0.04 0.04_*
## 40 -0.29 0.70 0.71_* 1.04_* 0.25 0.07_*
## 41 -0.20 0.46 0.47_* 1.08_* 0.11 0.09_*
## 42 -0.03 0.09 0.09 1.05_* 0.00 0.04_*
## 43 0.00 -0.01 -0.01 1.05_* 0.00 0.04_*
## 44 0.00 -0.02 -0.03 1.02_* 0.00 0.01
## 45 -0.06 0.19 0.20 1.04_* 0.02 0.04_*
## 46 0.01 -0.04 -0.04 1.04_* 0.00 0.03_*
## 47 0.16 -0.44 -0.46_* 1.03_* 0.10 0.05_*
## 48 -0.03 0.08 0.08 1.05_* 0.00 0.05_*
## 64 0.07 0.05 0.13 0.98_* 0.01 0.00
## 65 -0.13 -0.29 -0.47_* 0.78_* 0.10 0.00
## 67 -0.06 0.14 0.15 1.09_* 0.01 0.08_*
## 101 0.15 -0.07 0.15 0.97_* 0.01 0.00
## 127 -0.15 0.07 -0.16 0.97_* 0.01 0.00
## 139 -0.24 0.06 -0.26_* 0.89_* 0.03 0.00
## 268 0.10 0.03 0.14 0.97_* 0.01 0.00
## 276 0.00 0.01 0.02 1.02_* 0.00 0.01
## 300 0.09 0.02 0.12 0.98_* 0.01 0.00
A lot of our influential points seem to come from BAC
stock which could mean we are removing meaningful behavior.
Instead of deleting these points, we will try transformations which we kind of already knew were going to be needed due to the skewness of our data
lm_model_r_ni <- lm(percent_change_next_weeks_price ~ previous_weeks_volume,
data = train_df_removed[-c(4, 17, 101, 127, 139, 232,
268, 276, 300), ])
summary(lm_model_r_ni)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ previous_weeks_volume,
## data = train_df_removed[-c(4, 17, 101, 127, 139, 232, 268,
## 276, 300), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.3407 -1.3413 0.0056 1.4529 5.9841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.063e-01 1.619e-01 3.127 0.00193 **
## previous_weeks_volume -2.215e-09 7.361e-10 -3.009 0.00283 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.365 on 319 degrees of freedom
## Multiple R-squared: 0.0276, Adjusted R-squared: 0.02455
## F-statistic: 9.054 on 1 and 319 DF, p-value: 0.00283
par(mfrow = c(2, 2))
plot(lm_model_r_ni)
par(mfrow = c(1, 1))
# Calculate RMSE
predictions <- predict(lm_model_r_ni, newdata = test_df_removed) # Get predictions
actuals <- test_df_removed$percent_change_next_weeks_price # Actual values
rmse <- sqrt(mean((actuals - predictions)^2)) # Compute RMSE
print(rmse)
## [1] 2.836362
lm_model_full_i <- lm(percent_change_next_weeks_price ~ stock + date + open +
high + low + close + volume +
percent_change_price + percent_change_volume_over_last_wk +
previous_weeks_volume,
data = train_df_inputted)
summary(lm_model_full_i)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + date +
## open + high + low + close + volume + percent_change_price +
## percent_change_volume_over_last_wk + previous_weeks_volume,
## data = train_df_inputted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.5451 -1.2325 0.0115 1.3152 7.6999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.054e+01 8.503e+01 -0.124 0.901457
## stockAXP 1.005e+01 2.030e+00 4.949 1.20e-06 ***
## stockBA 1.919e+01 3.562e+00 5.389 1.38e-07 ***
## stockBAC -2.559e+00 2.112e+00 -1.212 0.226520
## stockCAT 2.982e+01 5.321e+00 5.605 4.49e-08 ***
## stockCSCO -1.066e+00 1.131e+00 -0.942 0.346941
## stockCVX 2.945e+01 5.209e+00 5.654 3.47e-08 ***
## stockDD 1.359e+01 2.451e+00 5.543 6.19e-08 ***
## stockDIS 9.653e+00 1.845e+00 5.233 3.02e-07 ***
## stockGE 1.574e+00 1.086e+00 1.450 0.148126
## stockHD 7.549e+00 1.613e+00 4.678 4.27e-06 ***
## stockHPQ 9.188e+00 2.033e+00 4.519 8.75e-06 ***
## stockIBM 4.992e+01 9.005e+00 5.544 6.17e-08 ***
## stockINTC 9.970e-01 1.079e+00 0.924 0.356253
## stockJNJ 1.528e+01 2.982e+00 5.124 5.17e-07 ***
## stockJPM 1.051e+01 2.058e+00 5.107 5.61e-07 ***
## stockKO 1.712e+01 3.142e+00 5.449 1.01e-07 ***
## stockKRFT 5.612e+00 1.368e+00 4.103 5.18e-05 ***
## stockMCD 2.039e+01 3.818e+00 5.342 1.75e-07 ***
## stockMMM 2.592e+01 4.728e+00 5.482 8.50e-08 ***
## stockMRK 4.948e+00 1.452e+00 3.407 0.000741 ***
## stockMSFT 2.399e+00 1.253e+00 1.915 0.056374 .
## stockPFE 1.774e+00 1.034e+00 1.716 0.087094 .
## stockPG 1.616e+01 3.133e+00 5.156 4.41e-07 ***
## stockT 4.428e+00 1.224e+00 3.617 0.000346 ***
## stockTRV 1.532e+01 2.771e+00 5.527 6.74e-08 ***
## stockUTX 2.341e+01 4.240e+00 5.522 6.93e-08 ***
## stockVZ 7.119e+00 1.567e+00 4.544 7.82e-06 ***
## stockWMT 1.316e+01 2.572e+00 5.117 5.37e-07 ***
## stockXOM 2.299e+01 4.187e+00 5.491 8.13e-08 ***
## date 1.062e-03 5.678e-03 0.187 0.851692
## open -4.001e-01 2.191e-01 -1.826 0.068745 .
## high 4.892e-01 2.418e-01 2.023 0.043898 *
## low -1.355e-01 2.400e-01 -0.565 0.572666
## close -3.074e-01 2.799e-01 -1.099 0.272808
## volume 4.622e-09 2.153e-09 2.146 0.032586 *
## percent_change_price -8.520e-02 9.834e-02 -0.866 0.386905
## percent_change_volume_over_last_wk -7.305e-03 3.885e-03 -1.880 0.060959 .
## previous_weeks_volume -3.061e-09 1.514e-09 -2.022 0.044001 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.36 on 321 degrees of freedom
## Multiple R-squared: 0.2013, Adjusted R-squared: 0.1067
## F-statistic: 2.129 on 38 and 321 DF, p-value: 0.0002404
Here we redo vif() just to make sure the variables removed in our first model are the same.
vif(lm_model_full_i)
## GVIF Df GVIF^(1/(2*Df))
## stock 3927.030715 29 1.153365
## date 1.216898 1 1.103131
## open 3052.736796 1 55.251577
## high 3880.393244 1 62.292803
## low 3559.092096 1 59.658127
## close 5072.945663 1 71.224614
## volume 9.148146 1 3.024590
## percent_change_price 4.095707 1 2.023785
## percent_change_volume_over_last_wk 1.796414 1 1.340304
## previous_weeks_volume 4.467358 1 2.113613
lm_model_i <- lm(percent_change_next_weeks_price ~ stock + date +
volume + percent_change_price + percent_change_volume_over_last_wk +
previous_weeks_volume,
data = train_df_inputted)
vif(lm_model_i)
## GVIF Df GVIF^(1/(2*Df))
## stock 14.574951 29 1.047279
## date 1.058982 1 1.029069
## volume 8.928571 1 2.988071
## percent_change_price 1.253402 1 1.119554
## percent_change_volume_over_last_wk 1.466034 1 1.210799
## previous_weeks_volume 4.422164 1 2.102894
close, high, low, and
open were removed the same as last model.
step_lm_model_i <- step(lm_model_i, direction = 'both', trace = FALSE)
summary(step_lm_model_i)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ volume + previous_weeks_volume,
## data = train_df_inputted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.350 -1.275 0.040 1.529 5.983
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.856e-01 1.615e-01 3.007 0.002829 **
## volume 2.162e-09 1.111e-09 1.946 0.052473 .
## previous_weeks_volume -4.024e-09 1.118e-09 -3.600 0.000364 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.456 on 357 degrees of freedom
## Multiple R-squared: 0.0382, Adjusted R-squared: 0.03282
## F-statistic: 7.09 on 2 and 357 DF, p-value: 0.0009557
We ended up with 2 variables instead of just the 1 in the same model with different dataset.
volume is barely non-significant.
All other variables are either multicollinear, or they were dropped in stepwise selection.
Our stepwise linear model on NA inputted dataset explains about 3.282% of the variance in percent_change_next_weeks_price.
Better than model on df_NA_removed
par(mfrow = c(2, 2))
plot(step_lm_model_i)
par(mfrow = c(1, 1))
Almost the same results as our model on the Removed NA’s dataset.
Residuals vs fitted does seem to be more along the 0.
Scale-Location also seems to be more along the 0 and curves at the end.
summary(influence.measures(step_lm_model_i))
## Potentially influential observations of
## lm(formula = percent_change_next_weeks_price ~ volume + previous_weeks_volume, data = train_df_inputted) :
##
## dfb.1_ dfb.volm dfb.pr__ dffit cov.r cook.d hat
## 4 0.09 0.01 0.00 0.12 0.97_* 0.01 0.00
## 15 -0.13 0.02 0.03 -0.13 0.97_* 0.01 0.00
## 17 0.14 -0.04 0.00 0.14 0.96_* 0.01 0.00
## 37 -0.29 1.79_* -1.40_* 1.80_* 1.66_* 1.07_* 0.41_*
## 38 0.12 0.15 -0.36 -0.40_* 1.25_* 0.05 0.19_*
## 39 0.19 -0.31 -0.01 -0.48_* 1.06_* 0.08 0.07_*
## 40 -0.32 0.31 0.20 0.72_* 1.06_* 0.17 0.09_*
## 41 -0.18 -0.06 0.37 0.51_* 1.08_* 0.09 0.08_*
## 42 -0.03 0.02 0.03 0.07 1.05_* 0.00 0.04_*
## 43 -0.01 -0.01 0.02 0.02 1.05_* 0.00 0.04_*
## 44 0.03 -0.08 0.02 -0.10 1.04_* 0.00 0.04_*
## 45 -0.07 0.06 0.07 0.19 1.04_* 0.01 0.04_*
## 46 0.03 -0.04 -0.01 -0.09 1.06_* 0.00 0.05_*
## 47 0.19 -0.15 -0.19 -0.49_* 1.02 0.08 0.05_*
## 48 -0.03 0.01 0.04 0.07 1.06_* 0.00 0.05_*
## 61 0.00 0.01 -0.01 0.01 1.03_* 0.00 0.02
## 65 -0.12 0.00 -0.20 -0.46_* 0.71_* 0.06 0.00
## 66 0.05 -0.27 0.19 -0.28_* 1.15_* 0.03 0.13_*
## 67 -0.06 -0.18 0.32 0.33_* 1.12_* 0.04 0.11_*
## 101 0.14 -0.02 -0.02 0.14 0.96_* 0.01 0.00
## 119 0.00 0.00 0.00 0.00 1.03_* 0.00 0.02
## 120 0.00 -0.14 0.24 0.26 1.01 0.02 0.03_*
## 127 -0.15 0.04 0.01 -0.15 0.96_* 0.01 0.00
## 139 -0.24 0.10 -0.05 -0.26 0.86_* 0.02 0.00
## 157 0.00 0.03 -0.03 0.03 1.04_* 0.00 0.03_*
## 241 -0.21 -0.06 0.13 -0.24 0.91_* 0.02 0.00
## 253 -0.01 -0.05 0.04 -0.05 1.03_* 0.00 0.02
## 256 0.00 -0.03 0.02 -0.03 1.03_* 0.00 0.02
## 265 -0.01 -0.04 0.04 -0.05 1.04_* 0.00 0.03_*
## 268 0.05 0.13 -0.07 0.18 0.97_* 0.01 0.01
## 275 0.00 -0.03 0.02 -0.03 1.03_* 0.00 0.02
## 276 0.00 -0.02 0.04 0.04 1.03_* 0.00 0.02
## 300 0.07 0.02 0.00 0.12 0.97_* 0.00 0.00
Almost the same as the model on Removed NA’s dataset.
Hopefully transformations will help make these points less influential since we would be removing an entire stock.
lm_model_i_ni <- lm(percent_change_next_weeks_price ~ volume + previous_weeks_volume,
data = train_df_inputted[-c(4, 17, 101, 127, 139, 232,
268, 276, 300), ])
summary(lm_model_i_ni)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ volume + previous_weeks_volume,
## data = train_df_inputted[-c(4, 17, 101, 127, 139, 232, 268,
## 276, 300), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.304 -1.249 0.075 1.514 5.584
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.674e-01 1.528e-01 3.059 0.002390 **
## volume 1.929e-09 1.046e-09 1.844 0.065970 .
## previous_weeks_volume -3.924e-09 1.056e-09 -3.716 0.000236 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.3 on 348 degrees of freedom
## Multiple R-squared: 0.04332, Adjusted R-squared: 0.03782
## F-statistic: 7.878 on 2 and 348 DF, p-value: 0.0004505
par(mfrow = c(2, 2))
plot(lm_model_i_ni)
par(mfrow = c(1, 1))
# Calculate RMSE
predictions <- predict(lm_model_i_ni, newdata = test_df_inputted) # Get predictions
actuals <- test_df_inputted$percent_change_next_weeks_price # Actual values
rmse <- sqrt(mean((actuals - predictions)^2)) # Compute RMSE
print(rmse)
## [1] 2.839142
We are going to run skewness() function from the e1071 package to test for skewness.
Based on this and our previous histograms we are going to decide if and which variables to apply transformations to.
Interpreting the results:
Skewness > 0 → Right-skewed (positive skew)
Skewness < 0 → Left-skewed (negative skew)
Skewness ≈ 0 → Approximately symmetric
#Skewness for all numeric variables in our df_NA_removed
knitr::kable(sapply(df_NA_removed[, sapply(df_NA_removed, is.numeric)], skewness))
| x | |
|---|---|
| open | 1.2663722 |
| high | 1.2663619 |
| low | 1.2759798 |
| close | 1.2702365 |
| volume | 2.8568194 |
| percent_change_price | -0.4122834 |
| percent_change_volume_over_last_wk | 2.5334104 |
| previous_weeks_volume | 3.2429757 |
| percent_change_next_weeks_price | -0.1366288 |
| percent_return_next_dividend | 0.3967358 |
#Skewness for all numeric variables in our df_NA_inputted
knitr::kable(sapply(df_NA_inputted[, sapply(df_NA_inputted, is.numeric)], skewness))
| x | |
|---|---|
| open | 1.2643025 |
| high | 1.2643332 |
| low | 1.2747240 |
| close | 1.2692821 |
| volume | 3.2105227 |
| percent_change_price | -0.3793271 |
| percent_change_volume_over_last_wk | 2.5996699 |
| previous_weeks_volume | 3.2776377 |
| percent_change_next_weeks_price | -0.1716834 |
| percent_return_next_dividend | 0.3927708 |
For both datasets we get pretty much the same results with a little variation in the numbers.
percent_change_next_weeks_price is the only
normally distributed variable.
Left skewed variables:
percent_change_price
Right skewed variables:
open, high, low,
close, volume,
percent_change_volume_over_last_wk, and
previous_weeks_volume.
previous_weeks_volume and
percent_change_volume_over_last_wk are Highly right
skewed
skewness(log(df_NA_removed$open + 1))
## [1] -0.1056558
skewness(log(df_NA_removed$high + 1))
## [1] -0.1028366
skewness(log(df_NA_removed$low + 1))
## [1] -0.1130192
skewness(log(df_NA_removed$close + 1))
## [1] -0.1119091
skewness(log(df_NA_removed$volume + 1))
## [1] 0.5366983
skewness(sign(df_NA_removed$percent_change_price) * abs(df_NA_removed$percent_change_price)^(1/3)) #cube root transformation
## [1] -0.01831936
skewness(sign(df_NA_removed$percent_change_volume_over_last_wk) * abs(df_NA_removed$percent_change_volume_over_last_wk)^(1/3)) #cube root transformation
## [1] 0.09067206
skewness(log(df_NA_removed$previous_weeks_volume + 1))
## [1] 0.5419658
Since percent_change_price won’t allow us to do it’s
transformation inside the formula, we create a new variable to use the
transformation in our model.
train_df_removed$percent_change_price_transformed <- sign(train_df_removed$percent_change_price) * abs(train_df_removed$percent_change_price)^(1/3)
This full model is now using the optimal transformations for each variable.
t_lm_model_full_r <- lm(percent_change_next_weeks_price ~
stock + date +
log(open + 1) +
log(high + 1) +
log(low + 1) +
log(close + 1) +
log(volume + 1) +
percent_change_price_transformed +
percent_change_volume_over_last_wk +
log(previous_weeks_volume + 1),
data = train_df_removed)
summary(t_lm_model_full_r)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + date +
## log(open + 1) + log(high + 1) + log(low + 1) + log(close +
## 1) + log(volume + 1) + percent_change_price_transformed +
## percent_change_volume_over_last_wk + log(previous_weeks_volume +
## 1), data = train_df_removed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1822 -1.1612 0.0794 1.3467 8.7922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.592e+01 8.999e+01 0.621 0.53479
## stockAXP 2.261e+01 4.013e+00 5.635 4.13e-08 ***
## stockBA 3.387e+01 5.821e+00 5.819 1.56e-08 ***
## stockBAC -7.082e+00 1.640e+00 -4.319 2.15e-05 ***
## stockCAT 4.226e+01 7.168e+00 5.896 1.03e-08 ***
## stockCSCO -4.784e-01 1.396e+00 -0.343 0.73200
## stockCVX 4.139e+01 7.104e+00 5.826 1.51e-08 ***
## stockDD 2.750e+01 4.614e+00 5.961 7.26e-09 ***
## stockDIS 2.171e+01 3.719e+00 5.838 1.41e-08 ***
## stockGE 3.304e+00 1.453e+00 2.274 0.02368 *
## stockHD 1.817e+01 3.280e+00 5.539 6.81e-08 ***
## stockHPQ 2.067e+01 4.043e+00 5.112 5.79e-07 ***
## stockIBM 5.240e+01 9.035e+00 5.800 1.73e-08 ***
## stockINTC 3.312e+00 1.576e+00 2.101 0.03653 *
## stockJNJ 2.846e+01 5.223e+00 5.449 1.08e-07 ***
## stockJPM 2.180e+01 4.163e+00 5.235 3.17e-07 ***
## stockKO 3.091e+01 5.393e+00 5.732 2.48e-08 ***
## stockKRFT 1.465e+01 2.681e+00 5.465 9.98e-08 ***
## stockMCD 3.416e+01 6.031e+00 5.664 3.55e-08 ***
## stockMMM 3.947e+01 6.759e+00 5.839 1.40e-08 ***
## stockMRK 1.453e+01 2.902e+00 5.006 9.65e-07 ***
## stockMSFT 7.722e+00 2.352e+00 3.283 0.00115 **
## stockPFE 2.780e+00 1.288e+00 2.159 0.03169 *
## stockPG 2.948e+01 5.392e+00 5.468 9.78e-08 ***
## stockT 1.142e+01 2.356e+00 4.845 2.06e-06 ***
## stockTRV 2.994e+01 5.023e+00 5.962 7.21e-09 ***
## stockUTX 3.763e+01 6.391e+00 5.888 1.08e-08 ***
## stockVZ 1.746e+01 3.205e+00 5.447 1.09e-07 ***
## stockWMT 2.590e+01 4.766e+00 5.434 1.17e-07 ***
## stockXOM 3.531e+01 6.440e+00 5.484 9.05e-08 ***
## date -9.309e-04 6.011e-03 -0.155 0.87703
## log(open + 1) -9.821e+00 1.745e+01 -0.563 0.57411
## log(high + 1) 1.746e+01 1.735e+01 1.006 0.31515
## log(low + 1) 1.951e+01 1.513e+01 1.289 0.19838
## log(close + 1) -4.974e+01 1.712e+01 -2.906 0.00394 **
## log(volume + 1) 1.099e+00 1.314e+00 0.837 0.40346
## percent_change_price_transformed 2.735e-01 2.756e-01 0.992 0.32185
## percent_change_volume_over_last_wk -2.857e-03 1.099e-02 -0.260 0.79516
## log(previous_weeks_volume + 1) 1.586e-01 1.224e+00 0.130 0.89698
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.293 on 291 degrees of freedom
## Multiple R-squared: 0.2214, Adjusted R-squared: 0.1197
## F-statistic: 2.177 on 38 and 291 DF, p-value: 0.0001769
We can already see adjusted R-squared increased from 0.08353 to 0.1197
vif(t_lm_model_full_r)
## GVIF Df GVIF^(1/(2*Df))
## stock 7352.325784 29 1.165903
## date 1.111689 1 1.054367
## log(open + 1) 6488.596534 1 80.551825
## log(high + 1) 6427.826500 1 80.173727
## log(low + 1) 4927.769768 1 70.198075
## log(close + 1) 6311.229996 1 79.443250
## log(volume + 1) 108.909771 1 10.435984
## percent_change_price_transformed 6.662224 1 2.581128
## percent_change_volume_over_last_wk 15.235487 1 3.903266
## log(previous_weeks_volume + 1) 96.801153 1 9.838758
had to remove open, high, low,
and close.
t_lm_model_r <- lm(percent_change_next_weeks_price ~
stock + date +
log(volume + 1) +
percent_change_price_transformed +
percent_change_volume_over_last_wk +
log(previous_weeks_volume + 1),
data = train_df_removed)
vif(t_lm_model_r)
## GVIF Df GVIF^(1/(2*Df))
## stock 32.055606 29 1.061607
## date 1.050758 1 1.025065
## log(volume + 1) 89.729022 1 9.472540
## percent_change_price_transformed 1.221011 1 1.104994
## percent_change_volume_over_last_wk 10.206338 1 3.194736
## log(previous_weeks_volume + 1) 75.215769 1 8.672702
t_step_lm_model_r <- step(t_lm_model_r, direction = 'both', trace = FALSE)
summary(t_step_lm_model_r)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ log(previous_weeks_volume +
## 1), data = train_df_removed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1788 -1.3173 0.0646 1.5207 6.1766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.2236 2.3822 2.613 0.0094 **
## log(previous_weeks_volume + 1) -0.3334 0.1315 -2.535 0.0117 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.424 on 328 degrees of freedom
## Multiple R-squared: 0.01922, Adjusted R-squared: 0.01623
## F-statistic: 6.429 on 1 and 328 DF, p-value: 0.01169
Model ends up using the same 1 variable as non-transformed model.
log(previous_weeks_volume)
Also performs slightly worse. Adj. R-squared dropped from 0.02098 to 0.01623
par(mfrow = c(2, 2))
plot(t_step_lm_model_r)
par(mfrow = c(1, 1))
All plots seem to be behaving a lot normally than the models with no transformations.
summary(influence.measures(t_step_lm_model_r))
## Potentially influential observations of
## lm(formula = percent_change_next_weeks_price ~ log(previous_weeks_volume + 1), data = train_df_removed) :
##
## dfb.1_ dfb.l(+1 dffit cov.r cook.d hat
## 4 -0.08 0.09 0.16 0.97_* 0.01 0.00
## 17 0.05 -0.05 0.15 0.97_* 0.01 0.00
## 38 0.32 -0.33 -0.35_* 1.01 0.06 0.03_*
## 39 0.21 -0.21 -0.23 1.01 0.03 0.02_*
## 40 -0.29 0.30 0.32_* 1.00 0.05 0.02_*
## 41 -0.15 0.15 0.16 1.02_* 0.01 0.02_*
## 42 -0.02 0.02 0.03 1.03_* 0.00 0.02_*
## 43 0.03 -0.04 -0.04 1.03_* 0.00 0.02_*
## 44 0.03 -0.03 -0.03 1.02_* 0.00 0.01
## 45 -0.10 0.10 0.11 1.02_* 0.01 0.02_*
## 46 0.05 -0.05 -0.06 1.02_* 0.00 0.02_*
## 47 0.30 -0.30 -0.33_* 0.99 0.05 0.02_*
## 48 -0.01 0.01 0.01 1.03_* 0.00 0.02_*
## 64 -0.12 0.12 0.18 0.98_* 0.02 0.01
## 65 0.46 -0.48 -0.60_* 0.78_* 0.16 0.01
## 67 -0.01 0.01 0.01 1.03_* 0.00 0.02_*
## 101 0.08 -0.07 0.16 0.97_* 0.01 0.00
## 127 -0.10 0.09 -0.17 0.97_* 0.01 0.00
## 139 0.02 -0.04 -0.25_* 0.90_* 0.03 0.00
## 232 -0.02 0.02 -0.02 1.02_* 0.00 0.01
## 268 -0.10 0.11 0.18 0.97_* 0.02 0.00
## 276 -0.01 0.01 0.01 1.02_* 0.00 0.01
## 300 -0.09 0.10 0.16 0.98_* 0.01 0.00
# Calculate RMSE
predictions <- predict(t_step_lm_model_r, newdata = test_df_removed) # Get predictions
actuals <- test_df_removed$percent_change_next_weeks_price # Actual values
rmse <- sqrt(mean((actuals - predictions)^2)) # Compute RMSE
print(rmse)
## [1] 2.843812
#To be able to use transformation of percent_change_price
train_df_inputted$percent_change_price_transformed <- sign(train_df_inputted$percent_change_price) * abs(train_df_inputted$percent_change_price)^(1/3)
t_lm_model_full_i <- lm(percent_change_next_weeks_price ~
stock + date +
log(open + 1) +
log(high + 1) +
log(low + 1) +
log(close + 1) +
log(volume + 1) +
percent_change_price_transformed +
percent_change_volume_over_last_wk +
log(previous_weeks_volume + 1),
data = train_df_inputted)
summary(t_lm_model_full_i)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + date +
## log(open + 1) + log(high + 1) + log(low + 1) + log(close +
## 1) + log(volume + 1) + percent_change_price_transformed +
## percent_change_volume_over_last_wk + log(previous_weeks_volume +
## 1), data = train_df_inputted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5666 -1.2241 0.1678 1.3135 8.2590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.417e+01 9.106e+01 0.485 0.627967
## stockAXP 2.277e+01 3.790e+00 6.007 5.12e-09 ***
## stockBA 3.336e+01 5.475e+00 6.093 3.16e-09 ***
## stockBAC -5.704e+00 1.577e+00 -3.616 0.000347 ***
## stockCAT 4.153e+01 6.729e+00 6.171 2.04e-09 ***
## stockCSCO 4.296e-01 1.325e+00 0.324 0.746029
## stockCVX 4.072e+01 6.665e+00 6.109 2.89e-09 ***
## stockDD 2.734e+01 4.348e+00 6.289 1.04e-09 ***
## stockDIS 2.143e+01 3.495e+00 6.131 2.56e-09 ***
## stockGE 3.666e+00 1.359e+00 2.698 0.007336 **
## stockHD 1.839e+01 3.091e+00 5.949 7.07e-09 ***
## stockHPQ 2.094e+01 3.798e+00 5.512 7.29e-08 ***
## stockIBM 5.142e+01 8.481e+00 6.063 3.75e-09 ***
## stockINTC 3.862e+00 1.498e+00 2.578 0.010390 *
## stockJNJ 2.820e+01 4.937e+00 5.713 2.54e-08 ***
## stockJPM 2.186e+01 3.902e+00 5.604 4.52e-08 ***
## stockKO 3.048e+01 5.089e+00 5.991 5.61e-09 ***
## stockKRFT 1.476e+01 2.557e+00 5.775 1.82e-08 ***
## stockMCD 3.358e+01 5.688e+00 5.903 9.09e-09 ***
## stockMMM 3.895e+01 6.368e+00 6.116 2.78e-09 ***
## stockMRK 1.418e+01 2.775e+00 5.112 5.50e-07 ***
## stockMSFT 8.215e+00 2.231e+00 3.682 0.000271 ***
## stockPFE 3.061e+00 1.221e+00 2.508 0.012651 *
## stockPG 2.932e+01 5.090e+00 5.761 1.96e-08 ***
## stockT 1.154e+01 2.240e+00 5.154 4.47e-07 ***
## stockTRV 2.958e+01 4.736e+00 6.245 1.34e-09 ***
## stockUTX 3.701e+01 6.023e+00 6.146 2.36e-09 ***
## stockVZ 1.709e+01 3.027e+00 5.645 3.63e-08 ***
## stockWMT 2.582e+01 4.496e+00 5.744 2.15e-08 ***
## stockXOM 3.499e+01 6.023e+00 5.810 1.51e-08 ***
## date -1.581e-04 6.097e-03 -0.026 0.979330
## log(open + 1) -8.782e+00 1.610e+01 -0.546 0.585762
## log(high + 1) 1.507e+01 1.657e+01 0.910 0.363538
## log(low + 1) 2.349e+01 1.474e+01 1.593 0.112032
## log(close + 1) -5.183e+01 1.586e+01 -3.267 0.001203 **
## log(volume + 1) 1.183e+00 6.834e-01 1.730 0.084535 .
## percent_change_price_transformed 3.269e-01 2.473e-01 1.322 0.187163
## percent_change_volume_over_last_wk -3.255e-03 4.291e-03 -0.759 0.448702
## log(previous_weeks_volume + 1) -2.188e-02 2.932e-02 -0.746 0.456123
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.336 on 321 degrees of freedom
## Multiple R-squared: 0.2174, Adjusted R-squared: 0.1247
## F-statistic: 2.346 on 38 and 321 DF, p-value: 3.436e-05
vif(t_lm_model_full_i)
## GVIF Df GVIF^(1/(2*Df))
## stock 5600.343135 29 1.160445
## date 1.431738 1 1.196552
## log(open + 1) 5783.706012 1 76.050681
## log(high + 1) 6125.080931 1 78.262896
## log(low + 1) 4894.526710 1 69.960894
## log(close + 1) 5665.392775 1 75.268803
## log(volume + 1) 31.517979 1 5.614088
## percent_change_price_transformed 5.684867 1 2.384296
## percent_change_volume_over_last_wk 2.237066 1 1.495682
## log(previous_weeks_volume + 1) 1.470997 1 1.212847
removed same 4 variables as previous model.
t_lm_model_i <- lm(percent_change_next_weeks_price ~
stock + date +
log(volume + 1) +
percent_change_price_transformed +
percent_change_volume_over_last_wk +
log(previous_weeks_volume + 1),
data = train_df_inputted)
vif(t_lm_model_i)
## GVIF Df GVIF^(1/(2*Df))
## stock 27.279018 29 1.058658
## date 1.356846 1 1.164837
## log(volume + 1) 25.616579 1 5.061282
## percent_change_price_transformed 1.168733 1 1.081079
## percent_change_volume_over_last_wk 1.813435 1 1.346638
## log(previous_weeks_volume + 1) 1.456620 1 1.206905
t_step_lm_model_i <- step(t_lm_model_i, direction = 'both', trace = FALSE)
summary(t_step_lm_model_i)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ log(volume + 1) +
## log(previous_weeks_volume + 1), data = train_df_inputted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.269 -1.229 0.056 1.546 6.847
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.16570 2.33246 2.215 0.0274 *
## log(volume + 1) -0.20813 0.12952 -1.607 0.1089
## log(previous_weeks_volume + 1) -0.06736 0.02572 -2.619 0.0092 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.468 on 357 degrees of freedom
## Multiple R-squared: 0.02882, Adjusted R-squared: 0.02338
## F-statistic: 5.298 on 2 and 357 DF, p-value: 0.005404
Adjusted R-squared dropped from .032 to .02338.
par(mfrow = c(2, 2))
plot(t_step_lm_model_i)
par(mfrow = c(1, 1))
Once again, all of our plots seem to behave a lot better than the models with no transformations.
summary(influence.measures(t_step_lm_model_i))
## Potentially influential observations of
## lm(formula = percent_change_next_weeks_price ~ log(volume + 1) + log(previous_weeks_volume + 1), data = train_df_inputted) :
##
## dfb.1_ dfb.l(+1) dfb.l(_+1 dffit cov.r cook.d hat
## 1 0.12 -0.20 0.43 -0.47_* 1.00 0.07 0.04_*
## 4 -0.09 0.09 0.04 0.17 0.96_* 0.01 0.00
## 13 0.05 0.00 -0.22 0.24 1.03_* 0.02 0.03_*
## 17 0.08 -0.08 0.04 0.15 0.96_* 0.01 0.00
## 25 -0.01 0.00 0.04 -0.05 1.04_* 0.00 0.03_*
## 37 -0.43 0.54 -0.57 0.76_* 1.01 0.19 0.06_*
## 38 0.25 -0.25 -0.07 -0.29_* 0.99 0.03 0.02
## 40 -0.28 0.27 0.05 0.30_* 1.00 0.03 0.02
## 42 -0.02 0.02 0.00 0.02 1.03_* 0.00 0.02
## 44 0.03 -0.03 -0.01 -0.03 1.03_* 0.00 0.02
## 46 0.06 -0.05 -0.01 -0.06 1.03_* 0.00 0.02
## 47 0.29 -0.28 -0.06 -0.32_* 0.98 0.03 0.02
## 48 0.00 0.00 0.00 0.00 1.03_* 0.00 0.02
## 49 -0.02 0.01 0.05 -0.06 1.04_* 0.00 0.03_*
## 61 0.00 0.01 -0.01 0.02 1.05_* 0.00 0.04_*
## 64 -0.15 0.15 0.04 0.20 0.97_* 0.01 0.01
## 65 0.40 -0.39 -0.14 -0.56_* 0.72_* 0.09 0.01
## 66 -0.03 0.03 0.00 0.03 1.03_* 0.00 0.02
## 73 0.01 0.00 -0.04 0.04 1.04_* 0.00 0.03_*
## 85 0.04 -0.02 -0.11 0.11 1.04_* 0.00 0.03_*
## 97 0.00 -0.01 0.05 -0.05 1.04_* 0.00 0.03_*
## 101 0.04 -0.04 0.03 0.14 0.96_* 0.01 0.00
## 109 0.00 0.00 0.00 0.00 1.05_* 0.00 0.04_*
## 121 0.04 0.01 -0.26 0.27 1.02 0.02 0.03_*
## 127 -0.11 0.11 -0.03 -0.17 0.96_* 0.01 0.00
## 133 0.00 0.03 -0.13 0.14 1.04_* 0.01 0.03_*
## 139 -0.09 0.09 -0.09 -0.26 0.86_* 0.02 0.00
## 145 0.01 -0.01 -0.03 0.03 1.04_* 0.00 0.03_*
## 157 -0.02 0.03 -0.06 0.07 1.05_* 0.00 0.04_*
## 169 -0.01 0.00 0.07 -0.08 1.04_* 0.00 0.03_*
## 181 -0.05 0.09 -0.20 0.22 1.04_* 0.02 0.04_*
## 193 0.00 0.00 0.01 -0.01 1.04_* 0.00 0.03_*
## 205 -0.01 0.00 0.05 -0.06 1.04_* 0.00 0.03_*
## 217 0.00 -0.02 0.12 -0.12 1.04_* 0.01 0.03_*
## 229 0.04 -0.03 -0.07 0.09 1.04_* 0.00 0.04_*
## 241 -0.05 -0.09 0.71 -0.74_* 0.91_* 0.17 0.03_*
## 253 0.02 -0.03 0.06 -0.06 1.05_* 0.00 0.04_*
## 265 0.01 -0.02 0.03 -0.03 1.05_* 0.00 0.04_*
## 268 -0.18 0.18 0.04 0.23 0.96_* 0.02 0.01
## 277 0.00 0.00 -0.02 0.02 1.04_* 0.00 0.03_*
## 289 0.02 -0.04 0.12 -0.13 1.04_* 0.01 0.04_*
## 300 -0.10 0.10 0.04 0.16 0.97_* 0.01 0.01
## 301 0.04 -0.02 -0.08 0.09 1.04_* 0.00 0.03_*
## 313 -0.03 0.02 0.05 -0.05 1.04_* 0.00 0.04_*
## 325 0.04 -0.10 0.32 -0.34_* 1.02 0.04 0.04_*
## 337 0.01 0.00 -0.05 0.06 1.04_* 0.00 0.03_*
## 349 0.00 0.04 -0.17 0.18 1.04_* 0.01 0.03_*
# Calculate RMSE
predictions <- predict(t_step_lm_model_i, newdata = test_df_inputted) # Get predictions
actuals <- test_df_inputted$percent_change_next_weeks_price # Actual values
rmse <- sqrt(mean((actuals - predictions)^2)) # Compute RMSE
print(rmse)
## [1] 2.839231
t_lm_model_full_r_ni <- lm(percent_change_next_weeks_price ~
stock + date +
log(open + 1) +
log(high + 1) +
log(low + 1) +
log(close + 1) +
log(volume + 1) +
percent_change_price_transformed +
percent_change_volume_over_last_wk +
log(previous_weeks_volume + 1),
data = train_df_removed[-c(4, 17, 101, 127, 139, 232,
268, 276, 300), ])
vif(t_lm_model_full_r_ni)
## GVIF Df GVIF^(1/(2*Df))
## stock 7765.907359 29 1.167004
## date 1.112477 1 1.054740
## log(open + 1) 6612.568873 1 81.317703
## log(high + 1) 6520.929541 1 80.752273
## log(low + 1) 5089.765047 1 71.342589
## log(close + 1) 6455.649689 1 80.347058
## log(volume + 1) 106.992529 1 10.343719
## percent_change_price_transformed 6.673583 1 2.583328
## percent_change_volume_over_last_wk 14.944679 1 3.865835
## log(previous_weeks_volume + 1) 94.902425 1 9.741788
Removed open, high, close,
low .
t_lm_model_r_ni <- lm(percent_change_next_weeks_price ~
stock + date +
log(volume + 1) +
percent_change_price_transformed +
percent_change_volume_over_last_wk +
log(previous_weeks_volume + 1),
data = train_df_removed[-c(4, 17, 101, 127, 139, 232,
268, 276, 300), ])
vif(t_lm_model_r_ni)
## GVIF Df GVIF^(1/(2*Df))
## stock 32.499758 29 1.061859
## date 1.046881 1 1.023172
## log(volume + 1) 88.631048 1 9.414406
## percent_change_price_transformed 1.237671 1 1.112506
## percent_change_volume_over_last_wk 9.979608 1 3.159052
## log(previous_weeks_volume + 1) 73.906037 1 8.596862
t_step_lm_model_r_ni <- step(t_lm_model_r_ni, direction = 'both', trace = FALSE)
summary(t_step_lm_model_r_ni)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ log(volume + 1),
## data = train_df_removed[-c(4, 17, 101, 127, 139, 232, 268,
## 276, 300), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2348 -1.3293 0.0021 1.4818 6.3440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.3016 2.3975 2.628 0.00899 **
## log(volume + 1) -0.3366 0.1326 -2.539 0.01160 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.375 on 319 degrees of freedom
## Multiple R-squared: 0.0198, Adjusted R-squared: 0.01673
## F-statistic: 6.445 on 1 and 319 DF, p-value: 0.0116
Also performs slightly worse. Adj. R-squared dropped from 0.02098 to 0.01623, back up to 0.01673.
# Calculate RMSE
predictions <- predict(t_step_lm_model_r_ni, newdata = test_df_removed) # Get predictions
actuals <- test_df_removed$percent_change_next_weeks_price # Actual values
rmse <- sqrt(mean((actuals - predictions)^2)) # Compute RMSE
print(rmse)
## [1] 2.844462
t_lm_model_full_i_ni <- lm(percent_change_next_weeks_price ~
stock + date +
log(open + 1) +
log(high + 1) +
log(low + 1) +
log(close + 1) +
log(volume + 1) +
percent_change_price_transformed +
percent_change_volume_over_last_wk +
log(previous_weeks_volume + 1),
data = train_df_inputted[-c(4, 17, 101, 127, 139, 232,
268, 276, 300), ])
vif(t_lm_model_full_i_ni)
## GVIF Df GVIF^(1/(2*Df))
## stock 5841.723739 29 1.161289
## date 1.429843 1 1.195761
## log(open + 1) 5874.638856 1 76.646193
## log(high + 1) 6100.789330 1 78.107550
## log(low + 1) 4998.728237 1 70.701685
## log(close + 1) 5647.001938 1 75.146536
## log(volume + 1) 31.713684 1 5.631490
## percent_change_price_transformed 5.740249 1 2.395882
## percent_change_volume_over_last_wk 2.263377 1 1.504452
## log(previous_weeks_volume + 1) 1.484811 1 1.218528
t_lm_model_i_ni <- lm(percent_change_next_weeks_price ~
stock + date +
log(volume + 1) +
percent_change_price_transformed +
percent_change_volume_over_last_wk +
log(previous_weeks_volume + 1),
data = train_df_inputted[-c(4, 17, 101, 127, 139, 232,
268, 276, 300), ])
vif(t_lm_model_i_ni)
## GVIF Df GVIF^(1/(2*Df))
## stock 27.636845 29 1.058896
## date 1.369514 1 1.170262
## log(volume + 1) 25.787878 1 5.078177
## percent_change_price_transformed 1.171139 1 1.082192
## percent_change_volume_over_last_wk 1.794374 1 1.339542
## log(previous_weeks_volume + 1) 1.464272 1 1.210071
t_step_lm_model_i_ni <- step(t_lm_model_i_ni, direction = 'both', trace = FALSE)
summary(t_step_lm_model_i_ni)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ log(volume + 1) +
## log(previous_weeks_volume + 1), data = train_df_inputted[-c(4,
## 17, 101, 127, 139, 232, 268, 276, 300), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1547 -1.1956 0.0968 1.4837 7.0296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.22365 2.19659 2.833 0.00488 **
## log(volume + 1) -0.26693 0.12193 -2.189 0.02924 *
## log(previous_weeks_volume + 1) -0.06935 0.02404 -2.885 0.00416 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.303 on 348 degrees of freedom
## Multiple R-squared: 0.04074, Adjusted R-squared: 0.03523
## F-statistic: 7.39 on 2 and 348 DF, p-value: 0.0007195
Adjusted R-squared dropped from .032 to .02338.
With this model Adjusted R-squared went up to 0.03523
# Calculate RMSE
predictions <- predict(t_step_lm_model_i_ni, newdata = test_df_inputted) # Get predictions
actuals <- test_df_inputted$percent_change_next_weeks_price # Actual values
rmse <- sqrt(mean((actuals - predictions)^2)) # Compute RMSE
print(rmse)
## [1] 2.843356
Since we found out after that our lagged variables aren’t good to introduce to our models, we will comment this part which now becomes useless.
# lagged_lm_model_full_r <- lm(percent_change_next_weeks_price ~ stock + date + open +
# high + low + close + volume +
# percent_change_price + percent_change_volume_over_last_wk +
# previous_weeks_volume +
# lagged_open + lagged_high + lagged_low + lagged_close + lagged_volume +
# lagged_percent_change_price + lagged_percent_change_volume +
# lagged_open_2 + lagged_high_2 + lagged_low_2 + lagged_close_2 +
# lagged_volume_2 +
# lagged_percent_change_price_2 + lagged_percent_change_volume_2 +
# lagged_previous_weeks_volume + lagged_previous_weeks_volume_2,
# data = train_df_removed)
#
#
# vif(lagged_lm_model_full_r)
we remove the same 4 variables and their lags. open, close, high, low.
# lagged_lm_model_r <- lm(percent_change_next_weeks_price ~ stock + date + volume +
# percent_change_price + percent_change_volume_over_last_wk +
# previous_weeks_volume +
# lagged_volume +
# lagged_percent_change_price + lagged_percent_change_volume +
# lagged_volume_2 +
# lagged_percent_change_price_2 + lagged_percent_change_volume_2,
# data = train_df_removed)
#
# vif(lagged_lm_model_r)
# lagged_step_lm_model_r <- step(lagged_lm_model_r, direction = 'both', trace = FALSE)
# summary(lagged_step_lm_model_r)
We end up with 2 variables instead of just 1. They are not statistically significant, but they explain a higher variance of 2.587%.
# predictions <- predict(lagged_step_lm_model_r, newdata = test_df_removed) # Get predictions
# actuals <- test_df_removed$percent_change_next_weeks_price # Actual values
# rmse <- sqrt(mean((actuals - predictions)^2)) # Compute RMSE
# print(rmse)
# lagged_lm_model_full_i <- lm(percent_change_next_weeks_price ~ stock + date + open +
# high + low + close + volume +
# percent_change_price + percent_change_volume_over_last_wk +
# previous_weeks_volume +
# lagged_open + lagged_high + lagged_low + lagged_close + lagged_volume +
# lagged_percent_change_price + lagged_percent_change_volume +
# lagged_open_2 + lagged_high_2 + lagged_low_2 + lagged_close_2 +
# lagged_volume_2 +
# lagged_percent_change_price_2 + lagged_percent_change_volume_2,
# data = train_df_inputted)
#
#
# vif(lagged_lm_model_full_i)
Remove same 4 and their lags.
# lagged_lm_model_i <- lm(percent_change_next_weeks_price ~ stock + date + volume +
# percent_change_price + percent_change_volume_over_last_wk +
# previous_weeks_volume +
# lagged_volume +
# lagged_percent_change_price + lagged_percent_change_volume +
# lagged_volume_2 +
# lagged_percent_change_price_2 + lagged_percent_change_volume_2,
# data = train_df_inputted)
#
# vif(lagged_lm_model_i)
# lagged_step_lm_model_i <- step(lagged_lm_model_i, direction = 'both', trace = FALSE)
# summary(lagged_step_lm_model_i)
Increase in Variance explained by the model.
Also uses more variables.
# predictions <- predict(lagged_step_lm_model_i, newdata = test_df_inputted) # Get predictions
# actuals <- test_df_inputted$percent_change_next_weeks_price # Actual values
# rmse <- sqrt(mean((actuals - predictions)^2)) # Compute RMSE
# print(rmse)
# lagged_t_lm_model_full_r <- lm(lm(percent_change_next_weeks_price ~
# stock + date +
# log(open + 1) +
# log(high + 1) +
# log(low + 1) +
# log(close + 1) +
# log(volume + 1) +
# percent_change_price_transformed +
# percent_change_volume_over_last_wk +
# log(previous_weeks_volume + 1) +
#
# # Lagged variables with corresponding transformations
# log(lagged_open + 1) +
# log(lagged_high + 1) +
# log(lagged_low + 1) +
# log(lagged_close + 1) +
# log(lagged_volume + 1) +
# log(lagged_open_2 + 1) +
# log(lagged_high_2 + 1) +
# log(lagged_low_2 + 1) +
# log(lagged_close_2 + 1) +
# log(lagged_volume_2 + 1),
# data = train_df_removed)
# )
#
# summary(lagged_t_lm_model_full_r)
# vif(lagged_t_lm_model_full_r)
# lagged_t_lm_model_r <- lm(percent_change_next_weeks_price ~
# stock + date +
# log(volume + 1) +
# percent_change_price_transformed +
# percent_change_volume_over_last_wk +
# log(previous_weeks_volume + 1) +
#
# # Lagged variables with corresponding transformations
# log(lagged_volume + 1) +
# log(lagged_volume_2 + 1),
# data = train_df_removed)
#
# vif(lagged_t_lm_model_r)
# lagged_t_step_lm_model_r <- step(lagged_t_lm_model_r, direction = 'both', trace = FALSE)
# summary(lagged_t_step_lm_model_r)
# predictions <- predict(lagged_t_step_lm_model_r, newdata = test_df_removed) # Get predictions
# actuals <- test_df_removed$percent_change_next_weeks_price # Actual values
# rmse <- sqrt(mean((actuals - predictions)^2)) # Compute RMSE
# print(rmse)
can’t use stock or date because they are
not numeric
tree_model_r <- tree(percent_change_next_weeks_price ~ open +
high + low + close + volume +
percent_change_price + percent_change_volume_over_last_wk +
previous_weeks_volume + percent_return_next_dividend,
data = train_df_removed)
summary(tree_model_r)
##
## Regression tree:
## tree(formula = percent_change_next_weeks_price ~ open + high +
## low + close + volume + percent_change_price + percent_change_volume_over_last_wk +
## previous_weeks_volume + percent_return_next_dividend, data = train_df_removed)
## Variables actually used in tree construction:
## [1] "percent_change_price" "percent_return_next_dividend"
## [3] "open" "volume"
## [5] "percent_change_volume_over_last_wk" "close"
## Number of terminal nodes: 18
## Residual mean deviance: 4.233 = 1321 / 312
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11.420000 -1.155000 -0.005521 0.000000 1.227000 4.891000
plot(tree_model_r)
text(tree_model_r)
predictions <- predict(tree_model_r, newdata = test_df_removed)
actuals <- test_df_removed$percent_change_next_weeks_price
sqrt(mean((actuals - predictions)^2))
## [1] 3.292289
set.seed(42)
cv_tree_model_r <- cv.tree(tree_model_r)
best_size <- cv_tree_model_r$size[which.min(cv_tree_model_r$dev)]
best_k <- cv_tree_model_r$k[which.min(cv_tree_model_r$dev)]
#plot the pruned tree
plot(cv_tree_model_r$size, cv_tree_model_r$dev, type = 'b')
plot(cv_tree_model_r$k, cv_tree_model_r$dev, type = 'b')
best size = 1
pruning at around 70 - 110 shows the most improvement in performance
Our plots show that the lower the size, the better our model. Also the higher the k(more pruning) complexity decreases and performance is better(lower error).
Pruning won’t work because our size is too small and based on what we know about our variables, it won’t be enough to actually make meaningful predictions
# p_tree_model_r <- prune.tree(tree_model_r, best = best_size)
# predictions <- predict(p_tree_model_r, newdata = test_df_removed)
tree_model_i <- tree(percent_change_next_weeks_price ~ open +
high + low + close + volume +
percent_change_price + percent_change_volume_over_last_wk +
previous_weeks_volume + percent_return_next_dividend,
data = train_df_inputted)
summary(tree_model_i)
##
## Regression tree:
## tree(formula = percent_change_next_weeks_price ~ open + high +
## low + close + volume + percent_change_price + percent_change_volume_over_last_wk +
## previous_weeks_volume + percent_return_next_dividend, data = train_df_inputted)
## Variables actually used in tree construction:
## [1] "previous_weeks_volume" "percent_change_price"
## [3] "volume" "percent_change_volume_over_last_wk"
## [5] "percent_return_next_dividend" "open"
## [7] "close"
## Number of terminal nodes: 14
## Residual mean deviance: 4.712 = 1630 / 346
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11.4100 -1.1790 0.2242 0.0000 1.2350 6.3080
plot(tree_model_i)
text(tree_model_i)
predictions <- predict(tree_model_i, newdata = test_df_inputted)
actuals <- test_df_inputted$percent_change_next_weeks_price
sqrt(mean((actuals - predictions)^2))
## [1] 3.103051
set.seed(42)
cv_tree_model_i <- cv.tree(tree_model_i)
best_size <- cv_tree_model_i$size[which.min(cv_tree_model_i$dev)]
best_k <- cv_tree_model_i$k[which.min(cv_tree_model_i$dev)]
#plot the pruned tree
plot(cv_tree_model_i$size, cv_tree_model_i$dev, type = 'b')
plot(cv_tree_model_i$k, cv_tree_model_i$dev, type = 'b')
Same as dataset with Removed NA’s.
Pruning results in size of 1 which will make our model have insignificant predictions.
First of all, we found out that ‘removed’ dataset worked better than inputted.
Second, most of our linear models seem to only like to use
previous_weeks_volume and it’s the only variable we ever
get that is significant enough.
lm_models = list()
lm_results = data.frame(stock = character(), RMSE = numeric(), stringsAsFactors = FALSE)
for (stock in unique(df_NA_removed$stock)) {
subset_data <- df_NA_removed[df_NA_removed$stock == stock, ]
train_data <- subset_data[subset_data$quarter == 1, ]
test_data <- subset_data[subset_data$quarter == 2, ]
model <- lm(percent_change_next_weeks_price ~ previous_weeks_volume,
data = train_data)
lm_models[[stock]] <- model
predictions <- predict(model, newdata = test_data)
actuals <- test_data$percent_change_next_weeks_price
rmse <- sqrt(mean((actuals - predictions)^2))
lm_results <- rbind(lm_results, data.frame(stock = stock, RMSE = rmse))
}
mean(lm_results$RMSE)
## [1] 2.893729
dt_models = list()
dt_results = data.frame(stock = character(), RMSE = numeric(), stringsAsFactors = FALSE)
for (stock in unique(df_NA_removed$stock)) {
subset_data <- df_NA_removed[df_NA_removed$stock == stock, ]
train_data <- df_NA_removed[df_NA_removed$quarter == 1, ]
test_data <- df_NA_removed[df_NA_removed$quarter == 2, ]
model <- tree(percent_change_next_weeks_price ~ previous_weeks_volume,
data = train_data)
dt_models[[stock]] <- model
predictions <- predict(model, newdata = test_data)
actuals <- test_data$percent_change_next_weeks_price
rmse <- sqrt(mean((actuals - predictions)^2))
dt_results <- rbind(dt_results, data.frame(stock = stock, RMSE = rmse))
}
mean(dt_results$RMSE)
## [1] 2.855011
svr_model <- svm(percent_change_next_weeks_price ~ previous_weeks_volume,
data = train_data, kernel = 'radial')
svr_models <- list()
svr_results <- data.frame(stock = character(), RMSE = numeric(), stringsAsFactors = FALSE)
for (stock in unique(df_NA_removed$stock)) {
subset_data <- df_NA_removed[df_NA_removed$stock == stock, ]
train_data <- df_NA_removed[df_NA_removed$quarter == 1, ]
test_data <- df_NA_removed[df_NA_removed$quarter == 2, ]
model <- svm(percent_change_next_weeks_price ~ previous_weeks_volume,
data = train_data, kernel = 'radial')
svr_models[[stock]] <- model
predictions <- predict(model, newdata = test_data)
actuals <- test_data$percent_change_next_weeks_price
rmse <- sqrt(mean((actuals - predictions)^2))
svr_results <- rbind(svr_results, data.frame(stock = stock, RMSE = rmse))
}
mean(svr_results$RMSE)
## [1] 2.843871
library(tidyquant)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8 ✔ TTR 0.24.4
## ✔ quantmod 0.4.26 ✔ xts 0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ PerformanceAnalytics::kurtosis() masks e1071::kurtosis()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ car::recode() masks dplyr::recode()
## ✖ PerformanceAnalytics::skewness() masks e1071::skewness()
## ✖ quantmod::summary() masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Get daily historical data for S&P 500 (^GSPC)
sp500_data = tq_get("^GSPC", from = "2011-01-14", to = "2011-06-24", get = "stock.prices", periodicity = "weekly")
# View first few rows
head(sp500_data)
## # A tibble: 6 × 8
## symbol date open high low close volume adjusted
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ^GSPC 2011-01-10 1286. 1293. 1280. 1293. 8972430000 1293.
## 2 ^GSPC 2011-01-17 1293. 1296. 1271. 1283. 19899340000 1283.
## 3 ^GSPC 2011-01-24 1283. 1303. 1275. 1276. 23156650000 1276.
## 4 ^GSPC 2011-01-31 1276. 1311 1276. 1311. 21726860000 1311.
## 5 ^GSPC 2011-02-07 1312. 1331. 1312. 1329. 20109950000 1329.
## 6 ^GSPC 2011-02-14 1329. 1344. 1325. 1343. 12589110000 1343.
# Save data to CSV
write.csv(sp500_data, "SP500_data.csv", row.names = FALSE)
#SP500
returnSP500 <- na.omit(Delt(sp500_data$close))
# loop for stocks in df_NA_removed
stock_returns <- list()
for (stock in unique(df_NA_removed$stock)) {
stock_returns[[stock]] <- na.omit(Delt(df_NA_removed$close[df_NA_removed$stock == stock]))
}
stock_returns_df <- do.call(cbind, lapply(stock_returns, function(x) x[, 1]))
colnames(stock_returns_df) <- names(stock_returns)
stock_returns_df <- as.data.frame(stock_returns_df)
#add SP500 to dataframe
returnSP500_vector <- as.vector(returnSP500)
stock_returns_df$SP500 <- returnSP500_vector
boxplot(stock_returns_df, main = 'Expected Return', xlab = 'Stock', ylab = 'Return')
DataMean <- apply(stock_returns_df, 2, mean)
DataSD <- apply(stock_returns_df, 2, sd)
cbind(DataMean, DataSD)
## DataMean DataSD
## AA -0.0014758553 0.03486277
## AXP 0.0022992844 0.02809141
## BA 0.0011022005 0.02795967
## BAC -0.0156049349 0.02911342
## CAT 0.0032480226 0.03411175
## CSCO -0.0143473271 0.03971904
## CVX 0.0026118394 0.02482674
## DD 0.0022017080 0.02723925
## DIS -0.0015588482 0.02808084
## GE -0.0016902251 0.02574530
## HD -0.0007599981 0.02198521
## HPQ -0.0113893855 0.03927282
## IBM 0.0043318296 0.01836236
## INTC 0.0007425153 0.03258276
## JNJ 0.0019245403 0.02114399
## JPM -0.0053481151 0.02181402
## KRFT 0.0044681344 0.01816930
## KO 0.0013550453 0.01664583
## MCD 0.0045364377 0.01954457
## MMM 0.0015614511 0.02073424
## MRK 0.0006072965 0.02069553
## MSFT -0.0064307236 0.01898709
## PFE 0.0042807309 0.02655822
## PG -0.0018427834 0.01779427
## T 0.0031507287 0.01934575
## TRV 0.0017787893 0.01920565
## UTX 0.0030140170 0.02130799
## VZ 0.0008114707 0.01799993
## WMT -0.0017666125 0.01910305
## XOM -0.0002927384 0.02512918
## SP500 -0.0002213339 0.01499232
beta_results <- data.frame(stock = character(), beta = numeric())
for (stock in unique(df_NA_removed$stock)) {
lm_capm <- lm(stock_returns_df[[stock]] ~ stock_returns_df$SP500, data = stock_returns_df)
beta <- coef(lm_capm)[2]
beta_results <- rbind(beta_results, data.frame(stock = stock, beta = beta))
}
print(beta_results)
## stock beta
## stock_returns_df$SP500 AA 1.5139715
## stock_returns_df$SP5001 AXP 0.8964116
## stock_returns_df$SP5002 BA 1.3487835
## stock_returns_df$SP5003 BAC 0.7006978
## stock_returns_df$SP5004 CAT 1.6273586
## stock_returns_df$SP5005 CSCO 0.7112362
## stock_returns_df$SP5006 CVX 0.8324819
## stock_returns_df$SP5007 DD 1.3509575
## stock_returns_df$SP5008 DIS 1.4134352
## stock_returns_df$SP5009 GE 1.0565302
## stock_returns_df$SP50010 HD 0.8387551
## stock_returns_df$SP50011 HPQ 1.1244543
## stock_returns_df$SP50012 IBM 0.7905043
## stock_returns_df$SP50013 INTC 1.1496233
## stock_returns_df$SP50014 JNJ 0.6804785
## stock_returns_df$SP50015 JPM 0.6300871
## stock_returns_df$SP50016 KRFT 0.2189592
## stock_returns_df$SP50017 KO 0.6385589
## stock_returns_df$SP50018 MCD 0.6457669
## stock_returns_df$SP50019 MMM 1.0278481
## stock_returns_df$SP50020 MRK 0.5219022
## stock_returns_df$SP50021 MSFT 0.6762358
## stock_returns_df$SP50022 PFE 0.6965205
## stock_returns_df$SP50023 PG 0.2733775
## stock_returns_df$SP50024 T 0.7897055
## stock_returns_df$SP50025 TRV 0.8666328
## stock_returns_df$SP50026 UTX 1.0654107
## stock_returns_df$SP50027 VZ 0.8204556
## stock_returns_df$SP50028 WMT 0.4493817
## stock_returns_df$SP50029 XOM 1.1084912