Stock Market prices are very volatile and every investor is interested to know it’s future behavior so that they can invest and gain profits. Prediction and analysis of Stock prices are some of the most difficult jobs to complete. There can be various reasons for this like market fluctuations, news, customer sentiments, pandemic etc. We would like to consider some of these factors as our predictors and would like to come up with a prediction of stock price.
In this project we are taking IBM’s intraday stock price for past 2 years with 15mins times interval and try to predict the close price at the end of 15 mins using various Fundamental/Economic/Technical indicators as our predictors
The Data files are stored in github
Load Libraries
library(h2o)
library(tidyverse)
library(plyr)
library(httr)
library(jsonlite)
library(tidyverse)
library(stringr)
library(lubridate)
library(ggplot2)
library(skimr)
library(recipes)
library(plotly)
library(stringr)
library(knitr)
library(kableExtra)
options(scipen =999)
h2o.init(nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 days 40 minutes
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.36.0.3
## H2O cluster version age: 2 months and 5 days
## H2O cluster name: H2O_started_from_R_Ankit_kij316
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.78 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.1.2 (2021-11-01)
Define some functions for pre-processing data
get_quater <- function(number){
if (number ==1 | number==2 | number==3){
return ("Q1")
}
else if(number ==4 | number==5 | number==6){
return ("Q2")
}
else if(number ==7 | number==8 | number==9){
return ("Q3")
}
else if(number ==10 | number==11 | number==12){
return ("Q4")
}
}
get_modified_quater <- function(number){
if ( number==1 | number==2 | number==3){
return ("Q2")
}
else if(number ==4 | number==5 | number==6){
return ("Q3")
}
else if(number ==7 | number==8 | number==9){
return ("Q4")
}
else if(number ==10 | number==11 | number==12){
return ("Q1")
}
}
get_modified_year <- function(number, year_val){
if ( number==1 | number==2 | number==3){
return (year_val)
}
else if(number ==4 | number==5 | number==6){
return (year_val)
}
else if(number ==7 | number==8 | number==9){
return (year_val)
}
else if(number ==10 | number==11 | number==12){
return (year_val+1)
}
}
Read all data from files/folders
myfiles = list.files(path="./data/StockPrediction", pattern="^year", full.names=TRUE)
Primary_data_Intraday = ldply(myfiles, read_csv)
df_Income <- read.csv("./data/StockPrediction/Income.csv")
df_cs <- read.csv("./data/StockPrediction/Customer_Sentiment.csv")
df_rs <- read.csv("./data/StockPrediction/RETAIL_SALES.csv")
Prepare additional columns for data merge
# extracting time from timestamp
Primary_data_Intraday<- Primary_data_Intraday %>%
mutate(Date= format.Date(Primary_data_Intraday$time,"%Y/%m/%d"),
Time = format.Date(Primary_data_Intraday$time,"%H:%M:%S"),
Year=format.Date(Primary_data_Intraday$time,"%Y"))
# Calculated cloumn Quater for Intra-Day
Quater=unlist(map(month(as.POSIXlt(Primary_data_Intraday$Date, format="%Y/%m/%d")),get_quater))
Primary_data_Intraday=cbind(Primary_data_Intraday,Quater)
# Calculated column Quater for Income statement
Quater=unlist(map(month(as.POSIXlt(df_Income$fiscalDateEnding, format="%Y-%m-%d")),get_modified_quater))
df_Income=cbind(df_Income,Quater)
Year=unlist(map2(month(as.POSIXlt(df_Income$fiscalDateEnding, format="%Y-%m-%d")),
year(as.POSIXlt(df_Income$fiscalDateEnding, format="%Y-%m-%d")),
get_modified_year))
df_Income=cbind(df_Income,Year)
Merge all data
data = merge(x=Primary_data_Intraday, y=df_Income, by=c("Quater","Year"))
data$Date=as.Date(data$Date)
df_rs$date=as.Date(df_rs$date)
df_rs=df_rs %>% mutate(Year=format.Date(df_rs$date,"%Y"))
df_rs=df_rs %>% mutate(Month=format.Date(df_rs$date,"%m"))
df_cs=df_cs %>% mutate(Year=format.Date(df_cs$date,"%Y"))
df_cs=df_cs %>% mutate(Month=format.Date(df_cs$date,"%m"))
data=data %>% mutate(Month=format.Date(data$Date,"%m"))
data$Date=as.Date(data$Date)
data = merge(x=data, y=df_rs, by=c("Month","Year"))
data = merge(x=data, y=df_cs, by=c("Month","Year"))
data$interestIncome=as.numeric(data$interestIncome)
data$otherNonOperatingIncome=as.numeric(data$otherNonOperatingIncome)
data$CONSUMER_SENTIMENT_VALUE=as.numeric(data$CONSUMER_SENTIMENT_VALUE)
Clean data for training
data<- data %>% select(-c("Quater","Year","Date","Time","Month","date.x","date.y","fiscalDateEnding","reportedCurrency","nonInterestIncome","investmentIncomeNet"))
shift <- function(x, n) {
c(x[-(seq(n))], rep(NA, n))
}
data$shifted <- shift(data$close, 1)
#tail(data)
data <- na.omit(data)
write.csv(data, "./data/formatted_data.csv")
kbl(cbind(head(data))) %>%
kable_paper() %>%
scroll_box(width = "100%", height = "100%")
| time | open | high | low | close | volume | grossProfit | totalRevenue | costOfRevenue | costofGoodsAndServicesSold | operatingIncome | sellingGeneralAndAdministrative | researchAndDevelopment | operatingExpenses | netInterestIncome | interestIncome | interestExpense | otherNonOperatingIncome | depreciation | depreciationAndAmortization | incomeBeforeTax | incomeTaxExpense | interestAndDebtExpense | netIncomeFromContinuingOperations | comprehensiveIncomeNetOfTax | ebit | ebitda | netIncome | RETAIL_SALES_VALUE | CONSUMER_SENTIMENT_VALUE | shifted |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2021-01-27 15:00:00 | 111.6059 | 111.8037 | 111.4442 | 111.5071 | 471830 | 10523000000 | 20367000000 | 9844000000 | 181000000 | -4000000000 | 7233000000 | 1611000000 | 2000000000 | -317000000 | 15000000 | 317000000 | -247000000 | 1089000000 | 610000000 | 1380000000 | 24000000 | 317000000 | 1264000000 | 603000000 | 1697000000 | 2307000000 | 1356000000 | 464362 | 79 | 109.9360 |
| 2021-01-27 09:45:00 | 109.7274 | 110.7925 | 109.3152 | 109.9360 | 1364954 | 10523000000 | 20367000000 | 9844000000 | 181000000 | -4000000000 | 7233000000 | 1611000000 | 2000000000 | -317000000 | 15000000 | 317000000 | -247000000 | 1089000000 | 610000000 | 1380000000 | 24000000 | 317000000 | 1264000000 | 603000000 | 1697000000 | 2307000000 | 1356000000 | 464362 | 79 | 110.9858 |
| 2021-01-27 15:30:00 | 111.9475 | 111.9565 | 110.8599 | 110.9858 | 431226 | 10523000000 | 20367000000 | 9844000000 | 181000000 | -4000000000 | 7233000000 | 1611000000 | 2000000000 | -317000000 | 15000000 | 317000000 | -247000000 | 1089000000 | 610000000 | 1380000000 | 24000000 | 317000000 | 1264000000 | 603000000 | 1697000000 | 2307000000 | 1356000000 | 464362 | 79 | 111.9475 |
| 2021-01-27 15:15:00 | 111.4891 | 111.9745 | 111.4263 | 111.9475 | 434875 | 10523000000 | 20367000000 | 9844000000 | 181000000 | -4000000000 | 7233000000 | 1611000000 | 2000000000 | -317000000 | 15000000 | 317000000 | -247000000 | 1089000000 | 610000000 | 1380000000 | 24000000 | 317000000 | 1264000000 | 603000000 | 1697000000 | 2307000000 | 1356000000 | 464362 | 79 | 108.5320 |
| 2021-01-27 08:45:00 | 108.5320 | 108.5320 | 108.3972 | 108.5320 | 2090 | 10523000000 | 20367000000 | 9844000000 | 181000000 | -4000000000 | 7233000000 | 1611000000 | 2000000000 | -317000000 | 15000000 | 317000000 | -247000000 | 1089000000 | 610000000 | 1380000000 | 24000000 | 317000000 | 1264000000 | 603000000 | 1697000000 | 2307000000 | 1356000000 | 464362 | 79 | 109.6735 |
| 2021-01-27 09:30:00 | 108.5230 | 109.7005 | 108.5230 | 109.6735 | 21911 | 10523000000 | 20367000000 | 9844000000 | 181000000 | -4000000000 | 7233000000 | 1611000000 | 2000000000 | -317000000 | 15000000 | 317000000 | -247000000 | 1089000000 | 610000000 | 1380000000 | 24000000 | 317000000 | 1264000000 | 603000000 | 1697000000 | 2307000000 | 1356000000 | 464362 | 79 | 108.3163 |
Import data to H2O
data <- h2o.importFile("./data/formatted_data.csv")
##
|
| | 0%
|
|======================================================================| 100%
Data Summary
kbl(cbind(h2o.describe(data))) %>%
kable_paper() %>%
scroll_box(width = "100%", height = "200px")
| Label | Type | Missing | Zeros | PosInf | NegInf | Min | Max | Mean | Sigma | Cardinality |
|---|---|---|---|---|---|---|---|---|---|---|
| C1 | int | 0 | 0 | 0 | 0 | 1.00000 | 20364.0000 | 10182.50000 | 5878.724777 | NA |
| time | time | 0 | 0 | 0 | 0 | 1587356100000.00000 | 1646077500000.0000 | 1616411363361.82007 | 17150549471.472900 | NA |
| open | real | 0 | 0 | 0 | 0 | 94.01986 | 140.1091 | 118.29767 | 10.964513 | NA |
| high | real | 0 | 0 | 0 | 0 | 94.21928 | 140.7784 | 118.44232 | 10.963563 | NA |
| low | real | 0 | 0 | 0 | 0 | 93.87805 | 139.8423 | 118.15019 | 10.965435 | NA |
| close | real | 0 | 0 | 0 | 0 | 94.01427 | 140.1140 | 118.29416 | 10.965058 | NA |
| volume | int | 0 | 3 | 0 | 0 | 0.00000 | 7953069.0000 | 113928.93218 | 188241.325092 | NA |
| grossProfit | int | 0 | 0 | 0 | 0 | 6107000000.00000 | 10523000000.0000 | 8499836083.28423 | 1082736537.395630 | NA |
| totalRevenue | int | 0 | 0 | 0 | 0 | 3257000000.00000 | 20367000000.0000 | 16945251718.71930 | 4343619426.664740 | NA |
| costOfRevenue | int | 0 | 0 | 0 | 0 | -2849000000.00000 | 9844000000.0000 | 8445378020.03536 | 3507399354.926100 | NA |
| costofGoodsAndServicesSold | int | 0 | 0 | 0 | 0 | 49000000.00000 | 181000000.0000 | 92778530.74052 | 41680151.663085 | NA |
| operatingIncome | int | 0 | 0 | 0 | 0 | -4000000000.00000 | 2358000000.0000 | 780602877.62718 | 2071471558.836250 | NA |
| sellingGeneralAndAdministrative | int | 0 | 0 | 0 | 0 | 3377000000.00000 | 7233000000.0000 | 5302524062.07032 | 990989430.185588 | NA |
| researchAndDevelopment | int | 0 | 0 | 0 | 0 | 1515000000.00000 | 1657000000.0000 | 1602136613.63190 | 41618733.021482 | NA |
| operatingExpenses | int | 0 | 0 | 0 | 0 | 2000000000.00000 | 8067000000.0000 | 5977057847.18130 | 1747060622.512220 | NA |
| netInterestIncome | int | 0 | 0 | 0 | 0 | -326000000.00000 | -280000000.0000 | -305726772.73620 | 18265627.461575 | NA |
| interestIncome | int | 0 | 0 | 0 | 0 | 11000000.00000 | 51000000.0000 | 19210518.56217 | 11841711.077610 | NA |
| interestExpense | int | 0 | 0 | 0 | 0 | 280000000.00000 | 326000000.0000 | 305726772.73620 | 18265627.461575 | NA |
| otherNonOperatingIncome | int | 0 | 0 | 0 | 0 | -362000000.00000 | 38000000.0000 | -227293704.57670 | 100331276.734826 | NA |
| depreciation | int | 0 | 0 | 0 | 0 | 749000000.00000 | 1089000000.0000 | 1026708112.35514 | 88655565.046771 | NA |
| depreciationAndAmortization | int | 0 | 0 | 0 | 0 | 610000000.00000 | 647000000.0000 | 624363779.21823 | 11483727.907642 | NA |
| incomeBeforeTax | int | 0 | 0 | 0 | 0 | -51000000.00000 | 2092000000.0000 | 1319668680.02357 | 581349140.735480 | NA |
| incomeTaxExpense | int | 0 | 0 | 0 | 0 | -1226000000.00000 | 227000000.0000 | -63550235.71008 | 437274344.446366 | NA |
| interestAndDebtExpense | int | 0 | 0 | 0 | 0 | 280000000.00000 | 326000000.0000 | 305726772.73620 | 18265627.461575 | NA |
| netIncomeFromContinuingOperations | int | 0 | 0 | 0 | 0 | 956000000.00000 | 1698000000.0000 | 1280331025.33883 | 207638786.871327 | NA |
| comprehensiveIncomeNetOfTax | int | 0 | 0 | 0 | 0 | 489000000.00000 | 5853000000.0000 | 1861170054.99902 | 1362260517.605360 | NA |
| ebit | int | 0 | 0 | 0 | 0 | 275000000.00000 | 2395000000.0000 | 1625395452.75977 | 580250944.499440 | NA |
| ebitda | int | 0 | 0 | 0 | 0 | 897000000.00000 | 3027000000.0000 | 2249759231.97800 | 580687555.364317 | NA |
| netIncome | int | 0 | 0 | 0 | 0 | 955000000.00000 | 2333000000.0000 | 1383218915.73365 | 361205191.271561 | NA |
| RETAIL_SALES_VALUE | int | 0 | 0 | 0 | 0 | 377210.00000 | 636842.0000 | 522876.22702 | 52332.565954 | NA |
| CONSUMER_SENTIMENT_VALUE | real | 0 | 0 | 0 | 0 | 62.80000 | 88.3000 | 76.18822 | 6.442037 | NA |
| shifted | real | 0 | 0 | 0 | 0 | 94.01427 | 140.1140 | 118.29437 | 10.964970 | NA |
Visualizing Raw Data
plot <- as.data.frame(data) %>% plot_ly(x = ~time, type="candlestick",
open = ~open, close = ~close,
high = ~high, low = ~low)
plot <- plot %>% add_lines(x = ~time, y = ~open, line = list(color = 'black'), inherit = F)
plot <- plot %>% layout(showlegend = FALSE)
plot
Data Splitting
y <- "shifted"
x <- setdiff(names(data),y)
parts <- h2o.splitFrame(data, .80)
train <- parts[[1]]
test <- parts[[2]]
Modelling
# automodel <- h2o.automl(x, y, train, test, max_runtime_secs = 10)
model <- h2o.loadModel("StackedEnsemble_BestOfFamily_1_AutoML_11_20220421_175438")
model
## Model Details:
## ==============
##
## H2ORegressionModel: stackedensemble
## Model ID: StackedEnsemble_BestOfFamily_1_AutoML_11_20220421_175438
## Number of Base Models: 2
##
## Base Models (count by algorithm type):
##
## gbm glm
## 1 1
##
## Metalearner:
##
## Metalearner algorithm: glm
## Metalearner cross-validation fold assignment:
## Fold assignment scheme: AUTO
## Number of folds: 5
## Fold column: NULL
## Metalearner hyperparameters:
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 0.892557
## RMSE: 0.9447523
## MAE: 0.2096082
## RMSLE: 0.008022957
## Mean Residual Deviance : 0.892557
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on validation data. **
##
## MSE: 1.108056
## RMSE: 1.052642
## MAE: 0.2136033
## RMSLE: 0.008832665
## Mean Residual Deviance : 1.108056
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.848416
## RMSE: 0.921095
## MAE: 0.2167189
## RMSLE: 0.007873269
## Mean Residual Deviance : 0.848416
Prediction on test data
predictions <- h2o.predict(model, test)
##
|
| | 0%
|
|======================================================================| 100%
Visualizing Actual vs predicted values
pred_val = unlist(as.list(predictions))
test_val = as.data.frame(test)
test_val = cbind(test_val, pred_val)
test_val = test_val[1:100,]
# plot_data <- as.data.frame(test)
ggplot(data=test_val, aes(x=test_val$time, y=test_val$shifted)) +
# geom_bar(stat="identity", width=.9, fill="blue")+
geom_line(aes(x=test_val$time, y=test_val$shifted), color="red")+
geom_line(aes(x=test_val$time, y=test_val$pred_val), color="black") +
ggtitle("Actual vs prediacted price of IBM")+
xlab("time")+
ylab("price")+
scale_color_manual(labels = c("actual", "predicted"), values = c("red", "black"))