Team Members

Vamshidhar Reddy Kanamanthareddy ()

Sai Charan Pappala ()

Business context

Stock Market prices are very volatile and every investor is interested to know it’s future behavior so that they can invest and gain profits. Prediction and analysis of Stock prices are some of the most difficult jobs to complete. There can be various reasons for this like market fluctuations, news, customer sentiments, pandemic etc. We would like to consider some of these factors as our predictors and would like to come up with a prediction of stock price.

Problem description

In this project we are taking IBM’s intraday stock price for past 2 years with 15mins times interval and try to predict the close price at the end of 15 mins using various Fundamental/Economic/Technical indicators as our predictors

Data preparation

Load Libraries

library(h2o)
library(tidyverse)
library(plyr)
library(httr)
library(jsonlite)
library(tidyverse)
library(stringr)
library(lubridate)
library(ggplot2)
library(skimr)
library(recipes)
library(plotly)
library(stringr)
library(knitr)
library(kableExtra)

options(scipen =999)
h2o.init(nthreads = -1)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 days 40 minutes 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.36.0.3 
##     H2O cluster version age:    2 months and 5 days  
##     H2O cluster name:           H2O_started_from_R_Ankit_kij316 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   2.78 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.1.2 (2021-11-01)

Define some functions for pre-processing data

get_quater <- function(number){
  
  if (number ==1 | number==2 | number==3){
    return ("Q1")
  }
  else if(number ==4 | number==5 | number==6){
    return ("Q2")
  }
  else if(number ==7 | number==8 | number==9){
    return ("Q3")
  }
  else if(number ==10 | number==11 | number==12){
    return ("Q4")
  }
}

get_modified_quater <- function(number){
  if ( number==1 | number==2 | number==3){
    return ("Q2")
  }
  else if(number ==4 | number==5 | number==6){
    return ("Q3")
  }
  else if(number ==7 | number==8 | number==9){
    return ("Q4")
  }
  else if(number ==10 | number==11 | number==12){
    return ("Q1")
  }
}

get_modified_year <- function(number, year_val){
  if ( number==1 | number==2 | number==3){
    return (year_val)
  }
  else if(number ==4 | number==5 | number==6){
    return (year_val)
  }
  else if(number ==7 | number==8 | number==9){
    return (year_val)
  }
  else if(number ==10 | number==11 | number==12){
    return (year_val+1)
  }
}

Prepare master data for prediction

Read all data from files/folders

myfiles = list.files(path="./data/StockPrediction", pattern="^year", full.names=TRUE)
Primary_data_Intraday = ldply(myfiles, read_csv)
df_Income <- read.csv("./data/StockPrediction/Income.csv")
df_cs <- read.csv("./data/StockPrediction/Customer_Sentiment.csv")
df_rs <- read.csv("./data/StockPrediction/RETAIL_SALES.csv")

Prepare additional columns for data merge

# extracting time from timestamp 
Primary_data_Intraday<- Primary_data_Intraday %>%
  mutate(Date= format.Date(Primary_data_Intraday$time,"%Y/%m/%d"), 
         Time = format.Date(Primary_data_Intraday$time,"%H:%M:%S"),
         Year=format.Date(Primary_data_Intraday$time,"%Y")) 

# Calculated cloumn Quater for Intra-Day
Quater=unlist(map(month(as.POSIXlt(Primary_data_Intraday$Date, format="%Y/%m/%d")),get_quater))
Primary_data_Intraday=cbind(Primary_data_Intraday,Quater)

# Calculated column Quater for Income statement
Quater=unlist(map(month(as.POSIXlt(df_Income$fiscalDateEnding, format="%Y-%m-%d")),get_modified_quater))

df_Income=cbind(df_Income,Quater)

Year=unlist(map2(month(as.POSIXlt(df_Income$fiscalDateEnding, format="%Y-%m-%d")),
                 year(as.POSIXlt(df_Income$fiscalDateEnding, format="%Y-%m-%d")),
                 get_modified_year))

df_Income=cbind(df_Income,Year)

Merge all data

data = merge(x=Primary_data_Intraday, y=df_Income, by=c("Quater","Year"))

data$Date=as.Date(data$Date)

df_rs$date=as.Date(df_rs$date)
df_rs=df_rs %>% mutate(Year=format.Date(df_rs$date,"%Y"))
df_rs=df_rs %>% mutate(Month=format.Date(df_rs$date,"%m"))
df_cs=df_cs %>% mutate(Year=format.Date(df_cs$date,"%Y"))
df_cs=df_cs %>% mutate(Month=format.Date(df_cs$date,"%m"))

data=data %>% mutate(Month=format.Date(data$Date,"%m"))
data$Date=as.Date(data$Date)
data = merge(x=data, y=df_rs, by=c("Month","Year"))
data = merge(x=data, y=df_cs, by=c("Month","Year"))
data$interestIncome=as.numeric(data$interestIncome)
data$otherNonOperatingIncome=as.numeric(data$otherNonOperatingIncome)
data$CONSUMER_SENTIMENT_VALUE=as.numeric(data$CONSUMER_SENTIMENT_VALUE)

Clean data for training

data<- data %>% select(-c("Quater","Year","Date","Time","Month","date.x","date.y","fiscalDateEnding","reportedCurrency","nonInterestIncome","investmentIncomeNet"))

shift <- function(x, n) {   
  c(x[-(seq(n))], rep(NA, n))
}

data$shifted <- shift(data$close, 1)

#tail(data)

data <- na.omit(data)

write.csv(data, "./data/formatted_data.csv")

kbl(cbind(head(data))) %>%
  kable_paper() %>%
  scroll_box(width = "100%", height = "100%")
time open high low close volume grossProfit totalRevenue costOfRevenue costofGoodsAndServicesSold operatingIncome sellingGeneralAndAdministrative researchAndDevelopment operatingExpenses netInterestIncome interestIncome interestExpense otherNonOperatingIncome depreciation depreciationAndAmortization incomeBeforeTax incomeTaxExpense interestAndDebtExpense netIncomeFromContinuingOperations comprehensiveIncomeNetOfTax ebit ebitda netIncome RETAIL_SALES_VALUE CONSUMER_SENTIMENT_VALUE shifted
2021-01-27 15:00:00 111.6059 111.8037 111.4442 111.5071 471830 10523000000 20367000000 9844000000 181000000 -4000000000 7233000000 1611000000 2000000000 -317000000 15000000 317000000 -247000000 1089000000 610000000 1380000000 24000000 317000000 1264000000 603000000 1697000000 2307000000 1356000000 464362 79 109.9360
2021-01-27 09:45:00 109.7274 110.7925 109.3152 109.9360 1364954 10523000000 20367000000 9844000000 181000000 -4000000000 7233000000 1611000000 2000000000 -317000000 15000000 317000000 -247000000 1089000000 610000000 1380000000 24000000 317000000 1264000000 603000000 1697000000 2307000000 1356000000 464362 79 110.9858
2021-01-27 15:30:00 111.9475 111.9565 110.8599 110.9858 431226 10523000000 20367000000 9844000000 181000000 -4000000000 7233000000 1611000000 2000000000 -317000000 15000000 317000000 -247000000 1089000000 610000000 1380000000 24000000 317000000 1264000000 603000000 1697000000 2307000000 1356000000 464362 79 111.9475
2021-01-27 15:15:00 111.4891 111.9745 111.4263 111.9475 434875 10523000000 20367000000 9844000000 181000000 -4000000000 7233000000 1611000000 2000000000 -317000000 15000000 317000000 -247000000 1089000000 610000000 1380000000 24000000 317000000 1264000000 603000000 1697000000 2307000000 1356000000 464362 79 108.5320
2021-01-27 08:45:00 108.5320 108.5320 108.3972 108.5320 2090 10523000000 20367000000 9844000000 181000000 -4000000000 7233000000 1611000000 2000000000 -317000000 15000000 317000000 -247000000 1089000000 610000000 1380000000 24000000 317000000 1264000000 603000000 1697000000 2307000000 1356000000 464362 79 109.6735
2021-01-27 09:30:00 108.5230 109.7005 108.5230 109.6735 21911 10523000000 20367000000 9844000000 181000000 -4000000000 7233000000 1611000000 2000000000 -317000000 15000000 317000000 -247000000 1089000000 610000000 1380000000 24000000 317000000 1264000000 603000000 1697000000 2307000000 1356000000 464362 79 108.3163

Using H2O.ai AutoML model

Import data to H2O

data <- h2o.importFile("./data/formatted_data.csv")
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Data Summary

kbl(cbind(h2o.describe(data))) %>%
  kable_paper() %>%
  scroll_box(width = "100%", height = "200px")
Label Type Missing Zeros PosInf NegInf Min Max Mean Sigma Cardinality
C1 int 0 0 0 0 1.00000 20364.0000 10182.50000 5878.724777 NA
time time 0 0 0 0 1587356100000.00000 1646077500000.0000 1616411363361.82007 17150549471.472900 NA
open real 0 0 0 0 94.01986 140.1091 118.29767 10.964513 NA
high real 0 0 0 0 94.21928 140.7784 118.44232 10.963563 NA
low real 0 0 0 0 93.87805 139.8423 118.15019 10.965435 NA
close real 0 0 0 0 94.01427 140.1140 118.29416 10.965058 NA
volume int 0 3 0 0 0.00000 7953069.0000 113928.93218 188241.325092 NA
grossProfit int 0 0 0 0 6107000000.00000 10523000000.0000 8499836083.28423 1082736537.395630 NA
totalRevenue int 0 0 0 0 3257000000.00000 20367000000.0000 16945251718.71930 4343619426.664740 NA
costOfRevenue int 0 0 0 0 -2849000000.00000 9844000000.0000 8445378020.03536 3507399354.926100 NA
costofGoodsAndServicesSold int 0 0 0 0 49000000.00000 181000000.0000 92778530.74052 41680151.663085 NA
operatingIncome int 0 0 0 0 -4000000000.00000 2358000000.0000 780602877.62718 2071471558.836250 NA
sellingGeneralAndAdministrative int 0 0 0 0 3377000000.00000 7233000000.0000 5302524062.07032 990989430.185588 NA
researchAndDevelopment int 0 0 0 0 1515000000.00000 1657000000.0000 1602136613.63190 41618733.021482 NA
operatingExpenses int 0 0 0 0 2000000000.00000 8067000000.0000 5977057847.18130 1747060622.512220 NA
netInterestIncome int 0 0 0 0 -326000000.00000 -280000000.0000 -305726772.73620 18265627.461575 NA
interestIncome int 0 0 0 0 11000000.00000 51000000.0000 19210518.56217 11841711.077610 NA
interestExpense int 0 0 0 0 280000000.00000 326000000.0000 305726772.73620 18265627.461575 NA
otherNonOperatingIncome int 0 0 0 0 -362000000.00000 38000000.0000 -227293704.57670 100331276.734826 NA
depreciation int 0 0 0 0 749000000.00000 1089000000.0000 1026708112.35514 88655565.046771 NA
depreciationAndAmortization int 0 0 0 0 610000000.00000 647000000.0000 624363779.21823 11483727.907642 NA
incomeBeforeTax int 0 0 0 0 -51000000.00000 2092000000.0000 1319668680.02357 581349140.735480 NA
incomeTaxExpense int 0 0 0 0 -1226000000.00000 227000000.0000 -63550235.71008 437274344.446366 NA
interestAndDebtExpense int 0 0 0 0 280000000.00000 326000000.0000 305726772.73620 18265627.461575 NA
netIncomeFromContinuingOperations int 0 0 0 0 956000000.00000 1698000000.0000 1280331025.33883 207638786.871327 NA
comprehensiveIncomeNetOfTax int 0 0 0 0 489000000.00000 5853000000.0000 1861170054.99902 1362260517.605360 NA
ebit int 0 0 0 0 275000000.00000 2395000000.0000 1625395452.75977 580250944.499440 NA
ebitda int 0 0 0 0 897000000.00000 3027000000.0000 2249759231.97800 580687555.364317 NA
netIncome int 0 0 0 0 955000000.00000 2333000000.0000 1383218915.73365 361205191.271561 NA
RETAIL_SALES_VALUE int 0 0 0 0 377210.00000 636842.0000 522876.22702 52332.565954 NA
CONSUMER_SENTIMENT_VALUE real 0 0 0 0 62.80000 88.3000 76.18822 6.442037 NA
shifted real 0 0 0 0 94.01427 140.1140 118.29437 10.964970 NA

Visualizing Raw Data

plot <- as.data.frame(data) %>% plot_ly(x = ~time, type="candlestick",
                      open = ~open, close = ~close,
                      high = ~high, low = ~low) 
plot <- plot %>% add_lines(x = ~time, y = ~open, line = list(color = 'black'), inherit = F)
plot <- plot %>% layout(showlegend = FALSE)

plot

Data Splitting

y <- "shifted"
x <- setdiff(names(data),y)

parts <- h2o.splitFrame(data, .80)

train <- parts[[1]]
test <- parts[[2]]

Modelling and Result

Modelling

# automodel <- h2o.automl(x, y, train, test, max_runtime_secs = 10)
model <- h2o.loadModel("StackedEnsemble_BestOfFamily_1_AutoML_11_20220421_175438")

model
## Model Details:
## ==============
## 
## H2ORegressionModel: stackedensemble
## Model ID:  StackedEnsemble_BestOfFamily_1_AutoML_11_20220421_175438 
## Number of Base Models: 2
## 
## Base Models (count by algorithm type):
## 
## gbm glm 
##   1   1 
## 
## Metalearner:
## 
## Metalearner algorithm: glm
## Metalearner cross-validation fold assignment:
##   Fold assignment scheme: AUTO
##   Number of folds: 5
##   Fold column: NULL
## Metalearner hyperparameters: 
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on training data. **
## 
## MSE:  0.892557
## RMSE:  0.9447523
## MAE:  0.2096082
## RMSLE:  0.008022957
## Mean Residual Deviance :  0.892557
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on validation data. **
## 
## MSE:  1.108056
## RMSE:  1.052642
## MAE:  0.2136033
## RMSLE:  0.008832665
## Mean Residual Deviance :  1.108056
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.848416
## RMSE:  0.921095
## MAE:  0.2167189
## RMSLE:  0.007873269
## Mean Residual Deviance :  0.848416

Prediction on test data

predictions <- h2o.predict(model, test)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Visualizing Actual vs predicted values

pred_val = unlist(as.list(predictions))
test_val = as.data.frame(test)
test_val = cbind(test_val, pred_val)
test_val = test_val[1:100,]
# plot_data <- as.data.frame(test)
ggplot(data=test_val, aes(x=test_val$time, y=test_val$shifted)) +
  # geom_bar(stat="identity", width=.9, fill="blue")+
  geom_line(aes(x=test_val$time, y=test_val$shifted), color="red")+
  geom_line(aes(x=test_val$time, y=test_val$pred_val), color="black") +
  ggtitle("Actual vs prediacted price of IBM")+
  xlab("time")+
  ylab("price")+
  scale_color_manual(labels = c("actual", "predicted"), values = c("red", "black"))