Group 8: Stock Market Prediction

Team Members

Ankit (ankit02@student.gsu.edu)

Vamshidhar Reddy Kanamanthareddy (vkanamanthareddy1@student.gsu.edu)

Sai Charan Pappala (spappala1@student.gsu.edu)

Uday Kolli (ukolli1@student.gsu.edu)

Business context

Stock Market prices are very volatile and every investor is interested to know it’s future behavior so that they can invest and gain profits. Prediction and analysis of Stock prices are some of the most difficult jobs to complete. There can be various reasons for this like market fluctuations, news, customer sentiments, pandemic etc. We would like to consider some of these factors as our predictors and would like to come up with a prediction of stock price.

Problem description

In this project we are taking IBM’s intraday stock price for past 2 years with 15mins times interval and try to predict the close price at the end of 15 mins using various Fundamental/Economic/Technical indicators as our predictors

URL link to download the zip file of data collected from the API

The Data files are stored in github

Data preparation

Load Libraries

library(h2o)
library(tidyverse)
library(plyr)
library(httr)
library(jsonlite)
library(tidyverse)
library(stringr)
library(lubridate)
library(ggplot2)
library(skimr)
library(recipes)
library(plotly)
library(stringr)
library(knitr)
library(kableExtra)

options(scipen =999)
h2o.init(nthreads = -1)

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 days 40 minutes 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.36.0.3 
##     H2O cluster version age:    2 months and 5 days  
##     H2O cluster name:           H2O_started_from_R_Ankit_kij316 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   2.78 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.1.2 (2021-11-01)

Define some functions for pre-processing data

get_quater <- function(number){
  
  if (number ==1 | number==2 | number==3){
    return ("Q1")
  }
  else if(number ==4 | number==5 | number==6){
    return ("Q2")
  }
  else if(number ==7 | number==8 | number==9){
    return ("Q3")
  }
  else if(number ==10 | number==11 | number==12){
    return ("Q4")
  }
}

get_modified_quater <- function(number){
  if ( number==1 | number==2 | number==3){
    return ("Q2")
  }
  else if(number ==4 | number==5 | number==6){
    return ("Q3")
  }
  else if(number ==7 | number==8 | number==9){
    return ("Q4")
  }
  else if(number ==10 | number==11 | number==12){
    return ("Q1")
  }
}

get_modified_year <- function(number, year_val){
  if ( number==1 | number==2 | number==3){
    return (year_val)
  }
  else if(number ==4 | number==5 | number==6){
    return (year_val)
  }
  else if(number ==7 | number==8 | number==9){
    return (year_val)
  }
  else if(number ==10 | number==11 | number==12){
    return (year_val+1)
  }
}

Prepare master data for prediction

Read all data from files/folders

myfiles = list.files(path="./data/StockPrediction", pattern="^year", full.names=TRUE)
Primary_data_Intraday = ldply(myfiles, read_csv)
df_Income <- read.csv("./data/StockPrediction/Income.csv")
df_cs <- read.csv("./data/StockPrediction/Customer_Sentiment.csv")
df_rs <- read.csv("./data/StockPrediction/RETAIL_SALES.csv")

Prepare additional columns for data merge

# extracting time from timestamp 
Primary_data_Intraday<- Primary_data_Intraday %>%
  mutate(Date= format.Date(Primary_data_Intraday$time,"%Y/%m/%d"), 
         Time = format.Date(Primary_data_Intraday$time,"%H:%M:%S"),
         Year=format.Date(Primary_data_Intraday$time,"%Y")) 

# Calculated cloumn Quater for Intra-Day
Quater=unlist(map(month(as.POSIXlt(Primary_data_Intraday$Date, format="%Y/%m/%d")),get_quater))
Primary_data_Intraday=cbind(Primary_data_Intraday,Quater)

# Calculated column Quater for Income statement
Quater=unlist(map(month(as.POSIXlt(df_Income$fiscalDateEnding, format="%Y-%m-%d")),get_modified_quater))

df_Income=cbind(df_Income,Quater)

Year=unlist(map2(month(as.POSIXlt(df_Income$fiscalDateEnding, format="%Y-%m-%d")),
                 year(as.POSIXlt(df_Income$fiscalDateEnding, format="%Y-%m-%d")),
                 get_modified_year))

df_Income=cbind(df_Income,Year)

Merge all data

data = merge(x=Primary_data_Intraday, y=df_Income, by=c("Quater","Year"))

data$Date=as.Date(data$Date)

df_rs$date=as.Date(df_rs$date)
df_rs=df_rs %>% mutate(Year=format.Date(df_rs$date,"%Y"))
df_rs=df_rs %>% mutate(Month=format.Date(df_rs$date,"%m"))
df_cs=df_cs %>% mutate(Year=format.Date(df_cs$date,"%Y"))
df_cs=df_cs %>% mutate(Month=format.Date(df_cs$date,"%m"))

data=data %>% mutate(Month=format.Date(data$Date,"%m"))
data$Date=as.Date(data$Date)
data = merge(x=data, y=df_rs, by=c("Month","Year"))
data = merge(x=data, y=df_cs, by=c("Month","Year"))
data$interestIncome=as.numeric(data$interestIncome)
data$otherNonOperatingIncome=as.numeric(data$otherNonOperatingIncome)
data$CONSUMER_SENTIMENT_VALUE=as.numeric(data$CONSUMER_SENTIMENT_VALUE)

Clean data for training

data<- data %>% select(-c("Quater","Year","Date","Time","Month","date.x","date.y","fiscalDateEnding","reportedCurrency","nonInterestIncome","investmentIncomeNet"))

shift <- function(x, n) {   
  c(x[-(seq(n))], rep(NA, n))
}

data$shifted <- shift(data$close, 1)

#tail(data)

data <- na.omit(data)

write.csv(data, "./data/formatted_data.csv")

kbl(cbind(head(data))) %>%
  kable_paper() %>%
  scroll_box(width = "100%", height = "100%")

time	open	high	low	close	volume	grossProfit	totalRevenue	costOfRevenue	costofGoodsAndServicesSold	operatingIncome	sellingGeneralAndAdministrative	researchAndDevelopment	operatingExpenses	netInterestIncome	interestIncome	interestExpense	otherNonOperatingIncome	depreciation	depreciationAndAmortization	incomeBeforeTax	incomeTaxExpense	interestAndDebtExpense	netIncomeFromContinuingOperations	comprehensiveIncomeNetOfTax	ebit	ebitda	netIncome	RETAIL_SALES_VALUE	CONSUMER_SENTIMENT_VALUE	shifted
2021-01-27 15:00:00	111.6059	111.8037	111.4442	111.5071	471830	10523000000	20367000000	9844000000	181000000	-4000000000	7233000000	1611000000	2000000000	-317000000	15000000	317000000	-247000000	1089000000	610000000	1380000000	24000000	317000000	1264000000	603000000	1697000000	2307000000	1356000000	464362	79	109.9360
2021-01-27 09:45:00	109.7274	110.7925	109.3152	109.9360	1364954	10523000000	20367000000	9844000000	181000000	-4000000000	7233000000	1611000000	2000000000	-317000000	15000000	317000000	-247000000	1089000000	610000000	1380000000	24000000	317000000	1264000000	603000000	1697000000	2307000000	1356000000	464362	79	110.9858
2021-01-27 15:30:00	111.9475	111.9565	110.8599	110.9858	431226	10523000000	20367000000	9844000000	181000000	-4000000000	7233000000	1611000000	2000000000	-317000000	15000000	317000000	-247000000	1089000000	610000000	1380000000	24000000	317000000	1264000000	603000000	1697000000	2307000000	1356000000	464362	79	111.9475
2021-01-27 15:15:00	111.4891	111.9745	111.4263	111.9475	434875	10523000000	20367000000	9844000000	181000000	-4000000000	7233000000	1611000000	2000000000	-317000000	15000000	317000000	-247000000	1089000000	610000000	1380000000	24000000	317000000	1264000000	603000000	1697000000	2307000000	1356000000	464362	79	108.5320
2021-01-27 08:45:00	108.5320	108.5320	108.3972	108.5320	2090	10523000000	20367000000	9844000000	181000000	-4000000000	7233000000	1611000000	2000000000	-317000000	15000000	317000000	-247000000	1089000000	610000000	1380000000	24000000	317000000	1264000000	603000000	1697000000	2307000000	1356000000	464362	79	109.6735
2021-01-27 09:30:00	108.5230	109.7005	108.5230	109.6735	21911	10523000000	20367000000	9844000000	181000000	-4000000000	7233000000	1611000000	2000000000	-317000000	15000000	317000000	-247000000	1089000000	610000000	1380000000	24000000	317000000	1264000000	603000000	1697000000	2307000000	1356000000	464362	79	108.3163

Using H2O.ai AutoML model

Import data to H2O

data <- h2o.importFile("./data/formatted_data.csv")

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Data Summary

kbl(cbind(h2o.describe(data))) %>%
  kable_paper() %>%
  scroll_box(width = "100%", height = "200px")

Label	Type	Zeros	Min	Max	Mean	Sigma	Cardinality
C1	int	0	1.00000	20364.0000	10182.50000	5878.724777	NA
time	time	0	1587356100000.00000	1646077500000.0000	1616411363361.82007	17150549471.472900	NA
open	real	0	94.01986	140.1091	118.29767	10.964513	NA
high	real	0	94.21928	140.7784	118.44232	10.963563	NA
low	real	0	93.87805	139.8423	118.15019	10.965435	NA
close	real	0	94.01427	140.1140	118.29416	10.965058	NA
volume	int	3	0.00000	7953069.0000	113928.93218	188241.325092	NA
grossProfit	int	0	6107000000.00000	10523000000.0000	8499836083.28423	1082736537.395630	NA
totalRevenue	int	0	3257000000.00000	20367000000.0000	16945251718.71930	4343619426.664740	NA
costOfRevenue	int	0	-2849000000.00000	9844000000.0000	8445378020.03536	3507399354.926100	NA
costofGoodsAndServicesSold	int	0	49000000.00000	181000000.0000	92778530.74052	41680151.663085	NA
operatingIncome	int	0	-4000000000.00000	2358000000.0000	780602877.62718	2071471558.836250	NA
sellingGeneralAndAdministrative	int	0	3377000000.00000	7233000000.0000	5302524062.07032	990989430.185588	NA
researchAndDevelopment	int	0	1515000000.00000	1657000000.0000	1602136613.63190	41618733.021482	NA
operatingExpenses	int	0	2000000000.00000	8067000000.0000	5977057847.18130	1747060622.512220	NA
netInterestIncome	int	0	-326000000.00000	-280000000.0000	-305726772.73620	18265627.461575	NA
interestIncome	int	0	11000000.00000	51000000.0000	19210518.56217	11841711.077610	NA
interestExpense	int	0	280000000.00000	326000000.0000	305726772.73620	18265627.461575	NA
otherNonOperatingIncome	int	0	-362000000.00000	38000000.0000	-227293704.57670	100331276.734826	NA
depreciation	int	0	749000000.00000	1089000000.0000	1026708112.35514	88655565.046771	NA
depreciationAndAmortization	int	0	610000000.00000	647000000.0000	624363779.21823	11483727.907642	NA
incomeBeforeTax	int	0	-51000000.00000	2092000000.0000	1319668680.02357	581349140.735480	NA
incomeTaxExpense	int	0	-1226000000.00000	227000000.0000	-63550235.71008	437274344.446366	NA
interestAndDebtExpense	int	0	280000000.00000	326000000.0000	305726772.73620	18265627.461575	NA
netIncomeFromContinuingOperations	int	0	956000000.00000	1698000000.0000	1280331025.33883	207638786.871327	NA
comprehensiveIncomeNetOfTax	int	0	489000000.00000	5853000000.0000	1861170054.99902	1362260517.605360	NA
ebit	int	0	275000000.00000	2395000000.0000	1625395452.75977	580250944.499440	NA
ebitda	int	0	897000000.00000	3027000000.0000	2249759231.97800	580687555.364317	NA
netIncome	int	0	955000000.00000	2333000000.0000	1383218915.73365	361205191.271561	NA
RETAIL_SALES_VALUE	int	0	377210.00000	636842.0000	522876.22702	52332.565954	NA
CONSUMER_SENTIMENT_VALUE	real	0	62.80000	88.3000	76.18822	6.442037	NA
shifted	real	0	94.01427	140.1140	118.29437	10.964970	NA

Visualizing Raw Data

plot <- as.data.frame(data) %>% plot_ly(x = ~time, type="candlestick",
                      open = ~open, close = ~close,
                      high = ~high, low = ~low) 
plot <- plot %>% add_lines(x = ~time, y = ~open, line = list(color = 'black'), inherit = F)
plot <- plot %>% layout(showlegend = FALSE)

plot

Data Splitting

y <- "shifted"
x <- setdiff(names(data),y)

parts <- h2o.splitFrame(data, .80)

train <- parts[[1]]
test <- parts[[2]]

Modelling and Result

Modelling

# automodel <- h2o.automl(x, y, train, test, max_runtime_secs = 10)
model <- h2o.loadModel("StackedEnsemble_BestOfFamily_1_AutoML_11_20220421_175438")

model

## Model Details:
## ==============
## 
## H2ORegressionModel: stackedensemble
## Model ID:  StackedEnsemble_BestOfFamily_1_AutoML_11_20220421_175438 
## Number of Base Models: 2
## 
## Base Models (count by algorithm type):
## 
## gbm glm 
##   1   1 
## 
## Metalearner:
## 
## Metalearner algorithm: glm
## Metalearner cross-validation fold assignment:
##   Fold assignment scheme: AUTO
##   Number of folds: 5
##   Fold column: NULL
## Metalearner hyperparameters: 
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on training data. **
## 
## MSE:  0.892557
## RMSE:  0.9447523
## MAE:  0.2096082
## RMSLE:  0.008022957
## Mean Residual Deviance :  0.892557
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on validation data. **
## 
## MSE:  1.108056
## RMSE:  1.052642
## MAE:  0.2136033
## RMSLE:  0.008832665
## Mean Residual Deviance :  1.108056
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.848416
## RMSE:  0.921095
## MAE:  0.2167189
## RMSLE:  0.007873269
## Mean Residual Deviance :  0.848416

Prediction on test data

predictions <- h2o.predict(model, test)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Visualizing Actual vs predicted values

pred_val = unlist(as.list(predictions))
test_val = as.data.frame(test)
test_val = cbind(test_val, pred_val)
test_val = test_val[1:100,]
# plot_data <- as.data.frame(test)
ggplot(data=test_val, aes(x=test_val$time, y=test_val$shifted)) +
  # geom_bar(stat="identity", width=.9, fill="blue")+
  geom_line(aes(x=test_val$time, y=test_val$shifted), color="red")+
  geom_line(aes(x=test_val$time, y=test_val$pred_val), color="black") +
  ggtitle("Actual vs prediacted price of IBM")+
  xlab("time")+
  ylab("price")+
  scale_color_manual(labels = c("actual", "predicted"), values = c("red", "black"))