Introduction

In the first part of the project-“Analyzing Stocks”, we analyzed all the data and chose only those explanatory variables(indicators) that could be significant to our model. In this part we will go a step ahead using Timeseries forecasting. Lets describe the steps we will be following:

  1. The data is available for 20 unique dates(quarterly). So we will build a linear regression model each for first 15 quarters.
  2. Now we have 15 sets of coefficients for each 15 dates, we will use ARIMA model to forecast the coefficients for the next 4 quarters
  3. Using these forecasted coefficients, we will predict the Price returns of stocks for the last 4 quarters
  4. Next divide the data into 5 buckets ordered by the actual price returns and the forecasted price returns
  5. Take mean of actual and forecasted price returns and compare the results. This will show the performance of our model.

Computing Price Returns

Price returns is the ratio of price of a stock the previous day and the next day

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Indicators
ind1 <- c("ticker", "calendardate","price","marketcap","sps","eps","ncff","ncf",
          "accoci","ncfcommon","bvps","de","pe1","ps1","netmargin","ncfi","capex",
          "currentratio", "epsdil","fcfps", "ncfx","pb","tbvps")
data_ind1<- data_csv[ind1]
data_ind1 <-na.omit(data_ind1)
dim(data_ind1)
## [1] 61502    23
# *******Computing Price returns**********************
data_ind1<-data_ind1[order(data_ind1$ticker, data_ind1$calendardate),]
# adding a new cloumn for log price ratio
data_ind1["PriceReturn"] <- NA

records<-nrow(data_ind1)
for (i in 2:records) {
  if(identical(data_ind1$ticker[i],data_ind1$ticker[i-1])){ # if ticker is same
    
    data_ind1$PriceReturn[i]<-(data_ind1$price[i]/data_ind1$price[i-1])# calculate the log returns
    
  } else {
    data_ind1$PriceReturn[i]<-0
  }
  
  
}
data_ind1$PriceReturn[1]<-0

Removing outliers

#### Data Cleaning, remove outliers #######
# 1. marketcap
data_ind1 <-subset(data_ind1, marketcap < 150000000000)
#boxplot(data_ind1$marketcap)

# 2 sps 
data_ind1 <-subset(data_ind1, sps < 800)
data_ind1 <-subset(data_ind1, sps > -100)
#boxplot(data_ind1$sps)

# 3 eps
data_ind1 <-subset(data_ind1, eps < 10)
data_ind1 <-subset(data_ind1, eps > -10)
#boxplot(data_ind1$eps)

# 4 ncff
data_ind1 <-subset(data_ind1, ncff < 1000000000)
data_ind1 <-subset(data_ind1, ncff > -1000000000)
#boxplot(data_ind1$ncff)

#5 ncf can remove later
data_ind1 <-subset(data_ind1, ncf < 700000000)
data_ind1 <-subset(data_ind1, ncf > -1000000000)
#boxplot(data_ind1$ncf)

#6 accoci can remove later
data_ind1 <-subset(data_ind1, accoci < 1000000000)
data_ind1 <-subset(data_ind1, accoci > -2000000000)
#boxplot(data_ind1$accoci)

#7 ncfcommon lot of outliers still
data_ind1 <-subset(data_ind1, ncfcommon < 400000000)
data_ind1 <-subset(data_ind1, ncfcommon > -300000000)
#boxplot(data_ind1$ncfcommon)

# 8 bvps
data_ind1 <-subset(data_ind1, bvps < 200)
#boxplot(data_ind1$bvps)

#9 de
data_ind1 <-subset(data_ind1, de < 100)
data_ind1 <-subset(data_ind1, de > -100)
#boxplot(data_ind1$de)

#10 pe1
data_ind1 <-subset(data_ind1, pe1 < 1500)
data_ind1 <-subset(data_ind1, pe1 > -1500)
#boxplot(data_ind1$pe1)

#11 ps1
data_ind1 <-subset(data_ind1, ps1 < 1000)
data_ind1 <-subset(data_ind1, ps1 > -1000)
#boxplot(data_ind1$ps1)

# 12 netmargin
data_ind1 <-subset(data_ind1, netmargin <  40)
data_ind1 <-subset(data_ind1, netmargin > -80)
#boxplot(data_ind1$netmargin)

# 13 ncfi
data_ind1 <-subset(data_ind1, ncfi <  1000000000)
data_ind1 <-subset(data_ind1, ncfi > -2000000000)
#boxplot(data_ind1$ncfi)

# 14 capex
data_ind1 <-subset(data_ind1, capex > -900000000)
data_ind1 <-subset(data_ind1, capex < 200000000)
#boxplot(data_ind1$capex)

# 15 currentratio
data_ind1 <-subset(data_ind1, currentratio < 40)
#boxplot(data_ind1$currentratio)

# 16 epsdil
data_ind1 <-subset(data_ind1, epsdil < 20)
#boxplot(data_ind1$epsdil)

# 17 fcfps
data_ind1 <-subset(data_ind1, fcfps < 20)
data_ind1 <-subset(data_ind1, fcfps > -20)
#boxplot(data_ind1$fcfps)

# 18 ncfx
data_ind1 <-subset(data_ind1, ncfx < 9000000)
data_ind1 <-subset(data_ind1, ncfx > -9000000)
#boxplot(data_ind1$ncfx)

# 19 pb
data_ind1 <-subset(data_ind1, pb < 200)
data_ind1 <-subset(data_ind1, pb > -150)
#boxplot(data_ind1$pb)

#20 tbvps
data_ind1 <-subset(data_ind1, tbvps < 300)
#boxplot(data_ind1$tbvps)

Normalizing the data

#************ Normalize data***************
dfNormZ <- as.data.frame( scale(data_ind1[3:23] ))
dfNormZ<-cbind(dfNormZ,data_ind1$ticker)
dfNormZ<-cbind(dfNormZ,data_ind1$calendardate)
dfNormZ<-cbind(dfNormZ,data_ind1$PriceReturn)

#naming the normalized variable
colnames(dfNormZ)[22] <- "ticker"
colnames(dfNormZ)[23] <- "calenderdate"
colnames(dfNormZ)[24] <- "PriceReturn"

Building the model- for Universe(all the stocks)

Building model for 15 quarters(15 models in total)

#***************** regression model****************************

dfNormZ<-subset(dfNormZ,PriceReturn!=0)
dfNormZ<-subset(dfNormZ,PriceReturn!=Inf)
dfNormZ<-dfNormZ[complete.cases(dfNormZ), ]
dfNormZ <-na.omit(dfNormZ)

cal_date<-unique(dfNormZ$calenderdate)
l<-length(cal_date)
l<-l-5
reg_model <- list()
reg_summary <- list()
betas<-NULL
for (i in 1:l){
  data_set<-subset(dfNormZ,calenderdate==cal_date[i])
  

  #Building a linear regression model
  f<-lm(data_set$PriceReturn~marketcap+sps+eps+ncff+ncf+accoci
        +ncfcommon+bvps+de+pe1+ps1+netmargin+ncfx+pb
        +ncfi+capex+currentratio+epsdil+fcfps+tbvps,data=data_set)
  
  reg_model[[i]] <- f
  reg_summary[[i]] <- summary(f)
  betas<- rbind(betas,reg_model[[i]]$coefficients)
  # Get all the co-efficients of the regression model in a data frame
}

Forecasting-ARIMA model

Using the 15 sets of coefficients forecast the next 4 sets of coefficients

###***** Time Series for the Universe ********
colnames(betas) <- c("beta0","beta1","beta2","beta3","beta4","beta5","beta6","beta7",
                     "beta8","beta9","beta10","beta11","beta12","beta13","beta14",
                     "beta15","beta16","beta17","beta18","beta19","beta20")


#**** ARIMA model*******

data_ts<- dfNormZ[order(dfNormZ$calenderdate),]
cal_date<-unique(data_ts$calenderdate)

# Add dates in the betas(coefficients) data frame
betas<-as.data.frame(betas)

betas$date <- cal_date[1:15]
betas$date <- as.Date(betas$date, format = "%Y-%m-%d")

# forecast the betas for 16 through 19 dates
for (j in 1:21){
  for (i in 16:20){
     coeff <- ts(betas[,j], start = c(2011,03), end = c(2014,09), frequency = 15+(i-15))
    fit <- arima(coeff, order = c(1,0,2))
    pr <- predict(fit,n.ahead = 1)
    betas[i, j] <- as.numeric(pr$pred) # append the forecasted betas(16-20)
    
  }
}

Subset the data for the last 4 dates

# subset the data and get last 4 dates(16 to 19) data in data_ts
cal_date<-as.data.frame(cal_date[16:19])
data_ts<-sqldf('select * from data_ts where calenderdate in cal_date')
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
for (i in 16:19){
  betas[i,22] <- cal_date[i-15,1]
}

Forecast the Price Returns

Using the forecasted the coefficients we got from ARIMA model, we will forecast the price returns for the last 4 dates; First subsetting the data for last 4 dates

# forecast the return price for each date
data_ts$calenderdate <- as.Date(data_ts$calenderdate, format = "%Y-%m-%d") 


# Here we took 4 df(set1,set2,set3,set4) 
set1<-subset(data_ts,data_ts$calenderdate == "2014-12-31")
set2<-subset(data_ts,data_ts$calenderdate == "2015-03-31")
set3<-subset(data_ts,data_ts$calenderdate == "2015-06-30")
set4<-subset(data_ts,data_ts$calenderdate == "2015-09-30")

n1<-nrow(set1)
n2<-nrow(set2)
n3<-nrow(set3)
n4<-nrow(set4)

Now Predicting price returns using forecasted betas for each date(16-19)

# Predicting price returns using forecasted betas for each date(16-19)
gg<-0
set1$for_price<-NA
for(i in 1:n1){
  for(j in 2:21){
    target<- set1[i,j]*betas[16,j]
    target = target + gg 
  }
   set1[i,25] <- target + betas[16,1]
}


set1 <-set1[order(set1$PriceReturn),]
set2$for_price<-NA
for(i in 1:n2){
  for(j in 2:21){
    target<- set2[i,j]*betas[17,j]
    target = target + gg 
  }
  set2[i,25] <- target + betas[17,1]
}


set3$for_price<-NA
for(i in 1:n3){
  for(j in 2:21){
    target<- set3[i,j]*betas[18,j]
    target = target + gg 
  }
  set3[i,25] <- target + betas[18,1]
}


set4$for_price<-NA
for(i in 1:n4){
  for(j in 2:21){
    target<- set4[i,j]*betas[19,j]
    target = target + gg 
  }
  set4[i,25] <- target + betas[19,1]
}

Combine the results in a single data frame

# Combining the data in a single data frame
org_data<-rbind(set1,set2,set3,set4)


n_row<-nrow(org_data)
comp1<-org_data[order(org_data$PriceReturn),] # order by original price returns
comp2<-org_data[order(org_data$for_price),] # order by forecasted price returns

Creating the buckets based on Price returns

# create 5 buckets for price return
new_df<-NULL
new_df$meanPriceReturn<-0
new_df$meanfor_return<-0

# diving into 5 equal buckets
f1<-1
f2<-2066
for(i in 1:5){
  new_df$meanPriceReturn[i]<-mean(comp1[f1:f2,24])
  new_df$meanfor_return[i]<-mean(comp2[f1:f2,24])
  f1<- f2
  f2<- f2+2066 
  
}

# Actual and forecasted returns for the Universe
names(new_df) <- c("ActualReturns","Forecasted Returns")
new_df<-as.data.frame(new_df)
knitr::kable(new_df)
ActualReturns Forecasted.Returns
0.6616758 0.9772242
0.8871990 0.9738437
0.9819374 0.9970131
1.0679350 1.0148454
1.3158568 0.9592560

Building the model for restricted sector- Information Sector

In this part of the project we will repeat the same process as above but this time we will restrict our stocks to a particular sector. We will consider stocks belonging to only Information sector

### ******** Sector wise************** ####

tic <- read.csv('/Users/neha/Documents/Predictive/tic.csv')
tic<-tic[order(tic$Sector),]
t = sqldf("select ticker, sector from tic where sector = 'Information Technology'")

# get all the data in information technology sector 
colnames(t)[1] <- "ticker"
sect_wise <-merge(dfNormZ,t, by = "ticker")

Building the regression model

Building the model each for 15 quarters

# Run a linear regression model for Information Technology sector
cal_date1<-unique(sect_wise$calenderdate)
l<-length(cal_date1)
l<-l-5
reg_model1 <- list()
reg_summary1 <- list()
betas1<-NULL
for (i in 1:l){
  data_set1<-subset(sect_wise,calenderdate==cal_date1[i])
  
  #Building a linear regression model
  f_sect <- lm(log(data_set1$PriceReturn)~marketcap+sps+eps+ncff+ncf+accoci
               +ncfcommon+bvps+de+pe1+ps1+netmargin+ncfx+pb
               +ncfi+capex+currentratio+epsdil+fcfps+tbvps ,data=data_set1)
  
  
  reg_model1[[i]] <- f_sect
  reg_summary1[[i]] <- summary(f_sect)
 # cat("Summary for Model", i)
 # print(summary(f_sect))
  betas1<- rbind(betas1,reg_model1[[i]]$coefficients)
}
# combbing the coefficients
colnames(betas1) <- c("beta0","beta1","beta2","beta3","beta4","beta5","beta6","beta7",
                     "beta8","beta9","beta10","beta11","beta12","beta13","beta14",
                     "beta15","beta16","beta17","beta18","beta19","beta20")

Forecasting - ARIMA model

#**** ARIMA model*******

data_ts1<- sect_wise[order(sect_wise$calenderdate),]
cal_date<-unique(data_ts1$calenderdate)

# Add dates in the betas(coefficients) data frame
betas1<-as.data.frame(betas1)

betas1$date <- cal_date[1:15]
betas1$date <- as.Date(betas1$date, format = "%Y-%m-%d")

# forecast the betas for 16 through 19 dates
for (j in 1:21){
  for (i in 16:20){
    
    coeff1 <- ts(betas1[,j], start = c(2011,03), end = c(2014,09), frequency = 15+(i-15))
    #  print(betas[,j])
    fit1 <- arima(coeff1, order = c(1,0,2))
    pr1 <- predict(fit1,n.ahead = 1)
    betas1[i, j] <- as.numeric(pr1$pred) # append the forecasted betas(16-20)
    
  }
}
# subset the data and get last 4 dates data in data_ts
cal_date<-as.data.frame(cal_date[16:19])
data_ts1<-sqldf('select * from data_ts1 where calenderdate in cal_date')

for (i in 16:19){
  betas1[i,22] <- cal_date[i-15,1]
}

Forecast the Price Returns

Using the forecasted the coefficients we got from ARIMA model, we will forecast the price returns for the last 4 dates; First subsetting the data for last 4 dates

# forecast the return price
data_ts1$calenderdate <- as.Date(data_ts1$calenderdate, format = "%Y-%m-%d") 


set11<-subset(data_ts1[,2:25],data_ts1$calenderdate == "2014-12-31")
set21<-subset(data_ts1[,2:25],data_ts1$calenderdate == "2015-03-31")
set31<-subset(data_ts1[,2:25],data_ts1$calenderdate == "2015-06-30")
set41<-subset(data_ts1[,2:25],data_ts1$calenderdate == "2015-09-30")

n1<-nrow(set11)
n2<-nrow(set21)
n3<-nrow(set31)
n4<-nrow(set41)

Now predicting price returns using forecasted betas for each date(16-19)

gg<-0
set11$for_price1<-NA
for(i in 1:n1){
  for(j in 2:21){
    target1<- set11[i,j]*betas1[16,j]
    target1 = target1 + gg 
  }
  set11[i,25] <- target1 + betas1[16,1]
}
#View(set11)
gg<-0
#set1 <-set1[order(set1$PriceReturn),]
set21$for_price1<-NA
for(i in 1:n2){
  for(j in 2:21){
    target1<- set21[i,j]*betas1[17,j]
    target1 = target1 + gg 
  }
  set21[i,25] <- target1 + betas1[17,1]
}
#View(set21)

gg<-0
set31$for_price1<-NA
for(i in 1:n3){
  for(j in 2:21){
    target1<- set31[i,j]*betas1[18,j]
    target1 = target1 + gg 
  }
  set31[i,25] <- target1 + betas1[18,1]
}
#View(set31)
gg<-0
set41$for_price1<-NA
for(i in 1:n4){
  for(j in 2:21){
    target1<- set41[i,j]*betas1[19,j]
    target1 = target1 + gg 
  }
  set41[i,25] <- target1 + betas1[19,1]
}
#View(set41)
org_data1<-rbind(set11,set21,set31,set41)

Combining the data in a single data frame

n_row<-nrow(org_data1)
comp11<-org_data1[order(org_data1$PriceReturn),]
comp21<-org_data1[order(org_data1$for_price1),]

Creating the buckets based on Price returns

# create 5 buckets for price return
new_df1<-NULL
new_df1$meanPriceReturn<-0
new_df1$meanForcasted<-0

# Dividing the data into 5 equal buckets
f1<-1
f2<-185
for(i in 1:5){
  new_df1$meanPriceReturn[i]<-mean(comp11[f1:f2,23])
  new_df1$meanForcasted[i]<-mean(comp21[f1:f2,25])
  f1<- f2
  f2<- f2+185
 
}
# Actual and forecasted returns for Information Technology
names(new_df1) <- c("ActualReturns","Forecasted Returns")
new_df1<-as.data.frame(new_df1)
knitr::kable(new_df1)
ActualReturns Forecasted.Returns
0.7759202 0.0180397
0.9282601 0.0278704
1.0155221 0.0298581
1.1043328 0.0322246
1.2813537 0.0360007

Conclusion

  1. The output shows that the model forecasted extremely well for the universe, the difference between the actual and forecasted returns were minimal
  2. For sector wise the model results were not very robust.

  3. Going forward using neural net can be used to improve the results for sectors or overall, as it does not assume only linear relationships like regression models.