#LOAD REQUIRED LIBRARIES
library(tidyverse)
library(ggplot2)
library(zoo)
library(corrplot)
library(Metrics)
#Set working directory to project folder
setwd("C:/Users/justine/Desktop/Midterm")
# Get all CSV files from data folder
mydatasets <- list.files(path = "data",pattern = "*.csv",full.names = TRUE)
# Display all file paths
mydatasets
## [1] "data/ADANIPORTS.csv" "data/ASIANPAINT.csv"
## [3] "data/AXISBANK.csv" "data/BAJAJ-AUTO.csv"
## [5] "data/BAJAJFINSV.csv" "data/BAJFINANCE.csv"
## [7] "data/BHARTIARTL.csv" "data/BPCL.csv"
## [9] "data/BRITANNIA.csv" "data/CIPLA.csv"
## [11] "data/COALINDIA.csv" "data/DRREDDY.csv"
## [13] "data/EICHERMOT.csv" "data/GAIL.csv"
## [15] "data/GRASIM.csv" "data/HCLTECH.csv"
## [17] "data/HDFC.csv" "data/HDFCBANK.csv"
## [19] "data/HEROMOTOCO.csv" "data/HINDALCO.csv"
## [21] "data/HINDUNILVR.csv" "data/ICICIBANK.csv"
## [23] "data/INDUSINDBK.csv" "data/INFRATEL.csv"
## [25] "data/INFY.csv" "data/IOC.csv"
## [27] "data/ITC.csv" "data/JSWSTEEL.csv"
## [29] "data/KOTAKBANK.csv" "data/LT.csv"
## [31] "data/MARUTI.csv" "data/MM.csv"
## [33] "data/NESTLEIND.csv" "data/NIFTY50_all.csv"
## [35] "data/NTPC.csv" "data/ONGC.csv"
## [37] "data/POWERGRID.csv" "data/RELIANCE.csv"
## [39] "data/SBIN.csv" "data/SHREECEM.csv"
## [41] "data/stock_metadata.csv" "data/SUNPHARMA.csv"
## [43] "data/TATAMOTORS.csv" "data/TATASTEEL.csv"
## [45] "data/TCS.csv" "data/TECHM.csv"
## [47] "data/TITAN.csv" "data/ULTRACEMCO.csv"
## [49] "data/UPL.csv" "data/VEDL.csv"
## [51] "data/WIPRO.csv" "data/ZEEL.csv"
##LOAD THE DATASETS
#Read all csv files into a list
stock_market_data<- lapply(mydatasets, read.csv)
##ASSIGN NAMES TO DATASETS
# Remove .csv extension from file names
names(stock_market_data) <- gsub(
".csv",
"",
basename(mydatasets)
)
# Display dataset names
names(stock_market_data)
## [1] "ADANIPORTS" "ASIANPAINT" "AXISBANK" "BAJAJ-AUTO"
## [5] "BAJAJFINSV" "BAJFINANCE" "BHARTIARTL" "BPCL"
## [9] "BRITANNIA" "CIPLA" "COALINDIA" "DRREDDY"
## [13] "EICHERMOT" "GAIL" "GRASIM" "HCLTECH"
## [17] "HDFC" "HDFCBANK" "HEROMOTOCO" "HINDALCO"
## [21] "HINDUNILVR" "ICICIBANK" "INDUSINDBK" "INFRATEL"
## [25] "INFY" "IOC" "ITC" "JSWSTEEL"
## [29] "KOTAKBANK" "LT" "MARUTI" "MM"
## [33] "NESTLEIND" "NIFTY50_all" "NTPC" "ONGC"
## [37] "POWERGRID" "RELIANCE" "SBIN" "SHREECEM"
## [41] "stock_metadata" "SUNPHARMA" "TATAMOTORS" "TATASTEEL"
## [45] "TCS" "TECHM" "TITAN" "ULTRACEMCO"
## [49] "UPL" "VEDL" "WIPRO" "ZEEL"
#Extract Zee Entertainment Enterprises Ltd dataset
zee<-stock_market_data$ZEEL
#Displaying first rows
head(zee)
## Date Symbol Series Prev.Close Open High Low Last Close
## 1 2000-01-03 ZEETELE EQ 1092.55 1175.00 1179.95 1160.00 1179.95 1179.95
## 2 2000-01-04 ZEETELE EQ 1179.95 1220.00 1274.35 1183.10 1274.35 1260.65
## 3 2000-01-05 ZEETELE EQ 1260.65 1160.55 1317.70 1159.80 1190.95 1176.55
## 4 2000-01-06 ZEETELE EQ 1176.55 1195.00 1200.00 1095.00 1106.00 1115.45
## 5 2000-01-07 ZEETELE EQ 1115.45 1097.10 1097.10 1026.25 1026.25 1026.25
## 6 2000-01-10 ZEETELE EQ 1026.25 1026.25 1026.25 944.30 962.00 966.70
## VWAP Volume Turnover Trades Deliverable.Volume X.Deliverble
## 1 1177.03 1261391 1.484690e+14 NA NA NA
## 2 1228.02 4616547 5.669220e+14 NA NA NA
## 3 1238.35 8763127 1.085178e+15 NA NA NA
## 4 1135.04 5164020 5.861353e+14 NA NA NA
## 5 1029.94 755129 7.777374e+13 NA NA NA
## 6 980.49 3942813 3.865885e+14 NA NA NA
colnames(zee)
## [1] "Date" "Symbol" "Series"
## [4] "Prev.Close" "Open" "High"
## [7] "Low" "Last" "Close"
## [10] "VWAP" "Volume" "Turnover"
## [13] "Trades" "Deliverable.Volume" "X.Deliverble"
# Convert date column into date format
zee$Date<- as.Date(zee$Date)
#check structure of dataset
str(zee)
## 'data.frame': 5306 obs. of 15 variables:
## $ Date : Date, format: "2000-01-03" "2000-01-04" ...
## $ Symbol : chr "ZEETELE" "ZEETELE" "ZEETELE" "ZEETELE" ...
## $ Series : chr "EQ" "EQ" "EQ" "EQ" ...
## $ Prev.Close : num 1093 1180 1261 1177 1115 ...
## $ Open : num 1175 1220 1161 1195 1097 ...
## $ High : num 1180 1274 1318 1200 1097 ...
## $ Low : num 1160 1183 1160 1095 1026 ...
## $ Last : num 1180 1274 1191 1106 1026 ...
## $ Close : num 1180 1261 1177 1115 1026 ...
## $ VWAP : num 1177 1228 1238 1135 1030 ...
## $ Volume : int 1261391 4616547 8763127 5164020 755129 3942813 6802005 2968833 2251046 2949092 ...
## $ Turnover : num 1.48e+14 5.67e+14 1.09e+15 5.86e+14 7.78e+13 ...
## $ Trades : num NA NA NA NA NA NA NA NA NA NA ...
## $ Deliverable.Volume: num NA NA NA NA NA NA NA NA NA NA ...
## $ X.Deliverble : num NA NA NA NA NA NA NA NA NA NA ...
# Create a function to profile stock datasets
profile_stock <- function(data){
# Trading date range
cat("TRADING DATE RANGE\n")
print(range(data$Date))
# Missing values
cat("\nMISSING VALUES\n")
print(colSums(is.na(data)))
# First 5 rows
cat("\nTOP 5 ROWS\n")
print(head(data, 5))
# Data types
cat("\nDATA TYPES\n")
print(str(data))
# Summary statistics
cat("\nSUMMARY STATISTICS\n")
print(summary(data))
# Outlier detection using boxplot
cat("\nOUTLIERS\n")
# Select numeric columns
numeric_cols <- sapply(data, is.numeric)
# Loop through numeric columns
for(col in names(data[, numeric_cols])){
# Use boxplot to detect outliers
outliers <- boxplot.stats(data[[col]])$out
# Print number of outliers
cat(col, ":", length(outliers), "outliers\n")
}
}
#Apply function Zeel dataset
profile_stock(zee)
## TRADING DATE RANGE
## [1] "2000-01-03" "2021-04-30"
##
## MISSING VALUES
## Date Symbol Series Prev.Close
## 0 0 0 0
## Open High Low Last
## 0 0 0 0
## Close VWAP Volume Turnover
## 0 0 0 0
## Trades Deliverable.Volume X.Deliverble
## 2850 519 519
##
## TOP 5 ROWS
## Date Symbol Series Prev.Close Open High Low Last Close
## 1 2000-01-03 ZEETELE EQ 1092.55 1175.00 1179.95 1160.00 1179.95 1179.95
## 2 2000-01-04 ZEETELE EQ 1179.95 1220.00 1274.35 1183.10 1274.35 1260.65
## 3 2000-01-05 ZEETELE EQ 1260.65 1160.55 1317.70 1159.80 1190.95 1176.55
## 4 2000-01-06 ZEETELE EQ 1176.55 1195.00 1200.00 1095.00 1106.00 1115.45
## 5 2000-01-07 ZEETELE EQ 1115.45 1097.10 1097.10 1026.25 1026.25 1026.25
## VWAP Volume Turnover Trades Deliverable.Volume X.Deliverble
## 1 1177.03 1261391 1.484690e+14 NA NA NA
## 2 1228.02 4616547 5.669220e+14 NA NA NA
## 3 1238.35 8763127 1.085178e+15 NA NA NA
## 4 1135.04 5164020 5.861353e+14 NA NA NA
## 5 1029.94 755129 7.777374e+13 NA NA NA
##
## DATA TYPES
## 'data.frame': 5306 obs. of 15 variables:
## $ Date : Date, format: "2000-01-03" "2000-01-04" ...
## $ Symbol : chr "ZEETELE" "ZEETELE" "ZEETELE" "ZEETELE" ...
## $ Series : chr "EQ" "EQ" "EQ" "EQ" ...
## $ Prev.Close : num 1093 1180 1261 1177 1115 ...
## $ Open : num 1175 1220 1161 1195 1097 ...
## $ High : num 1180 1274 1318 1200 1097 ...
## $ Low : num 1160 1183 1160 1095 1026 ...
## $ Last : num 1180 1274 1191 1106 1026 ...
## $ Close : num 1180 1261 1177 1115 1026 ...
## $ VWAP : num 1177 1228 1238 1135 1030 ...
## $ Volume : int 1261391 4616547 8763127 5164020 755129 3942813 6802005 2968833 2251046 2949092 ...
## $ Turnover : num 1.48e+14 5.67e+14 1.09e+15 5.86e+14 7.78e+13 ...
## $ Trades : num NA NA NA NA NA NA NA NA NA NA ...
## $ Deliverable.Volume: num NA NA NA NA NA NA NA NA NA NA ...
## $ X.Deliverble : num NA NA NA NA NA NA NA NA NA NA ...
## NULL
##
## SUMMARY STATISTICS
## Date Symbol Series Prev.Close
## Min. :2000-01-03 Length :5306 Length :5306 Min. : 62.3
## 1st Qu.:2005-04-13 N.unique : 2 N.unique : 1 1st Qu.: 143.2
## Median :2010-08-17 N.blank : 0 N.blank : 0 Median : 238.2
## Mean :2010-08-18 Min.nchar: 4 Min.nchar: 2 Mean : 273.4
## 3rd Qu.:2015-12-17 Max.nchar: 7 Max.nchar: 2 3rd Qu.: 345.6
## Max. :2021-04-30 Max. :1541.7
##
## Open High Low Last
## Min. : 62 Min. : 66.3 Min. : 60.1 Min. : 62.7
## 1st Qu.: 144 1st Qu.: 146.9 1st Qu.: 140.0 1st Qu.: 143.5
## Median : 238 Median : 244.0 Median : 231.4 Median : 237.7
## Mean : 274 Mean : 279.6 Mean : 267.6 Mean : 273.2
## 3rd Qu.: 346 3rd Qu.: 352.8 3rd Qu.: 338.4 3rd Qu.: 345.1
## Max. :1640 Max. :1645.0 Max. :1512.2 Max. :1564.0
##
## Close VWAP Volume Turnover
## Min. : 62.3 Min. : 63.08 Min. : 4415 Min. :7.021e+10
## 1st Qu.: 143.2 1st Qu.: 143.68 1st Qu.: 1218226 1st Qu.:2.595e+13
## Median : 238.1 Median : 238.90 Median : 2138807 Median :5.250e+13
## Mean : 273.2 Mean : 273.63 Mean : 4825422 Mean :1.249e+14
## 3rd Qu.: 345.6 3rd Qu.: 345.64 3rd Qu.: 4532904 3rd Qu.:1.137e+14
## Max. :1541.7 Max. :1578.11 Max. :165959680 Max. :4.286e+15
##
## Trades Deliverable.Volume X.Deliverble
## Min. : 296 Min. : 4415 Min. :0.0557
## 1st Qu.: 24579 1st Qu.: 513686 1st Qu.:0.3073
## Median : 41074 Median : 893532 Median :0.4635
## Mean : 62646 Mean : 1415718 Mean :0.4522
## 3rd Qu.: 71463 3rd Qu.: 1593444 3rd Qu.:0.5939
## Max. :1088460 Max. :42891428 Max. :1.0000
## NAs :2850 NAs :519 NAs :519
##
## OUTLIERS
## Prev.Close : 85 outliers
## Open : 83 outliers
## High : 87 outliers
## Low : 81 outliers
## Last : 83 outliers
## Close : 84 outliers
## VWAP : 87 outliers
## Volume : 672 outliers
## Turnover : 744 outliers
## Trades : 227 outliers
## Deliverable.Volume : 382 outliers
## X.Deliverble : 0 outliers
##Trading Volume Trend
#trading volume trend chart
ggplot(zee, aes(x= Date, y= Volume))+
#create line chart
geom_line(color="darkgreen")+
#Add title and Labels
labs(
title = "Trading Volume Trend for Zeel",
x = "Date",
y = "Trading Volume"
)
##Identifying Unusual volume spikes
volume_threshold <- quantile(zee$Volume,0.99)
#Extract unusually high trading volume
volume_spikes <- zee[zee$Volume > volume_threshold, ]
#display spike dates and volumes
head(volume_spikes[, c("Date","Volume")])
## Date Volume
## 3715 2014-11-25 60972072
## 4748 2019-01-25 74261004
## 4749 2019-01-28 121305516
## 4815 2019-05-08 67279826
## 4820 2019-05-15 43282009
## 4907 2019-09-20 53252658
#Top 10 highest trading volumes
volume_spikes %>%
arrange(desc(Volume)) %>%
head(10)
## Date Symbol Series Prev.Close Open High Low Last Close
## 1 2020-08-20 ZEEL EQ 196.80 195.00 207.00 192.50 199.05 199.45
## 2 2020-08-19 ZEEL EQ 173.95 172.50 200.00 169.25 198.40 196.80
## 3 2019-01-28 ZEEL EQ 318.40 350.20 382.00 320.00 368.50 373.30
## 4 2019-11-21 ZEEL EQ 307.00 337.70 364.00 320.20 343.00 345.15
## 5 2021-02-05 ZEEL EQ 249.45 251.50 251.50 212.05 216.70 215.30
## 6 2020-08-21 ZEEL EQ 199.45 199.40 204.00 189.15 192.05 192.05
## 7 2020-09-24 ZEEL EQ 191.80 185.00 195.80 182.85 193.50 190.35
## 8 2020-08-24 ZEEL EQ 192.05 195.00 203.85 193.45 201.20 201.60
## 9 2019-01-25 ZEEL EQ 433.85 437.75 439.35 288.30 299.70 318.40
## 10 2020-05-28 ZEEL EQ 164.45 164.40 189.05 163.30 180.20 180.70
## VWAP Volume Turnover Trades Deliverable.Volume X.Deliverble
## 1 200.60 165959680 3.329180e+15 732032 19283601 0.1162
## 2 186.51 144863037 2.701825e+15 626425 31883619 0.2201
## 3 353.29 121305516 4.285622e+15 1088460 37531534 0.3094
## 4 337.95 100218862 3.386853e+15 699898 23719948 0.2367
## 5 222.43 83931232 1.866843e+15 465910 18462973 0.2200
## 6 194.66 82296210 1.602012e+15 382492 12360894 0.1502
## 7 190.14 76316695 1.451102e+15 279397 24361908 0.3192
## 8 200.02 74694058 1.494055e+15 369116 8821541 0.1181
## 9 353.90 74261004 2.628104e+15 757676 18117711 0.2440
## 10 178.30 71029787 1.266447e+15 383014 7807378 0.1099
##Trading Volume with spike annotations
ggplot(zee, aes(x = Date, y = Volume))+
#trading volume line
geom_line(color = "darkgreen") +
#Annotate unusual spikes
geom_vline(xintercept = as.Date("2014-11-25"),color = "red", linetype = "dashed")+
geom_vline(xintercept = as.Date("2019-01-28"),color = "blue", linetype = "dashed")+
geom_vline(xintercept = as.Date("2019-09-20"),color = "purple", linetype = "dashed")+
# Labels
annotate("text",x = as.Date("2014-11-25"),y = max(zee$Volume),label = "volume spike 2014", angle = 90, color = "red",size = 3)+
annotate("text",x = as.Date("2019-01-28"),y = max(zee$Volume)*0.8,label = "High trading activity", angle = 90, color = "blue",size = 3)+
annotate("text",x = as.Date("2019-09-20"),y = max(zee$Volume)*0.6,label = "Market Volatility", angle = 90, color = "purple",size = 3)+
#Labels
labs(title= "Trading volume trend with unusual spikes", x = "Date",y ="Trading Volume")
#4.3 Visualization
##CLOSING PRICE TREND
#plot closing price trend
ggplot(zee, aes(x = Date, y = Close)) +
# Draw line chart
geom_line(color = ("darkblue")) +
# adding labels
labs(title = "Closing Price Trend of ZEEL Stock",
x= "Date",
y = "Closing Price"
)
The closing price trend shows that ZEEL stock prices fluctuated considerably between 2000 and 2021, reflecting changes in market conditions and investor sentiment.
# Calculate percentage change
zee$Percent.Change <- ((zee$Close - zee$Prev.Close)/ zee$Prev.Close)*100
#Display first rows
head(zee[, c(
"Date",
"Close",
"Prev.Close",
"Percent.Change"
)])
## Date Close Prev.Close Percent.Change
## 1 2000-01-03 1179.95 1092.55 7.999634
## 2 2000-01-04 1260.65 1179.95 6.839273
## 3 2000-01-05 1176.55 1260.65 -6.671162
## 4 2000-01-06 1115.45 1176.55 -5.193149
## 5 2000-01-07 1026.25 1115.45 -7.996773
## 6 2000-01-10 966.70 1026.25 -5.802680
ggplot(zee,aes(x = Date, y = Percent.Change))+
#Draw line chart
geom_line(color = "darkred")+
#Add horizontal reference line
geom_hline(yintercept = 0,linetype = "dashed")+
#Add labels
labs(title = "Daily Percentage Change in Closing Price",
x = "Date",
y = "Percentage change(%)")+
theme_minimal()
The percentage change trend indicates periods of high volatility with both positive and negative returns. Large fluctuations were observed during periods of market uncertainty.
##Sales Volume Trend
ggplot(zee, aes(x = Date, y = Volume))+
#sales volume line
geom_line(color = "darkgreen") +
#Labels
labs(title = "Sales Volume Trend", x = "Date",y ="Sales Volume")
##CALCULATE MOVING AVERAGES
#Calculate 15-day moving average
zee$MA15 <- rollmean(zee$Close,15, fill = NA, align = "right")
#Calculate 30-day moving average
zee$MA30 <- rollmean(zee$Close, 30, fill = NA, align = "right")
#Calculate 45-day moving average
zee$MA45 <- rollmean(zee$Close, 45, fill = NA, align = "right")
#Display first rows
head(zee)
## Date Symbol Series Prev.Close Open High Low Last Close
## 1 2000-01-03 ZEETELE EQ 1092.55 1175.00 1179.95 1160.00 1179.95 1179.95
## 2 2000-01-04 ZEETELE EQ 1179.95 1220.00 1274.35 1183.10 1274.35 1260.65
## 3 2000-01-05 ZEETELE EQ 1260.65 1160.55 1317.70 1159.80 1190.95 1176.55
## 4 2000-01-06 ZEETELE EQ 1176.55 1195.00 1200.00 1095.00 1106.00 1115.45
## 5 2000-01-07 ZEETELE EQ 1115.45 1097.10 1097.10 1026.25 1026.25 1026.25
## 6 2000-01-10 ZEETELE EQ 1026.25 1026.25 1026.25 944.30 962.00 966.70
## VWAP Volume Turnover Trades Deliverable.Volume X.Deliverble
## 1 1177.03 1261391 1.484690e+14 NA NA NA
## 2 1228.02 4616547 5.669220e+14 NA NA NA
## 3 1238.35 8763127 1.085178e+15 NA NA NA
## 4 1135.04 5164020 5.861353e+14 NA NA NA
## 5 1029.94 755129 7.777374e+13 NA NA NA
## 6 980.49 3942813 3.865885e+14 NA NA NA
## Percent.Change MA15 MA30 MA45
## 1 7.999634 NA NA NA
## 2 6.839273 NA NA NA
## 3 -6.671162 NA NA NA
## 4 -5.193149 NA NA NA
## 5 -7.996773 NA NA NA
## 6 -5.802680 NA NA NA
##MOVING AVERAGE VISUALIZATION
ggplot(zee, aes(x = Date))+
#Closing price line
geom_line(aes(y = Close, color = "Close Price"))+
# 15-day moving average
geom_line(aes(y = MA15, color = "MA15"))+
# 30-day moving average
geom_line(aes(y = MA30, color = "MA30"))+
# 45-day moving average
geom_line(aes(y = MA45, color = "MA45"))+
#labels
labs(title = "Closing Price with Moving Averages",
x = "Date",
y = "Price",
color = "Legend")
##HISTOGRAM OF PERCENTAGE CHANGE
#creating histogram
ggplot(zee, aes(x= Percent.Change))+
#drawing histogram
geom_histogram(bins= 48,
fill = "darkblue",
color = "white")+
#Adding labels
labs(title = "Distribution of Percentage Change",
x = "Percentage Change",
y= " Frequency")
The histogram shows that most percentage changes are concentrated around
zero, indicating that daily stock price movements are usually small.
However, some extreme values are present, showing periods of high
volatility.
##Select numerical Variables for correlation
correlation_data <-zee[, c("Prev.Close","Open","High","Low", "Volume","Close")]
#Display first rows
head(correlation_data)
## Prev.Close Open High Low Volume Close
## 1 1092.55 1175.00 1179.95 1160.00 1261391 1179.95
## 2 1179.95 1220.00 1274.35 1183.10 4616547 1260.65
## 3 1260.65 1160.55 1317.70 1159.80 8763127 1176.55
## 4 1176.55 1195.00 1200.00 1095.00 5164020 1115.45
## 5 1115.45 1097.10 1097.10 1026.25 755129 1026.25
## 6 1026.25 1026.25 1026.25 944.30 3942813 966.70
##Correlation Matrix
#Calculate correlation matrix
correlation_matrix <- cor(
correlation_data, use = "complete.obs"
)
#Display correlation matrix
correlation_matrix
## Prev.Close Open High Low Volume
## Prev.Close 1.00000000 0.99942268 0.99885060 0.99838428 -0.04934908
## Open 0.99942268 1.00000000 0.99913584 0.99869374 -0.04832402
## High 0.99885060 0.99913584 1.00000000 0.99814370 -0.03974465
## Low 0.99838428 0.99869374 0.99814370 1.00000000 -0.05694402
## Volume -0.04934908 -0.04832402 -0.03974465 -0.05694402 1.00000000
## Close 0.99790147 0.99811311 0.99880555 0.99911821 -0.04744036
## Close
## Prev.Close 0.99790147
## Open 0.99811311
## High 0.99880555
## Low 0.99911821
## Volume -0.04744036
## Close 1.00000000
##Correlation Visualization
#Plot correlation matrix
corrplot(correlation_matrix,
method = "color",
type = "upper",
tl.col = "black",
tl.sr = 45)
The correlation analysis shows that Prev.Close, Open, High, Low, and
Close have very strong positive correlations. Most correlations are
close to 1, indicating that these stock price variables move
together.
Volume has weaker correlations with price related variables. The high correlations among predictors may cause multicollinearity in regression analysis, making coefficient estimates unstable and reducing interpretability.
##PREPARE DATA FOR REGRESSION
#Selecting variables for regression
regression_data <- zee[, c("Close",
"Prev.Close",
"Open",
"High",
"Low",
"Volume")]
# removing missing values
regression_data <- na.omit(regression_data)
#displaying first rows
head(regression_data)
## Close Prev.Close Open High Low Volume
## 1 1179.95 1092.55 1175.00 1179.95 1160.00 1261391
## 2 1260.65 1179.95 1220.00 1274.35 1183.10 4616547
## 3 1176.55 1260.65 1160.55 1317.70 1159.80 8763127
## 4 1115.45 1176.55 1195.00 1200.00 1095.00 5164020
## 5 1026.25 1115.45 1097.10 1097.10 1026.25 755129
## 6 966.70 1026.25 1026.25 1026.25 944.30 3942813
##TRAIN TEST SPLIT
# Set seed
set.seed((123))
#creating row indexes for training data
train_index <- sample(
1:nrow(regression_data),
0.8 * nrow(regression_data)
)
# Creating training dataset
train_data <- regression_data[train_index, ]
# Creating test dataset
test_data <- regression_data[-train_index, ]
# Print dataset sizes
cat("Training data size:", nrow(train_data), "\n")
## Training data size: 4244
cat("Test data size:", nrow(test_data), "\n")
## Test data size: 1062
##TRAIN LINEAR REGRESSION MODEL
#Train regression model
model <-lm(
Close ~ Prev.Close + Open + High + Low + Volume,
data = train_data
)
# Display model summary
summary(model)
##
## Call:
## lm(formula = Close ~ Prev.Close + Open + High + Low + Volume,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91.224 -1.547 -0.319 1.264 66.371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.095e-01 1.483e-01 4.110 4.03e-05 ***
## Prev.Close -8.240e-02 1.225e-02 -6.729 1.94e-11 ***
## Open -4.747e-01 1.511e-02 -31.422 < 2e-16 ***
## High 7.474e-01 1.065e-02 70.184 < 2e-16 ***
## Low 8.079e-01 8.767e-03 92.151 < 2e-16 ***
## Volume 1.639e-08 9.238e-09 1.774 0.0762 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.846 on 4238 degrees of freedom
## Multiple R-squared: 0.9992, Adjusted R-squared: 0.9992
## F-statistic: 1.112e+06 on 5 and 4238 DF, p-value: < 2.2e-16
# Predict closing prices
predictions <- predict(
model,
newdata = test_data
)
# Display first predictions
head(predictions)
## 4 6 8 15 23 24
## 1117.9757 958.8398 1074.6155 1162.2597 1326.6539 1365.6769
##MODEL PERFORMANCE
# Calculate RMSE
rmse_value <- rmse(
test_data$Close,
predictions
)
# Calculate MAE
mae_value <- mae(
test_data$Close,
predictions
)
# Calculate R squared
r2_value <- cor(
test_data$Close,
predictions
)^2
# Print metrics
cat("RMSE:", rmse_value, "\n")
## RMSE: 4.847351
cat("MAE:", mae_value, "\n")
## MAE: 2.403604
cat("Rsquared:", r2_value, "\n")
## Rsquared: 0.9992598
The regression model achieved a high R² value, indicating strong predictive performance. Low RMSE and MAE values suggest that prediction errors were relatively small.
##Actual vs Predicted Scatter Plot
#Creating dataframe for plotting
results <- data.frame(
Actual = test_data$Close,
Predicted = predictions
)
# Plot actual vs predicted values
ggplot(results, aes(x = Actual, y = Predicted)) +
# Scatter points
geom_point(color = "blue", alpha = 0.5) +
# 45-degree reference line
geom_abline(
slope = 1,
intercept = 0,
color = "red",
linetype = "dashed"
) +
# Add labels
labs(
title = "Actual vs Predicted Closing Prices",
x = "Actual Closing Price",
y = "Predicted Closing Price"
)+
theme_minimal()
The scatter plot shows that most predicted values lie close to the 45 degree reference line, indicating good model performance. The spread around the line represents prediction errors. Smaller spread indicates more accurate predictions, while points farther from the line indicate larger prediction errors.
##Standardize independent variables
# Selecting independent variables
x_variables <- regression_data[, c(
"Prev.Close",
"Open",
"High",
"Low",
"Volume"
)]
# Standardize variables
x_scaled <- scale(x_variables)
# Display first rows
head(x_scaled)
## Prev.Close Open High Low Volume
## [1,] 4.643600 5.079458 4.967118 5.177397 -0.42418743
## [2,] 5.139056 5.333141 5.487904 5.311414 -0.02486008
## [3,] 5.596531 4.997997 5.727058 5.176237 0.46866176
## [4,] 5.119782 5.192206 5.077730 4.800292 0.04029961
## [5,] 4.773416 4.640303 4.510050 4.401431 -0.48444223
## [6,] 4.267756 4.240891 4.119185 3.925988 -0.10504723
##VERIFY SCALING
# Calculate means
scaled_means <- colMeans(x_scaled)
# Calculate standard deviations
scaled_sd <- apply(
x_scaled,
2,
sd
)
# Print means
cat("Means after scaling:\n")
## Means after scaling:
print(scaled_means)
## Prev.Close Open High Low Volume
## 1.260117e-16 -1.713930e-17 3.974498e-17 -4.879196e-17 1.369993e-17
# Print standard deviations
cat("\nStandard deviations after scaling:\n")
##
## Standard deviations after scaling:
print(scaled_sd)
## Prev.Close Open High Low Volume
## 1 1 1 1 1
The standardized variables produced means very close to zero due to floating point precision and standard deviations close to one, confirming successful scaling of the independent variables.
pca_model <- prcomp(
x_scaled,
center = TRUE,
scale. = TRUE
)
#PCA summary
summary(pca_model)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.9999 0.9985 0.04346 0.03368 0.02263
## Proportion of Variance 0.7999 0.1994 0.00038 0.00023 0.00010
## Cumulative Proportion 0.7999 0.9993 0.99967 0.99990 1.00000
pca_data <- data.frame(
PC1 = pca_model$x[,1],
PC2 = pca_model$x[,2],
Close = regression_data$Close
)
# Display first rows
head(pca_data)
## PC1 PC2 Close
## 1 -9.942295 0.1037662 1179.95
## 2 -10.630962 -0.3212474 1260.65
## 3 -10.728045 -0.8209573 1176.55
## 4 -10.088440 -0.3694890 1115.45
## 5 -9.173517 0.1867117 1026.25
## 6 -8.276006 -0.1645722 966.70
# Training data
pca_train <- pca_data[train_index, ]
# Test data
pca_test <- pca_data[-train_index, ]
# Display sizes
cat("PCA Training Size:", nrow(pca_train), "\n")
## PCA Training Size: 4244
cat("PCA Test Size:", nrow(pca_test), "\n")
## PCA Test Size: 1062
# Train regression model using PCA components
pca_regression_model <- lm(
Close ~ PC1 + PC2,
data = pca_train
)
# Display model summary
summary(pca_regression_model)
##
## Call:
## lm(formula = Close ~ PC1 + PC2, data = pca_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -81.479 -2.469 -0.229 2.471 79.596
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 273.22368 0.12195 2240.42 <2e-16 ***
## PC1 -87.91147 0.06115 -1437.58 <2e-16 ***
## PC2 -2.91021 0.12213 -23.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.945 on 4241 degrees of freedom
## Multiple R-squared: 0.998, Adjusted R-squared: 0.998
## F-statistic: 1.034e+06 on 2 and 4241 DF, p-value: < 2.2e-16
# Predict test values
pca_predictions <- predict(
pca_regression_model,
newdata = pca_test
)
# Display first predictions
head(pca_predictions)
## 4 6 8 15 23 24
## 1161.189 1001.258 1039.965 1149.208 1272.853 1366.522
# Calculating PCA R squared
pca_r2 <- cor(
pca_test$Close,
pca_predictions
)^2
# Printing PCA R2
cat("PCA Model R squared:", pca_r2)
## PCA Model R squared: 0.9976517
The raw regression model performed slightly better than the PCA model because it retained all original predictor variables. Although PCA reduced multicollinearity, reducing the data to two principal components resulted in slight information loss, leading to a lower R² value compared to the raw model.
##LOAD VEDANTA DATASET
# Extract Vedanta dataset
vedl <- stock_market_data$VEDL
# Convert Date column
vedl$Date <- as.Date(vedl$Date)
# Display first rows
head(vedl)
## Date Symbol Series Prev.Close Open High Low Last Close
## 1 2000-01-03 SESAGOA EQ 107.70 111.00 116.35 108.00 116.35 116.35
## 2 2000-01-04 SESAGOA EQ 116.35 113.75 116.00 108.00 116.00 114.70
## 3 2000-01-05 SESAGOA EQ 114.70 107.55 115.85 107.55 114.50 114.00
## 4 2000-01-06 SESAGOA EQ 114.00 112.00 123.10 112.00 118.80 119.30
## 5 2000-01-07 SESAGOA EQ 119.30 119.85 120.00 114.05 116.50 116.50
## 6 2000-01-10 SESAGOA EQ 116.50 120.50 120.85 116.00 119.00 119.00
## VWAP Volume Turnover Trades Deliverable.Volume X.Deliverble
## 1 114.80 20371 233859660000 NA NA NA
## 2 113.34 22366 253499440000 NA NA NA
## 3 112.78 18305 206436075000 NA NA NA
## 4 119.89 25800 309313325000 NA NA NA
## 5 116.84 17361 202840260000 NA NA NA
## 6 118.69 20707 245767815000 NA NA NA
# Extract year for ZEEL
zee$Year <- format(
zee$Date,
"%Y"
)
# Extract year for VEDL
vedl$Year <- format(
vedl$Date,
"%Y"
)
zee_volume <- aggregate(
Volume ~ Year,
data = zee,
sum
)
# Adding company name
zee_volume$Company <- "ZEEL"
# Annual volume for VEDL
vedl_volume <- aggregate(
Volume ~ Year,
data = vedl,
sum
)
# Adding company name
vedl_volume$Company <- "VEDL"
# Combine datasets
annual_volume <- rbind(
zee_volume,
vedl_volume
)
# Display first rows
head(annual_volume)
## Year Volume Company
## 1 2000 2124967599 ZEEL
## 2 2001 2136706114 ZEEL
## 3 2002 1402554565 ZEEL
## 4 2003 1000731804 ZEEL
## 5 2004 569852081 ZEEL
## 6 2005 459053470 ZEEL
# Plot annual sales volume
ggplot(
annual_volume,
aes(
x = Year,
y = Volume,
fill = Company
)
) +
# Creating clustered bars
geom_bar(
stat = "identity",
position = "dodge"
) +
# Adding labels
labs(
title = "Annual Sales Volume Comparison",
x = "Year",
y = "Total Volume"
) +
# Rotate year labels
theme(
axis.text.x = element_text(angle = 90)
)
## HIGHEST VOLUME IN 2018
# Filtering 2018 data
volume_2018 <- annual_volume[
annual_volume$Year == "2018",
]
# Displaying results
volume_2018
## Year Volume Company
## 19 2018 598775408 ZEEL
## 41 2018 3276296751 VEDL
Vedanta Ltd (VEDL) recorded the highest sales volume during 2018. ##LEAST VOLUME IN 2021
# Filter 2021 data
volume_2021 <- annual_volume[
annual_volume$Year == "2021",
]
# Displaying results
volume_2021
## Year Volume Company
## 22 2021 1302490317 ZEEL
## 44 2021 1430971315 VEDL
Zee Entertainment Enterprises Ltd (ZEEL) recorded the lowest sales volume during 2021.
##RATIO OF ZEEL TO VEDL IN 2021
# Extracting ZEEL 2021 volume
zee_2021 <- volume_2021$Volume[
volume_2021$Company == "ZEEL"
]
# Extracting VEDL 2021 volume
vedl_2021 <- volume_2021$Volume[
volume_2021$Company == "VEDL"
]
# Calculating ratio
ratio <- zee_2021 / vedl_2021
# Printing ratio
cat(
"Ratio of ZEEL to VEDL sales volume in 2021:",
ratio
)
## Ratio of ZEEL to VEDL sales volume in 2021: 0.9102141
The ratio of ZEEL sales volume to VEDL sales volume during 2021 was approximately 0.91.