#LOAD REQUIRED LIBRARIES

library(tidyverse)
library(ggplot2)
library(zoo)
library(corrplot)
library(Metrics)

#Set working directory to project folder
setwd("C:/Users/justine/Desktop/Midterm")

1 4.1 GET ALL CSV FILES

# Get all CSV files from data folder

mydatasets <- list.files(path = "data",pattern = "*.csv",full.names = TRUE)

# Display all file paths

mydatasets

##  [1] "data/ADANIPORTS.csv"     "data/ASIANPAINT.csv"    
##  [3] "data/AXISBANK.csv"       "data/BAJAJ-AUTO.csv"    
##  [5] "data/BAJAJFINSV.csv"     "data/BAJFINANCE.csv"    
##  [7] "data/BHARTIARTL.csv"     "data/BPCL.csv"          
##  [9] "data/BRITANNIA.csv"      "data/CIPLA.csv"         
## [11] "data/COALINDIA.csv"      "data/DRREDDY.csv"       
## [13] "data/EICHERMOT.csv"      "data/GAIL.csv"          
## [15] "data/GRASIM.csv"         "data/HCLTECH.csv"       
## [17] "data/HDFC.csv"           "data/HDFCBANK.csv"      
## [19] "data/HEROMOTOCO.csv"     "data/HINDALCO.csv"      
## [21] "data/HINDUNILVR.csv"     "data/ICICIBANK.csv"     
## [23] "data/INDUSINDBK.csv"     "data/INFRATEL.csv"      
## [25] "data/INFY.csv"           "data/IOC.csv"           
## [27] "data/ITC.csv"            "data/JSWSTEEL.csv"      
## [29] "data/KOTAKBANK.csv"      "data/LT.csv"            
## [31] "data/MARUTI.csv"         "data/MM.csv"            
## [33] "data/NESTLEIND.csv"      "data/NIFTY50_all.csv"   
## [35] "data/NTPC.csv"           "data/ONGC.csv"          
## [37] "data/POWERGRID.csv"      "data/RELIANCE.csv"      
## [39] "data/SBIN.csv"           "data/SHREECEM.csv"      
## [41] "data/stock_metadata.csv" "data/SUNPHARMA.csv"     
## [43] "data/TATAMOTORS.csv"     "data/TATASTEEL.csv"     
## [45] "data/TCS.csv"            "data/TECHM.csv"         
## [47] "data/TITAN.csv"          "data/ULTRACEMCO.csv"    
## [49] "data/UPL.csv"            "data/VEDL.csv"          
## [51] "data/WIPRO.csv"          "data/ZEEL.csv"

##LOAD THE DATASETS

#Read all csv files into a list 
stock_market_data<- lapply(mydatasets, read.csv)

##ASSIGN NAMES TO DATASETS

# Remove .csv extension from file names

names(stock_market_data) <- gsub(
  ".csv",
  "",
  basename(mydatasets)
)

# Display dataset names

names(stock_market_data)

##  [1] "ADANIPORTS"     "ASIANPAINT"     "AXISBANK"       "BAJAJ-AUTO"    
##  [5] "BAJAJFINSV"     "BAJFINANCE"     "BHARTIARTL"     "BPCL"          
##  [9] "BRITANNIA"      "CIPLA"          "COALINDIA"      "DRREDDY"       
## [13] "EICHERMOT"      "GAIL"           "GRASIM"         "HCLTECH"       
## [17] "HDFC"           "HDFCBANK"       "HEROMOTOCO"     "HINDALCO"      
## [21] "HINDUNILVR"     "ICICIBANK"      "INDUSINDBK"     "INFRATEL"      
## [25] "INFY"           "IOC"            "ITC"            "JSWSTEEL"      
## [29] "KOTAKBANK"      "LT"             "MARUTI"         "MM"            
## [33] "NESTLEIND"      "NIFTY50_all"    "NTPC"           "ONGC"          
## [37] "POWERGRID"      "RELIANCE"       "SBIN"           "SHREECEM"      
## [41] "stock_metadata" "SUNPHARMA"      "TATAMOTORS"     "TATASTEEL"     
## [45] "TCS"            "TECHM"          "TITAN"          "ULTRACEMCO"    
## [49] "UPL"            "VEDL"           "WIPRO"          "ZEEL"

#Extract Zee Entertainment Enterprises Ltd dataset
zee<-stock_market_data$ZEEL
#Displaying first rows
head(zee)

##         Date  Symbol Series Prev.Close    Open    High     Low    Last   Close
## 1 2000-01-03 ZEETELE     EQ    1092.55 1175.00 1179.95 1160.00 1179.95 1179.95
## 2 2000-01-04 ZEETELE     EQ    1179.95 1220.00 1274.35 1183.10 1274.35 1260.65
## 3 2000-01-05 ZEETELE     EQ    1260.65 1160.55 1317.70 1159.80 1190.95 1176.55
## 4 2000-01-06 ZEETELE     EQ    1176.55 1195.00 1200.00 1095.00 1106.00 1115.45
## 5 2000-01-07 ZEETELE     EQ    1115.45 1097.10 1097.10 1026.25 1026.25 1026.25
## 6 2000-01-10 ZEETELE     EQ    1026.25 1026.25 1026.25  944.30  962.00  966.70
##      VWAP  Volume     Turnover Trades Deliverable.Volume X.Deliverble
## 1 1177.03 1261391 1.484690e+14     NA                 NA           NA
## 2 1228.02 4616547 5.669220e+14     NA                 NA           NA
## 3 1238.35 8763127 1.085178e+15     NA                 NA           NA
## 4 1135.04 5164020 5.861353e+14     NA                 NA           NA
## 5 1029.94  755129 7.777374e+13     NA                 NA           NA
## 6  980.49 3942813 3.865885e+14     NA                 NA           NA

colnames(zee)

##  [1] "Date"               "Symbol"             "Series"            
##  [4] "Prev.Close"         "Open"               "High"              
##  [7] "Low"                "Last"               "Close"             
## [10] "VWAP"               "Volume"             "Turnover"          
## [13] "Trades"             "Deliverable.Volume" "X.Deliverble"

# Convert date column into date format
zee$Date<- as.Date(zee$Date)
#check structure of dataset
str(zee)

## 'data.frame':    5306 obs. of  15 variables:
##  $ Date              : Date, format: "2000-01-03" "2000-01-04" ...
##  $ Symbol            : chr  "ZEETELE" "ZEETELE" "ZEETELE" "ZEETELE" ...
##  $ Series            : chr  "EQ" "EQ" "EQ" "EQ" ...
##  $ Prev.Close        : num  1093 1180 1261 1177 1115 ...
##  $ Open              : num  1175 1220 1161 1195 1097 ...
##  $ High              : num  1180 1274 1318 1200 1097 ...
##  $ Low               : num  1160 1183 1160 1095 1026 ...
##  $ Last              : num  1180 1274 1191 1106 1026 ...
##  $ Close             : num  1180 1261 1177 1115 1026 ...
##  $ VWAP              : num  1177 1228 1238 1135 1030 ...
##  $ Volume            : int  1261391 4616547 8763127 5164020 755129 3942813 6802005 2968833 2251046 2949092 ...
##  $ Turnover          : num  1.48e+14 5.67e+14 1.09e+15 5.86e+14 7.78e+13 ...
##  $ Trades            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Deliverable.Volume: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ X.Deliverble      : num  NA NA NA NA NA NA NA NA NA NA ...

2 4.2 Exploratory Data Analysis

2.1 Creating Stock Profiling Funcion

# Create a function to profile stock datasets

profile_stock <- function(data){

  # Trading date range

  cat("TRADING DATE RANGE\n")

  print(range(data$Date))

  # Missing values

  cat("\nMISSING VALUES\n")

  print(colSums(is.na(data)))

  # First 5 rows

  cat("\nTOP 5 ROWS\n")

  print(head(data, 5))

  # Data types

  cat("\nDATA TYPES\n")

  print(str(data))

  # Summary statistics

  cat("\nSUMMARY STATISTICS\n")

  print(summary(data))
  

# Outlier detection using boxplot

cat("\nOUTLIERS\n")

# Select numeric columns

numeric_cols <- sapply(data, is.numeric)

# Loop through numeric columns

for(col in names(data[, numeric_cols])){

  # Use boxplot to detect outliers

  outliers <- boxplot.stats(data[[col]])$out

  # Print number of outliers

  cat(col, ":", length(outliers), "outliers\n")
}
}
#Apply function Zeel dataset
profile_stock(zee)

## TRADING DATE RANGE
## [1] "2000-01-03" "2021-04-30"
## 
## MISSING VALUES
##               Date             Symbol             Series         Prev.Close 
##                  0                  0                  0                  0 
##               Open               High                Low               Last 
##                  0                  0                  0                  0 
##              Close               VWAP             Volume           Turnover 
##                  0                  0                  0                  0 
##             Trades Deliverable.Volume       X.Deliverble 
##               2850                519                519 
## 
## TOP 5 ROWS
##         Date  Symbol Series Prev.Close    Open    High     Low    Last   Close
## 1 2000-01-03 ZEETELE     EQ    1092.55 1175.00 1179.95 1160.00 1179.95 1179.95
## 2 2000-01-04 ZEETELE     EQ    1179.95 1220.00 1274.35 1183.10 1274.35 1260.65
## 3 2000-01-05 ZEETELE     EQ    1260.65 1160.55 1317.70 1159.80 1190.95 1176.55
## 4 2000-01-06 ZEETELE     EQ    1176.55 1195.00 1200.00 1095.00 1106.00 1115.45
## 5 2000-01-07 ZEETELE     EQ    1115.45 1097.10 1097.10 1026.25 1026.25 1026.25
##      VWAP  Volume     Turnover Trades Deliverable.Volume X.Deliverble
## 1 1177.03 1261391 1.484690e+14     NA                 NA           NA
## 2 1228.02 4616547 5.669220e+14     NA                 NA           NA
## 3 1238.35 8763127 1.085178e+15     NA                 NA           NA
## 4 1135.04 5164020 5.861353e+14     NA                 NA           NA
## 5 1029.94  755129 7.777374e+13     NA                 NA           NA
## 
## DATA TYPES
## 'data.frame':    5306 obs. of  15 variables:
##  $ Date              : Date, format: "2000-01-03" "2000-01-04" ...
##  $ Symbol            : chr  "ZEETELE" "ZEETELE" "ZEETELE" "ZEETELE" ...
##  $ Series            : chr  "EQ" "EQ" "EQ" "EQ" ...
##  $ Prev.Close        : num  1093 1180 1261 1177 1115 ...
##  $ Open              : num  1175 1220 1161 1195 1097 ...
##  $ High              : num  1180 1274 1318 1200 1097 ...
##  $ Low               : num  1160 1183 1160 1095 1026 ...
##  $ Last              : num  1180 1274 1191 1106 1026 ...
##  $ Close             : num  1180 1261 1177 1115 1026 ...
##  $ VWAP              : num  1177 1228 1238 1135 1030 ...
##  $ Volume            : int  1261391 4616547 8763127 5164020 755129 3942813 6802005 2968833 2251046 2949092 ...
##  $ Turnover          : num  1.48e+14 5.67e+14 1.09e+15 5.86e+14 7.78e+13 ...
##  $ Trades            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Deliverable.Volume: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ X.Deliverble      : num  NA NA NA NA NA NA NA NA NA NA ...
## NULL
## 
## SUMMARY STATISTICS
##       Date                  Symbol           Series       Prev.Close    
##  Min.   :2000-01-03   Length   :5306   Length   :5306   Min.   :  62.3  
##  1st Qu.:2005-04-13   N.unique :   2   N.unique :   1   1st Qu.: 143.2  
##  Median :2010-08-17   N.blank  :   0   N.blank  :   0   Median : 238.2  
##  Mean   :2010-08-18   Min.nchar:   4   Min.nchar:   2   Mean   : 273.4  
##  3rd Qu.:2015-12-17   Max.nchar:   7   Max.nchar:   2   3rd Qu.: 345.6  
##  Max.   :2021-04-30                                     Max.   :1541.7  
##                                                                         
##       Open           High             Low              Last       
##  Min.   :  62   Min.   :  66.3   Min.   :  60.1   Min.   :  62.7  
##  1st Qu.: 144   1st Qu.: 146.9   1st Qu.: 140.0   1st Qu.: 143.5  
##  Median : 238   Median : 244.0   Median : 231.4   Median : 237.7  
##  Mean   : 274   Mean   : 279.6   Mean   : 267.6   Mean   : 273.2  
##  3rd Qu.: 346   3rd Qu.: 352.8   3rd Qu.: 338.4   3rd Qu.: 345.1  
##  Max.   :1640   Max.   :1645.0   Max.   :1512.2   Max.   :1564.0  
##                                                                   
##      Close             VWAP             Volume             Turnover        
##  Min.   :  62.3   Min.   :  63.08   Min.   :     4415   Min.   :7.021e+10  
##  1st Qu.: 143.2   1st Qu.: 143.68   1st Qu.:  1218226   1st Qu.:2.595e+13  
##  Median : 238.1   Median : 238.90   Median :  2138807   Median :5.250e+13  
##  Mean   : 273.2   Mean   : 273.63   Mean   :  4825422   Mean   :1.249e+14  
##  3rd Qu.: 345.6   3rd Qu.: 345.64   3rd Qu.:  4532904   3rd Qu.:1.137e+14  
##  Max.   :1541.7   Max.   :1578.11   Max.   :165959680   Max.   :4.286e+15  
##                                                                            
##      Trades        Deliverable.Volume  X.Deliverble   
##  Min.   :    296   Min.   :    4415   Min.   :0.0557  
##  1st Qu.:  24579   1st Qu.:  513686   1st Qu.:0.3073  
##  Median :  41074   Median :  893532   Median :0.4635  
##  Mean   :  62646   Mean   : 1415718   Mean   :0.4522  
##  3rd Qu.:  71463   3rd Qu.: 1593444   3rd Qu.:0.5939  
##  Max.   :1088460   Max.   :42891428   Max.   :1.0000  
##  NAs    :2850      NAs    :519        NAs    :519     
## 
## OUTLIERS
## Prev.Close : 85 outliers
## Open : 83 outliers
## High : 87 outliers
## Low : 81 outliers
## Last : 83 outliers
## Close : 84 outliers
## VWAP : 87 outliers
## Volume : 672 outliers
## Turnover : 744 outliers
## Trades : 227 outliers
## Deliverable.Volume : 382 outliers
## X.Deliverble : 0 outliers

##Trading Volume Trend

#trading volume trend chart
ggplot(zee, aes(x= Date, y= Volume))+
  #create line chart
  geom_line(color="darkgreen")+
  #Add title and Labels
  labs(
    title = "Trading Volume Trend for Zeel",
    x = "Date",
    y = "Trading Volume"
  )

##Identifying Unusual volume spikes

volume_threshold <- quantile(zee$Volume,0.99)
#Extract unusually high trading volume
volume_spikes <- zee[zee$Volume > volume_threshold, ]
#display spike dates and volumes
head(volume_spikes[, c("Date","Volume")])

##            Date    Volume
## 3715 2014-11-25  60972072
## 4748 2019-01-25  74261004
## 4749 2019-01-28 121305516
## 4815 2019-05-08  67279826
## 4820 2019-05-15  43282009
## 4907 2019-09-20  53252658

#Top 10 highest trading volumes
volume_spikes %>%
  arrange(desc(Volume)) %>%
  head(10)

##          Date Symbol Series Prev.Close   Open   High    Low   Last  Close
## 1  2020-08-20   ZEEL     EQ     196.80 195.00 207.00 192.50 199.05 199.45
## 2  2020-08-19   ZEEL     EQ     173.95 172.50 200.00 169.25 198.40 196.80
## 3  2019-01-28   ZEEL     EQ     318.40 350.20 382.00 320.00 368.50 373.30
## 4  2019-11-21   ZEEL     EQ     307.00 337.70 364.00 320.20 343.00 345.15
## 5  2021-02-05   ZEEL     EQ     249.45 251.50 251.50 212.05 216.70 215.30
## 6  2020-08-21   ZEEL     EQ     199.45 199.40 204.00 189.15 192.05 192.05
## 7  2020-09-24   ZEEL     EQ     191.80 185.00 195.80 182.85 193.50 190.35
## 8  2020-08-24   ZEEL     EQ     192.05 195.00 203.85 193.45 201.20 201.60
## 9  2019-01-25   ZEEL     EQ     433.85 437.75 439.35 288.30 299.70 318.40
## 10 2020-05-28   ZEEL     EQ     164.45 164.40 189.05 163.30 180.20 180.70
##      VWAP    Volume     Turnover  Trades Deliverable.Volume X.Deliverble
## 1  200.60 165959680 3.329180e+15  732032           19283601       0.1162
## 2  186.51 144863037 2.701825e+15  626425           31883619       0.2201
## 3  353.29 121305516 4.285622e+15 1088460           37531534       0.3094
## 4  337.95 100218862 3.386853e+15  699898           23719948       0.2367
## 5  222.43  83931232 1.866843e+15  465910           18462973       0.2200
## 6  194.66  82296210 1.602012e+15  382492           12360894       0.1502
## 7  190.14  76316695 1.451102e+15  279397           24361908       0.3192
## 8  200.02  74694058 1.494055e+15  369116            8821541       0.1181
## 9  353.90  74261004 2.628104e+15  757676           18117711       0.2440
## 10 178.30  71029787 1.266447e+15  383014            7807378       0.1099

##Trading Volume with spike annotations

ggplot(zee, aes(x = Date, y = Volume))+
  #trading volume line
  geom_line(color = "darkgreen") +
  #Annotate unusual spikes
  geom_vline(xintercept = as.Date("2014-11-25"),color = "red", linetype = "dashed")+
  geom_vline(xintercept = as.Date("2019-01-28"),color = "blue", linetype = "dashed")+
  geom_vline(xintercept = as.Date("2019-09-20"),color = "purple", linetype = "dashed")+
  
  # Labels
  annotate("text",x = as.Date("2014-11-25"),y = max(zee$Volume),label = "volume spike 2014", angle = 90, color = "red",size = 3)+
  
  annotate("text",x = as.Date("2019-01-28"),y = max(zee$Volume)*0.8,label = "High trading activity", angle = 90, color = "blue",size = 3)+
  
  annotate("text",x = as.Date("2019-09-20"),y = max(zee$Volume)*0.6,label = "Market Volatility", angle = 90, color = "purple",size = 3)+
  
  #Labels
  labs(title= "Trading volume trend with unusual spikes", x = "Date",y ="Trading Volume")

#4.3 Visualization

##CLOSING PRICE TREND

#plot closing price trend

ggplot(zee, aes(x = Date, y = Close)) +

  # Draw line chart

  geom_line(color = ("darkblue")) +

  # adding labels

  labs(title = "Closing Price Trend of ZEEL Stock",
       x= "Date",
       y = "Closing Price"
  )

The closing price trend shows that ZEEL stock prices fluctuated considerably between 2000 and 2021, reflecting changes in market conditions and investor sentiment.

2.2 Percentage Change Calculation

# Calculate percentage change

zee$Percent.Change <- ((zee$Close - zee$Prev.Close)/ zee$Prev.Close)*100

#Display first rows
head(zee[, c(
        "Date",
        "Close",
        "Prev.Close",
        "Percent.Change"
)])

##         Date   Close Prev.Close Percent.Change
## 1 2000-01-03 1179.95    1092.55       7.999634
## 2 2000-01-04 1260.65    1179.95       6.839273
## 3 2000-01-05 1176.55    1260.65      -6.671162
## 4 2000-01-06 1115.45    1176.55      -5.193149
## 5 2000-01-07 1026.25    1115.45      -7.996773
## 6 2000-01-10  966.70    1026.25      -5.802680

2.3 Percentage Change Trend

ggplot(zee,aes(x = Date, y = Percent.Change))+
  
  #Draw line chart
  geom_line(color = "darkred")+
  
  #Add horizontal reference line
  
  geom_hline(yintercept = 0,linetype = "dashed")+
  
  #Add labels
  
  labs(title = "Daily Percentage Change in Closing Price",
      x = "Date",
      y = "Percentage change(%)")+
    theme_minimal()

The percentage change trend indicates periods of high volatility with both positive and negative returns. Large fluctuations were observed during periods of market uncertainty.

##Sales Volume Trend

ggplot(zee, aes(x = Date, y = Volume))+
  #sales volume line
  geom_line(color = "darkgreen") +
  
  #Labels
  labs(title = "Sales Volume Trend", x = "Date",y ="Sales Volume")

##CALCULATE MOVING AVERAGES

#Calculate 15-day moving average

zee$MA15 <- rollmean(zee$Close,15, fill = NA, align = "right")

#Calculate 30-day moving average

zee$MA30 <- rollmean(zee$Close, 30, fill = NA, align = "right")

#Calculate 45-day moving average

zee$MA45 <- rollmean(zee$Close, 45, fill = NA, align = "right")

#Display first rows

head(zee)

##         Date  Symbol Series Prev.Close    Open    High     Low    Last   Close
## 1 2000-01-03 ZEETELE     EQ    1092.55 1175.00 1179.95 1160.00 1179.95 1179.95
## 2 2000-01-04 ZEETELE     EQ    1179.95 1220.00 1274.35 1183.10 1274.35 1260.65
## 3 2000-01-05 ZEETELE     EQ    1260.65 1160.55 1317.70 1159.80 1190.95 1176.55
## 4 2000-01-06 ZEETELE     EQ    1176.55 1195.00 1200.00 1095.00 1106.00 1115.45
## 5 2000-01-07 ZEETELE     EQ    1115.45 1097.10 1097.10 1026.25 1026.25 1026.25
## 6 2000-01-10 ZEETELE     EQ    1026.25 1026.25 1026.25  944.30  962.00  966.70
##      VWAP  Volume     Turnover Trades Deliverable.Volume X.Deliverble
## 1 1177.03 1261391 1.484690e+14     NA                 NA           NA
## 2 1228.02 4616547 5.669220e+14     NA                 NA           NA
## 3 1238.35 8763127 1.085178e+15     NA                 NA           NA
## 4 1135.04 5164020 5.861353e+14     NA                 NA           NA
## 5 1029.94  755129 7.777374e+13     NA                 NA           NA
## 6  980.49 3942813 3.865885e+14     NA                 NA           NA
##   Percent.Change MA15 MA30 MA45
## 1       7.999634   NA   NA   NA
## 2       6.839273   NA   NA   NA
## 3      -6.671162   NA   NA   NA
## 4      -5.193149   NA   NA   NA
## 5      -7.996773   NA   NA   NA
## 6      -5.802680   NA   NA   NA

##MOVING AVERAGE VISUALIZATION

ggplot(zee, aes(x = Date))+
  #Closing price line
  geom_line(aes(y = Close, color = "Close Price"))+
  
  # 15-day moving average
  
  geom_line(aes(y = MA15, color = "MA15"))+
  
  # 30-day moving average
  
  geom_line(aes(y = MA30, color = "MA30"))+
  
  # 45-day moving average
  
  geom_line(aes(y = MA45, color = "MA45"))+
  
  #labels
  
  labs(title = "Closing Price with Moving Averages",
       x = "Date",
       y = "Price",
       color = "Legend")

##HISTOGRAM OF PERCENTAGE CHANGE

#creating histogram 

ggplot(zee, aes(x= Percent.Change))+

  #drawing histogram
  geom_histogram(bins=  48,
                 fill = "darkblue",
                 color = "white")+
  #Adding labels
  labs(title = "Distribution  of Percentage Change",
       x = "Percentage Change",
       y= " Frequency")

The histogram shows that most percentage changes are concentrated around zero, indicating that daily stock price movements are usually small. However, some extreme values are present, showing periods of high volatility.

3 4.4 Correlation Analysis

##Select numerical Variables for correlation

correlation_data <-zee[, c("Prev.Close","Open","High","Low", "Volume","Close")]

#Display first rows

head(correlation_data)

##   Prev.Close    Open    High     Low  Volume   Close
## 1    1092.55 1175.00 1179.95 1160.00 1261391 1179.95
## 2    1179.95 1220.00 1274.35 1183.10 4616547 1260.65
## 3    1260.65 1160.55 1317.70 1159.80 8763127 1176.55
## 4    1176.55 1195.00 1200.00 1095.00 5164020 1115.45
## 5    1115.45 1097.10 1097.10 1026.25  755129 1026.25
## 6    1026.25 1026.25 1026.25  944.30 3942813  966.70

##Correlation Matrix

#Calculate correlation matrix
 
correlation_matrix <- cor(
  correlation_data, use = "complete.obs"
)
#Display correlation matrix

correlation_matrix

##             Prev.Close        Open        High         Low      Volume
## Prev.Close  1.00000000  0.99942268  0.99885060  0.99838428 -0.04934908
## Open        0.99942268  1.00000000  0.99913584  0.99869374 -0.04832402
## High        0.99885060  0.99913584  1.00000000  0.99814370 -0.03974465
## Low         0.99838428  0.99869374  0.99814370  1.00000000 -0.05694402
## Volume     -0.04934908 -0.04832402 -0.03974465 -0.05694402  1.00000000
## Close       0.99790147  0.99811311  0.99880555  0.99911821 -0.04744036
##                  Close
## Prev.Close  0.99790147
## Open        0.99811311
## High        0.99880555
## Low         0.99911821
## Volume     -0.04744036
## Close       1.00000000

##Correlation Visualization

#Plot correlation matrix
corrplot(correlation_matrix,
         method = "color",
         type = "upper",
         tl.col = "black",
         tl.sr = 45)

The correlation analysis shows that Prev.Close, Open, High, Low, and Close have very strong positive correlations. Most correlations are close to 1, indicating that these stock price variables move together.

Volume has weaker correlations with price related variables. The high correlations among predictors may cause multicollinearity in regression analysis, making coefficient estimates unstable and reducing interpretability.

4 4.5 Regression Analysis

##PREPARE DATA FOR REGRESSION

#Selecting variables for regression
regression_data <- zee[, c("Close", 
                           "Prev.Close",
                           "Open", 
                           "High", 
                           "Low", 
                           "Volume")]
# removing missing values
regression_data <- na.omit(regression_data)

#displaying first rows 
head(regression_data)

##     Close Prev.Close    Open    High     Low  Volume
## 1 1179.95    1092.55 1175.00 1179.95 1160.00 1261391
## 2 1260.65    1179.95 1220.00 1274.35 1183.10 4616547
## 3 1176.55    1260.65 1160.55 1317.70 1159.80 8763127
## 4 1115.45    1176.55 1195.00 1200.00 1095.00 5164020
## 5 1026.25    1115.45 1097.10 1097.10 1026.25  755129
## 6  966.70    1026.25 1026.25 1026.25  944.30 3942813

##TRAIN TEST SPLIT

# Set seed
set.seed((123))

#creating row indexes for training data
train_index <- sample(
  1:nrow(regression_data),
  0.8 * nrow(regression_data)
)

# Creating training dataset

train_data <- regression_data[train_index, ]

# Creating test dataset

test_data <- regression_data[-train_index, ]

# Print dataset sizes

cat("Training data size:", nrow(train_data), "\n")

## Training data size: 4244

cat("Test data size:", nrow(test_data), "\n")

## Test data size: 1062

##TRAIN LINEAR REGRESSION MODEL

#Train regression model
model <-lm(
  Close ~ Prev.Close + Open + High + Low + Volume,
  data = train_data
)

# Display model summary

summary(model)

## 
## Call:
## lm(formula = Close ~ Prev.Close + Open + High + Low + Volume, 
##     data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -91.224  -1.547  -0.319   1.264  66.371 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.095e-01  1.483e-01   4.110 4.03e-05 ***
## Prev.Close  -8.240e-02  1.225e-02  -6.729 1.94e-11 ***
## Open        -4.747e-01  1.511e-02 -31.422  < 2e-16 ***
## High         7.474e-01  1.065e-02  70.184  < 2e-16 ***
## Low          8.079e-01  8.767e-03  92.151  < 2e-16 ***
## Volume       1.639e-08  9.238e-09   1.774   0.0762 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.846 on 4238 degrees of freedom
## Multiple R-squared:  0.9992, Adjusted R-squared:  0.9992 
## F-statistic: 1.112e+06 on 5 and 4238 DF,  p-value: < 2.2e-16

4.1 PREDICT TEST DATA

# Predict closing prices

predictions <- predict(
  model,
  newdata = test_data
)

# Display first predictions

head(predictions)

##         4         6         8        15        23        24 
## 1117.9757  958.8398 1074.6155 1162.2597 1326.6539 1365.6769

##MODEL PERFORMANCE

# Calculate RMSE

rmse_value <- rmse(
  test_data$Close,
  predictions
)

# Calculate MAE

mae_value <- mae(
  test_data$Close,
  predictions
)

# Calculate R squared

r2_value <- cor(
  test_data$Close,
  predictions
)^2

# Print metrics

cat("RMSE:", rmse_value, "\n")

## RMSE: 4.847351

cat("MAE:", mae_value, "\n")

## MAE: 2.403604

cat("Rsquared:", r2_value, "\n")

## Rsquared: 0.9992598

The regression model achieved a high R² value, indicating strong predictive performance. Low RMSE and MAE values suggest that prediction errors were relatively small.

##Actual vs Predicted Scatter Plot

#Creating dataframe for plotting

results <- data.frame(
  Actual = test_data$Close,
  Predicted = predictions
)

# Plot actual vs predicted values

ggplot(results, aes(x = Actual, y = Predicted)) +

  # Scatter points

  geom_point(color = "blue", alpha = 0.5) +

  # 45-degree reference line

  geom_abline(
    slope = 1,
    intercept = 0,
    color = "red",
    linetype = "dashed"
  ) +

  # Add labels

  labs(
    title = "Actual vs Predicted Closing Prices",
    x = "Actual Closing Price",
    y = "Predicted Closing Price"
  )+
  theme_minimal()

The scatter plot shows that most predicted values lie close to the 45 degree reference line, indicating good model performance. The spread around the line represents prediction errors. Smaller spread indicates more accurate predictions, while points farther from the line indicate larger prediction errors.

##Standardize independent variables

# Selecting independent variables

x_variables <- regression_data[, c(
  "Prev.Close",
  "Open",
  "High",
  "Low",
  "Volume"
)]

# Standardize variables

x_scaled <- scale(x_variables)

# Display first rows

head(x_scaled)

##      Prev.Close     Open     High      Low      Volume
## [1,]   4.643600 5.079458 4.967118 5.177397 -0.42418743
## [2,]   5.139056 5.333141 5.487904 5.311414 -0.02486008
## [3,]   5.596531 4.997997 5.727058 5.176237  0.46866176
## [4,]   5.119782 5.192206 5.077730 4.800292  0.04029961
## [5,]   4.773416 4.640303 4.510050 4.401431 -0.48444223
## [6,]   4.267756 4.240891 4.119185 3.925988 -0.10504723

##VERIFY SCALING

# Calculate means

scaled_means <- colMeans(x_scaled)

# Calculate standard deviations

scaled_sd <- apply(
  x_scaled,
  2,
  sd
)

# Print means

cat("Means after scaling:\n")

## Means after scaling:

print(scaled_means)

##    Prev.Close          Open          High           Low        Volume 
##  1.260117e-16 -1.713930e-17  3.974498e-17 -4.879196e-17  1.369993e-17

# Print standard deviations

cat("\nStandard deviations after scaling:\n")

## 
## Standard deviations after scaling:

print(scaled_sd)

## Prev.Close       Open       High        Low     Volume 
##          1          1          1          1          1

The standardized variables produced means very close to zero due to floating point precision and standard deviations close to one, confirming successful scaling of the independent variables.

4.2 Apply PCA (Principal Component Analysis)

pca_model <- prcomp(
  x_scaled,
  center = TRUE,
  scale. = TRUE
)

#PCA summary

summary(pca_model)

## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5
## Standard deviation     1.9999 0.9985 0.04346 0.03368 0.02263
## Proportion of Variance 0.7999 0.1994 0.00038 0.00023 0.00010
## Cumulative Proportion  0.7999 0.9993 0.99967 0.99990 1.00000

4.3 Extract First two Principal Components

pca_data <- data.frame(
  PC1 = pca_model$x[,1],
  PC2 = pca_model$x[,2],
  Close = regression_data$Close
)

# Display first rows

head(pca_data)

##          PC1        PC2   Close
## 1  -9.942295  0.1037662 1179.95
## 2 -10.630962 -0.3212474 1260.65
## 3 -10.728045 -0.8209573 1176.55
## 4 -10.088440 -0.3694890 1115.45
## 5  -9.173517  0.1867117 1026.25
## 6  -8.276006 -0.1645722  966.70

4.4 SPLIT PCA DATA

# Training data

pca_train <- pca_data[train_index, ]

# Test data

pca_test <- pca_data[-train_index, ]

# Display sizes

cat("PCA Training Size:", nrow(pca_train), "\n")

## PCA Training Size: 4244

cat("PCA Test Size:", nrow(pca_test), "\n")

## PCA Test Size: 1062

4.5 PCA REGRESSION MODEL

# Train regression model using PCA components

pca_regression_model <- lm(
  Close ~ PC1 + PC2,
  data = pca_train
)

# Display model summary

summary(pca_regression_model)

## 
## Call:
## lm(formula = Close ~ PC1 + PC2, data = pca_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -81.479  -2.469  -0.229   2.471  79.596 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 273.22368    0.12195  2240.42   <2e-16 ***
## PC1         -87.91147    0.06115 -1437.58   <2e-16 ***
## PC2          -2.91021    0.12213   -23.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.945 on 4241 degrees of freedom
## Multiple R-squared:  0.998,  Adjusted R-squared:  0.998 
## F-statistic: 1.034e+06 on 2 and 4241 DF,  p-value: < 2.2e-16

4.6 PCA PREDICTIONS

# Predict test values

pca_predictions <- predict(
  pca_regression_model,
  newdata = pca_test
)

# Display first predictions

head(pca_predictions)

##        4        6        8       15       23       24 
## 1161.189 1001.258 1039.965 1149.208 1272.853 1366.522

4.7 PCA MODEL R-SQUARED

# Calculating PCA R squared

pca_r2 <- cor(
  pca_test$Close,
  pca_predictions
)^2

# Printing PCA R2

cat("PCA Model R squared:", pca_r2)

## PCA Model R squared: 0.9976517

The raw regression model performed slightly better than the PCA model because it retained all original predictor variables. Although PCA reduced multicollinearity, reducing the data to two principal components resulted in slight information loss, leading to a lower R² value compared to the raw model.

5 4.6 Sales Volume Comparisons

##LOAD VEDANTA DATASET

# Extract Vedanta dataset

vedl <- stock_market_data$VEDL

# Convert Date column

vedl$Date <- as.Date(vedl$Date)

# Display first rows

head(vedl)

##         Date  Symbol Series Prev.Close   Open   High    Low   Last  Close
## 1 2000-01-03 SESAGOA     EQ     107.70 111.00 116.35 108.00 116.35 116.35
## 2 2000-01-04 SESAGOA     EQ     116.35 113.75 116.00 108.00 116.00 114.70
## 3 2000-01-05 SESAGOA     EQ     114.70 107.55 115.85 107.55 114.50 114.00
## 4 2000-01-06 SESAGOA     EQ     114.00 112.00 123.10 112.00 118.80 119.30
## 5 2000-01-07 SESAGOA     EQ     119.30 119.85 120.00 114.05 116.50 116.50
## 6 2000-01-10 SESAGOA     EQ     116.50 120.50 120.85 116.00 119.00 119.00
##     VWAP Volume     Turnover Trades Deliverable.Volume X.Deliverble
## 1 114.80  20371 233859660000     NA                 NA           NA
## 2 113.34  22366 253499440000     NA                 NA           NA
## 3 112.78  18305 206436075000     NA                 NA           NA
## 4 119.89  25800 309313325000     NA                 NA           NA
## 5 116.84  17361 202840260000     NA                 NA           NA
## 6 118.69  20707 245767815000     NA                 NA           NA

# Extract year for ZEEL

zee$Year <- format(
  zee$Date,
  "%Y"
)

# Extract year for VEDL

vedl$Year <- format(
  vedl$Date,
  "%Y"
)

5.1 ANNUAL SALES VOLUME

zee_volume <- aggregate(
  Volume ~ Year,
  data = zee,
  sum
)

# Adding company name

zee_volume$Company <- "ZEEL"

# Annual volume for VEDL

vedl_volume <- aggregate(
  Volume ~ Year,
  data = vedl,
  sum
)

# Adding company name

vedl_volume$Company <- "VEDL"

# Combine datasets

annual_volume <- rbind(
  zee_volume,
  vedl_volume
)

# Display first rows

head(annual_volume)

##   Year     Volume Company
## 1 2000 2124967599    ZEEL
## 2 2001 2136706114    ZEEL
## 3 2002 1402554565    ZEEL
## 4 2003 1000731804    ZEEL
## 5 2004  569852081    ZEEL
## 6 2005  459053470    ZEEL

# Plot annual sales volume

ggplot(
  annual_volume,
  aes(
    x = Year,
    y = Volume,
    fill = Company
  )
) +

  # Creating clustered bars

  geom_bar(
    stat = "identity",
    position = "dodge"
  ) +

  # Adding labels

  labs(
    title = "Annual Sales Volume Comparison",
    x = "Year",
    y = "Total Volume"
  ) +

  # Rotate year labels

  theme(
    axis.text.x = element_text(angle = 90)
  )

## HIGHEST VOLUME IN 2018

# Filtering 2018 data

volume_2018 <- annual_volume[
  annual_volume$Year == "2018",
]

# Displaying results

volume_2018

##    Year     Volume Company
## 19 2018  598775408    ZEEL
## 41 2018 3276296751    VEDL

Vedanta Ltd (VEDL) recorded the highest sales volume during 2018. ##LEAST VOLUME IN 2021

# Filter 2021 data

volume_2021 <- annual_volume[
  annual_volume$Year == "2021",
]

# Displaying results

volume_2021

##    Year     Volume Company
## 22 2021 1302490317    ZEEL
## 44 2021 1430971315    VEDL

Zee Entertainment Enterprises Ltd (ZEEL) recorded the lowest sales volume during 2021.

##RATIO OF ZEEL TO VEDL IN 2021

# Extracting ZEEL 2021 volume

zee_2021 <- volume_2021$Volume[
  volume_2021$Company == "ZEEL"
]

# Extracting VEDL 2021 volume

vedl_2021 <- volume_2021$Volume[
  volume_2021$Company == "VEDL"
]

# Calculating ratio

ratio <- zee_2021 / vedl_2021

# Printing ratio

cat(
  "Ratio of ZEEL to VEDL sales volume in 2021:",
  ratio
)

## Ratio of ZEEL to VEDL sales volume in 2021: 0.9102141

The ratio of ZEEL sales volume to VEDL sales volume during 2021 was approximately 0.91.

Midterm_Stock Market Analysis and Prediction by Using a Regression Model

Justine Mudahogora

2026-05-25