Install packages

install.packages(c("dplyr", "ggplot2", "readr", "lubridate", "corrplot", "caret", "scales", "zoo"))
install.packages("packrat")
install.packages("rsconnect")

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(corrplot)
## corrplot 0.95 loaded
library(caret)
## Loading required package: lattice
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

3. Load the Dataset

# Folder that holds the stock CSV files. Defined once so the location only
# has to be changed in a single place.
data_dir <- "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/MidTermExam/archive"

# Set working directory
# NOTE(review): setwd() is kept because code outside this section may rely on
# the working directory; the reads below do not need it.
setwd(data_dir)

# Load datasets
zee_data <- read.csv(file.path(data_dir, "ZEEL.csv"))

# View zee_data
head(zee_data)
##         Date  Symbol Series Prev.Close    Open    High     Low    Last   Close
## 1 2000-01-03 ZEETELE     EQ    1092.55 1175.00 1179.95 1160.00 1179.95 1179.95
## 2 2000-01-04 ZEETELE     EQ    1179.95 1220.00 1274.35 1183.10 1274.35 1260.65
## 3 2000-01-05 ZEETELE     EQ    1260.65 1160.55 1317.70 1159.80 1190.95 1176.55
## 4 2000-01-06 ZEETELE     EQ    1176.55 1195.00 1200.00 1095.00 1106.00 1115.45
## 5 2000-01-07 ZEETELE     EQ    1115.45 1097.10 1097.10 1026.25 1026.25 1026.25
## 6 2000-01-10 ZEETELE     EQ    1026.25 1026.25 1026.25  944.30  962.00  966.70
##      VWAP  Volume     Turnover Trades Deliverable.Volume X.Deliverble
## 1 1177.03 1261391 1.484690e+14     NA                 NA           NA
## 2 1228.02 4616547 5.669220e+14     NA                 NA           NA
## 3 1238.35 8763127 1.085178e+15     NA                 NA           NA
## 4 1135.04 5164020 5.861353e+14     NA                 NA           NA
## 5 1029.94  755129 7.777374e+13     NA                 NA           NA
## 6  980.49 3942813 3.865885e+14     NA                 NA           NA
# Load the VEDL.csv price history into `vedanta_data` and preview its first
# rows (the preview below shows the Symbol column as SESAGOA for these rows).
vedanta_data <- read.csv("C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/MidTermExam/archive/VEDL.csv")

head(vedanta_data)
##         Date  Symbol Series Prev.Close   Open   High    Low   Last  Close
## 1 2000-01-03 SESAGOA     EQ     107.70 111.00 116.35 108.00 116.35 116.35
## 2 2000-01-04 SESAGOA     EQ     116.35 113.75 116.00 108.00 116.00 114.70
## 3 2000-01-05 SESAGOA     EQ     114.70 107.55 115.85 107.55 114.50 114.00
## 4 2000-01-06 SESAGOA     EQ     114.00 112.00 123.10 112.00 118.80 119.30
## 5 2000-01-07 SESAGOA     EQ     119.30 119.85 120.00 114.05 116.50 116.50
## 6 2000-01-10 SESAGOA     EQ     116.50 120.50 120.85 116.00 119.00 119.00
##     VWAP Volume     Turnover Trades Deliverable.Volume X.Deliverble
## 1 114.80  20371 233859660000     NA                 NA           NA
## 2 113.34  22366 253499440000     NA                 NA           NA
## 3 112.78  18305 206436075000     NA                 NA           NA
## 4 119.89  25800 309313325000     NA                 NA           NA
## 5 116.84  17361 202840260000     NA                 NA           NA
## 6 118.69  20707 245767815000     NA                 NA           NA
# Load the ADANIPORTS.csv price history into `ADANI_PORTS` and preview its
# first rows (the preview below starts at 2007-11-27 with Symbol MUNDRAPORT).
ADANI_PORTS <- read.csv("C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/MidTermExam/archive/ADANIPORTS.csv")
head(ADANI_PORTS)
##         Date     Symbol Series Prev.Close   Open    High Low Last   Close
## 1 2007-11-27 MUNDRAPORT     EQ     440.00 770.00 1050.00 770  959  962.90
## 2 2007-11-28 MUNDRAPORT     EQ     962.90 984.00  990.00 874  885  893.90
## 3 2007-11-29 MUNDRAPORT     EQ     893.90 909.00  914.75 841  887  884.20
## 4 2007-11-30 MUNDRAPORT     EQ     884.20 890.00  958.00 890  929  921.55
## 5 2007-12-03 MUNDRAPORT     EQ     921.55 939.75  995.00 922  980  969.30
## 6 2007-12-04 MUNDRAPORT     EQ     969.30 985.00 1056.00 976 1049 1041.45
##      VWAP   Volume     Turnover Trades Deliverable.Volume X.Deliverble
## 1  984.72 27294366 2.687719e+15     NA            9859619       0.3612
## 2  941.38  4581338 4.312765e+14     NA            1453278       0.3172
## 3  888.09  5124121 4.550658e+14     NA            1069678       0.2088
## 4  929.17  4609762 4.283257e+14     NA            1260913       0.2735
## 5  965.65  2977470 2.875200e+14     NA             816123       0.2741
## 6 1015.39  4849250 4.923867e+14     NA            1537667       0.3171
# Load the ASIANPAINT.csv price history into `ASIAN_PAINT` and preview its
# first rows.
ASIAN_PAINT <- read.csv("C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/MidTermExam/archive/ASIANPAINT.csv")
head(ASIAN_PAINT)
##         Date     Symbol Series Prev.Close  Open  High   Low  Last  Close   VWAP
## 1 2000-01-03 ASIANPAINT     EQ     361.20 370.0 390.0 370.0 385.0 381.65 380.54
## 2 2000-01-04 ASIANPAINT     EQ     381.65 380.0 392.0 375.0 390.0 385.55 383.50
## 3 2000-01-05 ASIANPAINT     EQ     385.55 371.5 390.0 371.5 383.0 383.00 379.81
## 4 2000-01-06 ASIANPAINT     EQ     383.00 384.9 384.9 374.5 375.1 377.50 379.88
## 5 2000-01-07 ASIANPAINT     EQ     377.50 376.0 390.0 370.0 389.0 385.70 383.38
## 6 2000-01-10 ASIANPAINT     EQ     385.70 415.0 416.6 409.0 416.6 415.00 414.97
##   Volume     Turnover Trades Deliverable.Volume X.Deliverble
## 1   3318 1.262617e+11     NA                 NA           NA
## 2   4818 1.847699e+11     NA                 NA           NA
## 3   2628 9.981384e+10     NA                 NA           NA
## 4   3354 1.274114e+11     NA                 NA           NA
## 5   9589 3.676275e+11     NA                 NA           NA
## 6  60313 2.502823e+12     NA                 NA           NA

# Exploratory Data Analysis (EDA)
# Create Profile Function

#' Print an exploratory profile of a stock price data frame
#'
#' Prints, in order: the range of the `Date` column, the number of missing
#' values per column, the structure of the data, the first five rows, summary
#' statistics, and the number of IQR outliers in every numeric column.
#'
#' @param data A data frame with a `Date` column that `as.Date()` can parse.
#' @return Invisibly, a named integer vector holding, for each numeric column,
#'   the number of values below `Q1 - 1.5 * IQR` or above `Q3 + 1.5 * IQR`.
profile_stock <- function(data) {
  # Trading date range
  cat("Trading Date Range:\n")
  print(range(as.Date(data$Date)))

  # Missing values
  cat("\nMissing Values:\n")
  print(colSums(is.na(data)))

  # Data types
  cat("\nData Types:\n")
  # str() prints on its own and returns NULL; wrapping it in print() would
  # add a stray "NULL" line to the report.
  str(data)

  # First 5 rows
  cat("\nTop 5 Rows:\n")
  print(head(data, 5))

  # Summary statistics
  cat("\nSummary Statistics:\n")
  print(summary(data))

  # Outliers using IQR
  numeric_cols <- vapply(data, is.numeric, logical(1))

  count_outliers <- function(x) {
    q1 <- quantile(x, 0.25, na.rm = TRUE)
    q3 <- quantile(x, 0.75, na.rm = TRUE)
    iqr_value <- q3 - q1
    sum(x < (q1 - 1.5 * iqr_value) |
          x > (q3 + 1.5 * iqr_value), na.rm = TRUE)
  }

  # drop = FALSE keeps a data frame even when only one column is numeric;
  # otherwise the single column collapses to a vector and the function would
  # be applied to each value instead of to the column.
  outlier_count <- vapply(
    data[, numeric_cols, drop = FALSE],
    count_outliers,
    integer(1)
  )

  cat("\nNumber of Outliers:\n")
  print(outlier_count)
  invisible(outlier_count)
}
# Run the profile on the ZEEL data; the printed report follows.
profile_stock(zee_data)
## Trading Date Range:
## [1] "2000-01-03" "2021-04-30"
## 
## Missing Values:
##               Date             Symbol             Series         Prev.Close 
##                  0                  0                  0                  0 
##               Open               High                Low               Last 
##                  0                  0                  0                  0 
##              Close               VWAP             Volume           Turnover 
##                  0                  0                  0                  0 
##             Trades Deliverable.Volume       X.Deliverble 
##               2850                519                519 
## 
## Data Types:
## 'data.frame':    5306 obs. of  15 variables:
##  $ Date              : chr  "2000-01-03" "2000-01-04" "2000-01-05" "2000-01-06" ...
##  $ Symbol            : chr  "ZEETELE" "ZEETELE" "ZEETELE" "ZEETELE" ...
##  $ Series            : chr  "EQ" "EQ" "EQ" "EQ" ...
##  $ Prev.Close        : num  1093 1180 1261 1177 1115 ...
##  $ Open              : num  1175 1220 1161 1195 1097 ...
##  $ High              : num  1180 1274 1318 1200 1097 ...
##  $ Low               : num  1160 1183 1160 1095 1026 ...
##  $ Last              : num  1180 1274 1191 1106 1026 ...
##  $ Close             : num  1180 1261 1177 1115 1026 ...
##  $ VWAP              : num  1177 1228 1238 1135 1030 ...
##  $ Volume            : int  1261391 4616547 8763127 5164020 755129 3942813 6802005 2968833 2251046 2949092 ...
##  $ Turnover          : num  1.48e+14 5.67e+14 1.09e+15 5.86e+14 7.78e+13 ...
##  $ Trades            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Deliverable.Volume: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ X.Deliverble      : num  NA NA NA NA NA NA NA NA NA NA ...
## NULL
## 
## Top 5 Rows:
##         Date  Symbol Series Prev.Close    Open    High     Low    Last   Close
## 1 2000-01-03 ZEETELE     EQ    1092.55 1175.00 1179.95 1160.00 1179.95 1179.95
## 2 2000-01-04 ZEETELE     EQ    1179.95 1220.00 1274.35 1183.10 1274.35 1260.65
## 3 2000-01-05 ZEETELE     EQ    1260.65 1160.55 1317.70 1159.80 1190.95 1176.55
## 4 2000-01-06 ZEETELE     EQ    1176.55 1195.00 1200.00 1095.00 1106.00 1115.45
## 5 2000-01-07 ZEETELE     EQ    1115.45 1097.10 1097.10 1026.25 1026.25 1026.25
##      VWAP  Volume     Turnover Trades Deliverable.Volume X.Deliverble
## 1 1177.03 1261391 1.484690e+14     NA                 NA           NA
## 2 1228.02 4616547 5.669220e+14     NA                 NA           NA
## 3 1238.35 8763127 1.085178e+15     NA                 NA           NA
## 4 1135.04 5164020 5.861353e+14     NA                 NA           NA
## 5 1029.94  755129 7.777374e+13     NA                 NA           NA
## 
## Summary Statistics:
##         Date            Symbol           Series       Prev.Close    
##  Length   :5306   Length   :5306   Length   :5306   Min.   :  62.3  
##  N.unique :5306   N.unique :   2   N.unique :   1   1st Qu.: 143.2  
##  N.blank  :   0   N.blank  :   0   N.blank  :   0   Median : 238.2  
##  Min.nchar:  10   Min.nchar:   4   Min.nchar:   2   Mean   : 273.4  
##  Max.nchar:  10   Max.nchar:   7   Max.nchar:   2   3rd Qu.: 345.6  
##                                                     Max.   :1541.7  
##                                                                     
##       Open           High             Low              Last       
##  Min.   :  62   Min.   :  66.3   Min.   :  60.1   Min.   :  62.7  
##  1st Qu.: 144   1st Qu.: 146.9   1st Qu.: 140.0   1st Qu.: 143.5  
##  Median : 238   Median : 244.0   Median : 231.4   Median : 237.7  
##  Mean   : 274   Mean   : 279.6   Mean   : 267.6   Mean   : 273.2  
##  3rd Qu.: 346   3rd Qu.: 352.8   3rd Qu.: 338.4   3rd Qu.: 345.1  
##  Max.   :1640   Max.   :1645.0   Max.   :1512.2   Max.   :1564.0  
##                                                                   
##      Close             VWAP             Volume             Turnover        
##  Min.   :  62.3   Min.   :  63.08   Min.   :     4415   Min.   :7.021e+10  
##  1st Qu.: 143.2   1st Qu.: 143.68   1st Qu.:  1218226   1st Qu.:2.595e+13  
##  Median : 238.1   Median : 238.90   Median :  2138807   Median :5.250e+13  
##  Mean   : 273.2   Mean   : 273.63   Mean   :  4825422   Mean   :1.249e+14  
##  3rd Qu.: 345.6   3rd Qu.: 345.64   3rd Qu.:  4532904   3rd Qu.:1.137e+14  
##  Max.   :1541.7   Max.   :1578.11   Max.   :165959680   Max.   :4.286e+15  
##                                                                            
##      Trades        Deliverable.Volume  X.Deliverble   
##  Min.   :    296   Min.   :    4415   Min.   :0.0557  
##  1st Qu.:  24579   1st Qu.:  513686   1st Qu.:0.3073  
##  Median :  41074   Median :  893532   Median :0.4635  
##  Mean   :  62646   Mean   : 1415718   Mean   :0.4522  
##  3rd Qu.:  71463   3rd Qu.: 1593444   3rd Qu.:0.5939  
##  Max.   :1088460   Max.   :42891428   Max.   :1.0000  
##  NAs    :2850      NAs    :519        NAs    :519     
## 
## Number of Outliers:
##         Prev.Close               Open               High                Low 
##                 85                 83                 87                 81 
##               Last              Close               VWAP             Volume 
##                 83                 84                 87                672 
##           Turnover             Trades Deliverable.Volume       X.Deliverble 
##                744                227                382                  0

#Histogram of Percentage Change

# Percent_Change is not among the columns listed for zee_data by str() above,
# so it is derived here before plotting instead of being referenced directly.
# NOTE(review): assumes "percentage change" means the day's Close relative to
# Prev.Close, in percent - confirm against the assignment brief.
zee_data %>%
  mutate(Percent_Change = (Close - Prev.Close) / Prev.Close * 100) %>%
  ggplot(aes(x = Percent_Change)) +
  geom_histogram(fill = "orange", bins = 30) +
  labs(title = "Distribution of Percentage Change",
       x = "Percentage Change",
       y = "Frequency") +
  theme_minimal()

#Correlation Analysis

# Keep only the price and volume columns that enter the correlation analysis
correlation_data <- select(
  zee_data,
  Prev.Close, Open, High, Low, Volume, Close
)

#Correlation Matrix

# Pairwise correlations, computed on rows with no missing value in any of the
# selected columns
cor_matrix <- cor(x = correlation_data, use = "complete.obs")
print(cor_matrix)
##             Prev.Close        Open        High         Low      Volume
## Prev.Close  1.00000000  0.99942268  0.99885060  0.99838428 -0.04934908
## Open        0.99942268  1.00000000  0.99913584  0.99869374 -0.04832402
## High        0.99885060  0.99913584  1.00000000  0.99814370 -0.03974465
## Low         0.99838428  0.99869374  0.99814370  1.00000000 -0.05694402
## Volume     -0.04934908 -0.04832402 -0.03974465 -0.05694402  1.00000000
## Close       0.99790147  0.99811311  0.99880555  0.99911821 -0.04744036
##                  Close
## Prev.Close  0.99790147
## Open        0.99811311
## High        0.99880555
## Low         0.99911821
## Volume     -0.04744036
## Close       1.00000000

#Correlation Plot

# Colour-coded upper triangle of the correlation matrix, with black axis
# labels rotated by 45 degrees
corrplot(
  cor_matrix,
  type = "upper",
  method = "color",
  tl.srt = 45,
  tl.col = "black"
)

#Highly Correlated Variables

# The original code here repeated the correlation plot and never identified
# the highly correlated variables. List every pair of distinct variables whose
# absolute correlation exceeds the cutoff instead.
high_cor_cutoff <- 0.9

# upper.tri() keeps each pair once and drops the diagonal (self-correlation).
high_cor_idx <- which(
  abs(cor_matrix) > high_cor_cutoff & upper.tri(cor_matrix),
  arr.ind = TRUE
)

high_cor_pairs <- data.frame(
  Variable_1 = rownames(cor_matrix)[high_cor_idx[, "row"]],
  Variable_2 = colnames(cor_matrix)[high_cor_idx[, "col"]],
  Correlation = cor_matrix[high_cor_idx]
)

# Strongest relationships first
high_cor_pairs <- high_cor_pairs[order(-abs(high_cor_pairs$Correlation)), ]
print(high_cor_pairs, row.names = FALSE)

#Regression Analysis

# Response (Close) plus the predictor columns, with incomplete rows removed
regression_data <- na.omit(
  select(zee_data, Close, Prev.Close, Open, High, Low, Volume)
)

#Split Data into Training and Testing Sets

# Fix the random seed so the partition below is reproducible
set.seed(123)

# Indices of the rows assigned to the training set (80% of the data);
# list = FALSE returns them as a matrix instead of a list
train_index <- createDataPartition(
  y = regression_data[["Close"]],
  p = 0.8,
  list = FALSE
)

# Rows picked by the partition train the model; the remaining rows test it
train_data <- regression_data[train_index, ]
test_data <- regression_data[-train_index, ]

cat("Training Set Size:", nrow(train_data), "\n")
## Training Set Size: 4246
cat("Testing Set Size:", nrow(test_data), "\n")
## Testing Set Size: 1060

#Train Linear Regression Model

# Fit a linear model of Close on Prev.Close, Open, High, Low and Volume using
# only the training rows.
# NOTE(review): High and Low look like same-day values, which would not be
# known before the close - confirm this model is meant to be descriptive
# rather than a forecast.
model <- lm(Close ~ Prev.Close + Open + High + Low + Volume,
            data = train_data)

# Print coefficients, residual summary and fit statistics
summary(model)
## 
## Call:
## lm(formula = Close ~ Prev.Close + Open + High + Low + Volume, 
##     data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -89.142  -1.524  -0.303   1.283  67.085 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.610e-01  1.524e-01   3.024  0.00251 ** 
## Prev.Close  -8.569e-02  1.306e-02  -6.560 6.01e-11 ***
## Open        -4.596e-01  1.608e-02 -28.577  < 2e-16 ***
## High         7.316e-01  1.059e-02  69.111  < 2e-16 ***
## Low          8.125e-01  8.928e-03  91.005  < 2e-16 ***
## Volume       3.001e-08  9.532e-09   3.148  0.00165 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.015 on 4240 degrees of freedom
## Multiple R-squared:  0.9992, Adjusted R-squared:  0.9992 
## F-statistic: 1.071e+06 on 5 and 4240 DF,  p-value: < 2.2e-16

#Predict on Test Data

# Fitted model applied to the held-out rows
predictions <- predict(model, newdata = test_data)

#Evaluate Model Performance
# Root mean squared error of the test predictions
rmse <- sqrt(mean((test_data[["Close"]] - predictions)^2))

# Mean absolute error of the test predictions
mae <- mean(abs(test_data[["Close"]] - predictions))

# Squared correlation between actual and predicted closing prices
r2 <- cor(test_data[["Close"]], predictions)^2

cat("RMSE:", rmse, "\n")
## RMSE: 4.107454
cat("MAE:", mae, "\n")
## MAE: 2.25627
cat("R-squared:", r2, "\n")
## R-squared: 0.9993964

#Actual vs Predicted Scatter Plot

# Each test observation as a solid blue point: actual close on the x axis,
# predicted close on the y axis
plot(
  x = test_data[["Close"]],
  y = predictions,
  pch = 16,
  col = "blue",
  xlab = "Actual Close",
  ylab = "Predicted Close",
  main = "Actual vs Predicted Closing Prices"
)

# Reference line y = x: points on it are predicted exactly
abline(a = 0, b = 1, lwd = 2, col = "red")