20251MBI004_Solange_Uwiringiyimana

library(scales)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

all_files <- list.files(path = "C:/Users/Soluc/OneDrive/AUCA/R/Mid_Sem_Exam/alldatasets", pattern = "*.csv", full.names = TRUE)

all_datasets <- lapply(all_files, read.csv)

names(all_datasets) <- tools::file_path_sans_ext(basename(all_files))

names(all_datasets)

##  [1] "ADANIPORTS" "ASIANPAINT" "AXISBANK"   "BAJAJ-AUTO" "BAJAJFINSV"
##  [6] "BAJFINANCE" "BHARTIARTL" "BPCL"       "BRITANNIA"  "CIPLA"     
## [11] "COALINDIA"  "DRREDDY"    "EICHERMOT"  "GAIL"       "GRASIM"    
## [16] "HCLTECH"    "HDFC"       "HDFCBANK"   "HEROMOTOCO" "HINDALCO"  
## [21] "HINDUNILVR" "ICICIBANK"  "INDUSINDBK" "INFRATEL"   "INFY"      
## [26] "IOC"        "ITC"        "JSWSTEEL"   "KOTAKBANK"  "LT"        
## [31] "MARUTI"     "MM"         "NESTLEIND"  "NTPC"       "ONGC"      
## [36] "POWERGRID"  "RELIANCE"   "SBIN"       "SHREECEM"   "SUNPHARMA" 
## [41] "TATAMOTORS" "TATASTEEL"  "TCS"        "TECHM"      "TITAN"     
## [46] "ULTRACEMCO" "UPL"        "VEDL"       "WIPRO"      "ZEEL"

profile<- function(dset){

  cat("Trading Date Range:\n")
  cat(min(dset$Date), "to", max(dset$Date), "\n\n")
  
  cat("Missing values\n")
  print(colSums(is.na(dset)))
  
  cat("\nNumber of Outliers:\n")
  outliers <- length(boxplot.stats(dset$Close)$out)
  cat("Close ->", outliers, "outliers\n")
  
  cat("\n Top 5 rows :\n")
  head(dset,5)
  
  cat("\n:data type\n")
  str(dset)
  #sapply(dset, class)
  
  cat("\nSummary statistic\n")
  
  
  dset%>%
    summary()
}

profile(all_datasets$ZEEL)

## Trading Date Range:
## 2000-01-03 to 2021-04-30 
## 
## Missing values
##               Date             Symbol             Series         Prev.Close 
##                  0                  0                  0                  0 
##               Open               High                Low               Last 
##                  0                  0                  0                  0 
##              Close               VWAP             Volume           Turnover 
##                  0                  0                  0                  0 
##             Trades Deliverable.Volume       X.Deliverble 
##               2850                519                519 
## 
## Number of Outliers:
## Close -> 84 outliers
## 
##  Top 5 rows :
## 
## :data type
## 'data.frame':    5306 obs. of  15 variables:
##  $ Date              : chr  "2000-01-03" "2000-01-04" "2000-01-05" "2000-01-06" ...
##  $ Symbol            : chr  "ZEETELE" "ZEETELE" "ZEETELE" "ZEETELE" ...
##  $ Series            : chr  "EQ" "EQ" "EQ" "EQ" ...
##  $ Prev.Close        : num  1093 1180 1261 1177 1115 ...
##  $ Open              : num  1175 1220 1161 1195 1097 ...
##  $ High              : num  1180 1274 1318 1200 1097 ...
##  $ Low               : num  1160 1183 1160 1095 1026 ...
##  $ Last              : num  1180 1274 1191 1106 1026 ...
##  $ Close             : num  1180 1261 1177 1115 1026 ...
##  $ VWAP              : num  1177 1228 1238 1135 1030 ...
##  $ Volume            : int  1261391 4616547 8763127 5164020 755129 3942813 6802005 2968833 2251046 2949092 ...
##  $ Turnover          : num  1.48e+14 5.67e+14 1.09e+15 5.86e+14 7.78e+13 ...
##  $ Trades            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Deliverable.Volume: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ X.Deliverble      : num  NA NA NA NA NA NA NA NA NA NA ...
## 
## Summary statistic

##      Date              Symbol             Series            Prev.Close    
##  Length:5306        Length:5306        Length:5306        Min.   :  62.3  
##  Class :character   Class :character   Class :character   1st Qu.: 143.2  
##  Mode  :character   Mode  :character   Mode  :character   Median : 238.2  
##                                                           Mean   : 273.4  
##                                                           3rd Qu.: 345.6  
##                                                           Max.   :1541.7  
##                                                                           
##       Open           High             Low              Last       
##  Min.   :  62   Min.   :  66.3   Min.   :  60.1   Min.   :  62.7  
##  1st Qu.: 144   1st Qu.: 146.9   1st Qu.: 140.0   1st Qu.: 143.5  
##  Median : 238   Median : 244.0   Median : 231.4   Median : 237.7  
##  Mean   : 274   Mean   : 279.6   Mean   : 267.6   Mean   : 273.2  
##  3rd Qu.: 346   3rd Qu.: 352.8   3rd Qu.: 338.4   3rd Qu.: 345.1  
##  Max.   :1640   Max.   :1645.0   Max.   :1512.2   Max.   :1564.0  
##                                                                   
##      Close             VWAP             Volume             Turnover        
##  Min.   :  62.3   Min.   :  63.08   Min.   :     4415   Min.   :7.021e+10  
##  1st Qu.: 143.2   1st Qu.: 143.68   1st Qu.:  1218226   1st Qu.:2.595e+13  
##  Median : 238.1   Median : 238.90   Median :  2138807   Median :5.250e+13  
##  Mean   : 273.2   Mean   : 273.63   Mean   :  4825422   Mean   :1.249e+14  
##  3rd Qu.: 345.6   3rd Qu.: 345.64   3rd Qu.:  4532904   3rd Qu.:1.137e+14  
##  Max.   :1541.7   Max.   :1578.11   Max.   :165959680   Max.   :4.286e+15  
##                                                                            
##      Trades        Deliverable.Volume  X.Deliverble   
##  Min.   :    296   Min.   :    4415   Min.   :0.0557  
##  1st Qu.:  24579   1st Qu.:  513686   1st Qu.:0.3073  
##  Median :  41074   Median :  893532   Median :0.4635  
##  Mean   :  62646   Mean   : 1415718   Mean   :0.4522  
##  3rd Qu.:  71463   3rd Qu.: 1593444   3rd Qu.:0.5939  
##  Max.   :1088460   Max.   :42891428   Max.   :1.0000  
##  NA's   :2850      NA's   :519        NA's   :519

###Analyze trading volume trends and identify unusual volume spikes. Identify at least 3 significant market events visible in the data (e.g., COVID-19,…) and annotate them on chart.

#  Extract TITAN and fix date
titan <- all_datasets$TITAN
titan$Date <- as.Date(titan$Date)

# Calculate z-score to detect spikes
titan$vol_mean <- mean(titan$Volume)
titan$vol_sd   <- sd(titan$Volume)
titan$z_score  <- (titan$Volume - titan$vol_mean) / titan$vol_sd
titan$is_spike <- titan$z_score > 2.5

# Define 3 market events to annotate
events <- data.frame(
  date  = as.Date(c("2020-03-23", "2021-04-01", "2022-02-24")),
  label = c("COVID-19 Crash", "COVID 2nd Wave", "Russia-Ukraine War")
)

# Plot
ggplot(titan, aes(x = Date, y = Volume)) +
  
  # draw volume bars, red if spike, blue if normal
  geom_col(aes(fill = is_spike)) +
  scale_fill_manual(values = c("FALSE" = "blue", "TRUE" = "red"),
                    labels  = c("Normal", "Spike"),
                    name    = "Volume Type") +
  
  # vertical lines for market events
  geom_vline(data = events, aes(xintercept = date),
             color = "green", linetype = "dashed", linewidth = 0.8) +
  
  # text labels for each event
  geom_text(data = events,
            aes(x = date, y = max(titan$Volume) * 0.95, label = label),
            angle = 90,      # rotate text vertically
            vjust = -0.3,    # shift text slightly left of line
            color = "purple",
            size  = 3) +
  
  # titles and labels
  labs(
    title = "TITAN Trading Volume Over Time",
    subtitle = "Red = unusual volume spike | Dashed lines = major market events",
    x = "Date",
    y = "Volume"
  )

###4.3 Visualization ###With ggplot2, use Zee Entertainment Enterprises Ltd to come up with the visualization:– Closing price trend & Corresponding percentage change– Sales volume trend– 15,30 and 45 days moving average trend for the closing price.– Use a histogram to represent the distribution of the percentage change for the Prev Close.

zeel <- all_datasets$ZEEL
names(zeel)

##  [1] "Date"               "Symbol"             "Series"            
##  [4] "Prev.Close"         "Open"               "High"              
##  [7] "Low"                "Last"               "Close"             
## [10] "VWAP"               "Volume"             "Turnover"          
## [13] "Trades"             "Deliverable.Volume" "X.Deliverble"

# extract DATA
zeel <- all_datasets$ZEEL
zeel$Date <- as.Date(zeel$Date)

# calculate percentage change of closing price using Close and Prev.Close
zeel$pac_change <- (zeel$Close - zeel$Prev.Close) / zeel$Prev.Close * 100

# calculate 15, 30, 45 days moving averages
zeel$ma15 <- zoo::rollmean(zeel$Close, 15, fill = NA)
zeel$ma30 <- zoo::rollmean(zeel$Close, 30, fill = NA)
zeel$ma45 <- zoo::rollmean(zeel$Close, 45, fill = NA)

# calculate percentage change of Prev Close
zeel$prev_pac_change <- (zeel$Prev.Close - lag(zeel$Prev.Close)) / lag(zeel$Prev.Close) * 100

#CLOSING PRICE TREND
ggplot(zeel, aes(x = Date, y = Close)) +
  geom_line(color = "skyblue") +
  labs(
    title = "ZEEL Closing Price Trend",
    x     = "Date",
    y     = "Closing Price"
  )

# PERCENTAGE CHANGE OF CLOSING PRICE
ggplot(zeel, aes(x = Date, y = pac_change)) +
  geom_line(color = "blue") +
  labs(
    title = "ZEEL Closing Price Percentage Change",
    x     = "Date",
    y     = "Percentage Change %"
  )

# SALES VOLUME TREND
ggplot(zeel, aes(x = Date, y = Volume)) +
  geom_col(fill = "blue") +
  labs(
    title = "Sales Volume Trend of ZEEL",
    x     = "Date",
    y     = "Volume"
  )

# MOVING AVERAGES
ggplot(zeel, aes(x = Date)) +
  geom_line(aes(y = Close), color = "gray") +
  geom_line(aes(y = ma15),  color = "blue") +
  geom_line(aes(y = ma30),  color = "green") +
  geom_line(aes(y = ma45),  color = "red") +
  labs(
    title   = "ZEEL Moving Averages",
    x       = "Date",
    y       = "Price",
    subtitle= "Gray = Actual Price | Blue = 15 days | Green = 30 days | Red = 45 days"
    
  )

## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 29 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_line()`).

# HISTOGRAM OF PREV CLOSE PERCENTAGE CHANGE
ggplot(zeel, aes(x = prev_pac_change)) +
  geom_histogram(fill = "purple", color = "grey") +
  labs(
    title = "Distribution of ZEEL Prev Close Percentage Change",
    x     = "Percentage Change %",
    y     = "Count"
  )

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

#4.4 Correlation Analysis Using visualization and statistical analysis of numerical values, determine the pattern of relationships between Prev Close, Open, High, Low, volume and close by using By using Zee Entertainment Enterprises Ltd dataset. Identify any highly correlated pairs (|r| > 0.95) and explain what problem this could cause in your regression model.Interpret your findings.

#Select columns needed and compute correlation matrix
choosen <- zeel[, c("Prev.Close", "Open", "High", "Low", "Volume", "Close")]

cor_matrix <- cor(choosen, use = "complete.obs")  # save it 
round(cor_matrix, 3)                               # print it

##            Prev.Close   Open   High    Low Volume  Close
## Prev.Close      1.000  0.999  0.999  0.998 -0.049  0.998
## Open            0.999  1.000  0.999  0.999 -0.048  0.998
## High            0.999  0.999  1.000  0.998 -0.040  0.999
## Low             0.998  0.999  0.998  1.000 -0.057  0.999
## Volume         -0.049 -0.048 -0.040 -0.057  1.000 -0.047
## Close           0.998  0.998  0.999  0.999 -0.047  1.000

# Reshape for ggplot2
cor_reshape <- as.data.frame(as.table(cor_matrix))
names(cor_reshape) <- c("Var1", "Var2", "Correlation")
#Heatmap
ggplot(cor_reshape, aes(x = Var1, y = Var2, fill = Correlation)) +
  geom_tile(color = "gray") +
  geom_text(aes(label = round(Correlation, 2)), size = 3.5) +
  scale_fill_gradient2(
    low      = "red",
    mid      ="skyblue",
    high     = "darkgreen",
    midpoint = 0,
    limits   = c(-1, 1)
  ) +
  labs(title = "ZEEL Correlation Heatmap", x = "", y = "")

# high correlation pairs
cor_reshape %>%
  filter(Var1 != Var2, abs(Correlation) > 0.95) %>%
  arrange(desc(abs(Correlation)))

##          Var1       Var2 Correlation
## 1        Open Prev.Close   0.9994227
## 2  Prev.Close       Open   0.9994227
## 3        High       Open   0.9991358
## 4        Open       High   0.9991358
## 5       Close        Low   0.9991182
## 6         Low      Close   0.9991182
## 7        High Prev.Close   0.9988506
## 8  Prev.Close       High   0.9988506
## 9       Close       High   0.9988055
## 10       High      Close   0.9988055
## 11        Low       Open   0.9986937
## 12       Open        Low   0.9986937
## 13        Low Prev.Close   0.9983843
## 14 Prev.Close        Low   0.9983843
## 15        Low       High   0.9981437
## 16       High        Low   0.9981437
## 17      Close       Open   0.9981131
## 18       Open      Close   0.9981131
## 19      Close Prev.Close   0.9979015
## 20 Prev.Close      Close   0.9979015

problem : multicollinearity

###when predictors are too strongly correlated, the model cannot separate their individual effects, coefficients become unstable, and standard errors can cause insignificant variables.

Interpretation :

###All price variables (Prev.Close, Open, High, Low, Close) are correlated above 0.997 with each other → all qualify as highly correlated pairs Volume is the only independent variable with near-zero correlation (≈ -0.05) with everything else

###4.5 Regression Analysis: By considering close as a dependent variable and Prev Close, Open, High, Low, and volume as independent variables. Split the data into 80% training and 20% test sets. Print the sizes of each split.– Train the regression model and evaluate it’s performance by using 3 performance metrics of your choice (use Zee Entertainment Enterprises Ltd. data).

# select needed columns
reg_data <- zeel[, c("Close", "Prev.Close", "Open", "High", "Low", "Volume")]

# remove missing values
reg_data <- na.omit(reg_data)

# split data into 80% train and 20% test
set.seed(123)

train_rows <- sample(nrow(reg_data), 0.8 * nrow(reg_data))

train_data <- reg_data[train_rows, ]
test_data  <- reg_data[-train_rows, ]

# print sizes
cat("Training set size :", nrow(train_data), "\n")

## Training set size : 4244

cat("Testing set size  :", nrow(test_data), "\n")

## Testing set size  : 1062

###Train the regression model and evaluate it’s performance by using 3 performance metrics of your choice (use Zee Entertainment Enterprises Ltd. data).

# train regression model
model <- lm(Close ~ Prev.Close + Open + High + Low + Volume,
            data = train_data)

# make predictions
predictions <- predict(model, test_data)

# performance metrics
mae  <- mean(abs(test_data$Close - predictions))

rmse <- sqrt(mean((test_data$Close - predictions)^2))

r2   <- cor(test_data$Close, predictions)^2

# print results
cat("MAE  :", mae, "\n")

## MAE  : 2.403604

cat("RMSE :", rmse, "\n")

## RMSE : 4.847351

cat("R2   :", r2, "\n")

## R2   : 0.9992598

Plot the actual vs predicted closing prices on the test set (scatter plot). Add a 45-degree reference line (perfect prediction line). What does the spread around this line tell you?

# create dataframe for plotting
result <- data.frame(
  Actual = test_data$Close,
  Predicted = predictions
)

# scatter plot
ggplot(result, aes(x = Actual, y = Predicted)) +
  geom_point(color = "blue") +
  
  # 45 degree reference line
  geom_abline(slope = 1, intercept = 0, color = "red") +
  
  labs(
    title = "Actual vs Predicted Closing Prices",
    x = "Actual Close Price",
    y = "Predicted Close Price"
  )

###standardised dataset: Standardise the independent variables using Stan dardScaler so that each variable have mean=0 and std=1. Print the mean and standard deviation of each column after scaling to verify.

# standardize independent variables
scaled_x <- scale(reg_data[, c("Prev.Close", "Open", "High", "Low", "Volume")])

# means after scaling
colMeans(scaled_x)

##    Prev.Close          Open          High           Low        Volume 
##  1.260117e-16 -1.713930e-17  3.974498e-17 -4.879196e-17  1.369993e-17

# standard deviations after scaling
apply(scaled_x, 2, sd)

## Prev.Close       Open       High        Low     Volume 
##          1          1          1          1          1

Apply PCA to reduce X to 2 components, then train a Linear Regression model on the PCA-transformed training data. Print the R2 on the test set.

# apply PCA on standardized variables
pca <- prcomp(scaled_x)

# keep only 2 principal components
pca_data <- as.data.frame(pca$x[, 1:2])

# add dependent variable
pca_data$Close <- reg_data$Close

# split data into train and test
set.seed(123)

train_rows <- sample(nrow(pca_data), 0.8 * nrow(pca_data))

train_pca <- pca_data[train_rows, ]
test_pca  <- pca_data[-train_rows, ]

# train regression model
pca_model <- lm(Close ~ PC1 + PC2, data = train_pca)

# predictions
pred_pca <- predict(pca_model, test_pca)

# calculate R2
r2_pca <- cor(test_pca$Close, pred_pca)^2

# print R2
cat("R2 :", r2_pca)

## R2 : 0.9976517

Compare the R2 of the PCA model vs the raw model(Model developed on step 2). Which performs better? Why might this be the case?

###Interpretation :

###The raw regression model performs better if its R2 is higher than the PCA model.

###This happens because the raw model uses all original variables (Prev.Close, Open, High, Low, Volume), so it keeps all information from the dataset.

###The PCA model reduces the variables into only 2 principal components. During this reduction, some information may be lost, which can lower prediction accuracy.

###However, PCA helps reduce multicollinearity because the principal components are independent from each other.

###Raw model is usually higher accuracy ###PCA model is simpler model with less multicollinearity and reduced dimensions

###4.6 Sales volume comparisons: ###After computing the annual sales volume (total) of each of Zee Entertainment Enterprises Ltd and Vedanta Ltd., use a comparative bar chart (clustered bar chart) to represent that information and answer the following questions: – Which company sold the highest volume during 2018?

# ZEEL data
zeel <- all_datasets$ZEEL
zeel$Date <- as.Date(zeel$Date)
zeel$Year <- format(zeel$Date, "%Y")

# Vedanta data
vedl <- all_datasets$VEDL
vedl$Date <- as.Date(vedl$Date)
vedl$Year <- format(vedl$Date, "%Y")

###Filter 2018 and compute total Volume

# Filter 2018
zeel_2018 <- zeel %>% filter(Year == "2018")
vedl_2018 <- vedl %>% filter(Year == "2018")

# Compute total volume
volume_compare <- data.frame(
  Company = c("ZEEL", "Vedanta"),
  Volume = c(sum(zeel_2018$Volume, na.rm = TRUE),
             sum(vedl_2018$Volume, na.rm = TRUE))
)

volume_compare

##   Company     Volume
## 1    ZEEL  598775408
## 2 Vedanta 3276296751

###Bar Chart

ggplot(volume_compare, aes(x = Company, y = Volume, fill = Company)) +
  geom_col(position = "dodge") +
  
  scale_fill_manual(values = c(
    "ZEEL" = "darkred",
    "Vedanta" = "darkgreen"
  )) +
  
  labs(
    title = "2018 Annual Sales Volume Comparison",
    x = "Company",
    y = "Total Trading Volume"
  )

###Vedanta sold the highest volume during 2018

###Which company sold the least volume during 2021?

###filter 2021

### Filter 2021 and compute total Volume

# 1. First, create the dataframes for 2021
zeel_2021 <- zeel[format(zeel$Date, "%Y") == "2021", ]
vedl_2021 <- vedl[format(vedl$Date, "%Y") == "2021", ]

# 2. FIX: Create the missing 'volume_2021' summary dataset
volume_2021 <- data.frame(
  Company = c("ZEEL", "Vedanta"),
  Volume = c(sum(zeel_2021$Volume, na.rm = TRUE),
             sum(vedl_2021$Volume, na.rm = TRUE))
)

# Print it to make sure it works
volume_2021

##   Company     Volume
## 1    ZEEL 1302490317
## 2 Vedanta 1430971315

####Compute total volume

 #### Compute total volume chart

# This will now work perfectly because 'volume_2021' exists!
ggplot(volume_2021, aes(x = Company, y = Volume, fill = Company)) +
  geom_col() +
  
  scale_fill_manual(values = c(
    "ZEEL" = "darkred",
    "Vedanta" = "gold"
  )) +
  
  labs(
    title = "2021 Annual Sales Volume Comparison",
    x = "Company",
    y = "Total Trading Volume"
  )

###ZEEL sold the least volume during 2021

What is the ratio of Zee Entertainment Enterprises Ltd sales to Vedanta Ltd sales during 2021?

ratio <- volume_2021$Volume[volume_2021$Company == "ZEEL"] /
         volume_2021$Volume[volume_2021$Company == "Vedanta"]

ratio

## [1] 0.9102141