library(scales)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
all_files <- list.files(path = "C:/Users/Soluc/OneDrive/AUCA/R/Mid_Sem_Exam/alldatasets", pattern = "*.csv", full.names = TRUE)
all_datasets <- lapply(all_files, read.csv)
names(all_datasets) <- tools::file_path_sans_ext(basename(all_files))
names(all_datasets)
## [1] "ADANIPORTS" "ASIANPAINT" "AXISBANK" "BAJAJ-AUTO" "BAJAJFINSV"
## [6] "BAJFINANCE" "BHARTIARTL" "BPCL" "BRITANNIA" "CIPLA"
## [11] "COALINDIA" "DRREDDY" "EICHERMOT" "GAIL" "GRASIM"
## [16] "HCLTECH" "HDFC" "HDFCBANK" "HEROMOTOCO" "HINDALCO"
## [21] "HINDUNILVR" "ICICIBANK" "INDUSINDBK" "INFRATEL" "INFY"
## [26] "IOC" "ITC" "JSWSTEEL" "KOTAKBANK" "LT"
## [31] "MARUTI" "MM" "NESTLEIND" "NTPC" "ONGC"
## [36] "POWERGRID" "RELIANCE" "SBIN" "SHREECEM" "SUNPHARMA"
## [41] "TATAMOTORS" "TATASTEEL" "TCS" "TECHM" "TITAN"
## [46] "ULTRACEMCO" "UPL" "VEDL" "WIPRO" "ZEEL"
profile<- function(dset){
cat("Trading Date Range:\n")
cat(min(dset$Date), "to", max(dset$Date), "\n\n")
cat("Missing values\n")
print(colSums(is.na(dset)))
cat("\nNumber of Outliers:\n")
outliers <- length(boxplot.stats(dset$Close)$out)
cat("Close ->", outliers, "outliers\n")
cat("\n Top 5 rows :\n")
head(dset,5)
cat("\n:data type\n")
str(dset)
#sapply(dset, class)
cat("\nSummary statistic\n")
dset%>%
summary()
}
profile(all_datasets$ZEEL)
## Trading Date Range:
## 2000-01-03 to 2021-04-30
##
## Missing values
## Date Symbol Series Prev.Close
## 0 0 0 0
## Open High Low Last
## 0 0 0 0
## Close VWAP Volume Turnover
## 0 0 0 0
## Trades Deliverable.Volume X.Deliverble
## 2850 519 519
##
## Number of Outliers:
## Close -> 84 outliers
##
## Top 5 rows :
##
## :data type
## 'data.frame': 5306 obs. of 15 variables:
## $ Date : chr "2000-01-03" "2000-01-04" "2000-01-05" "2000-01-06" ...
## $ Symbol : chr "ZEETELE" "ZEETELE" "ZEETELE" "ZEETELE" ...
## $ Series : chr "EQ" "EQ" "EQ" "EQ" ...
## $ Prev.Close : num 1093 1180 1261 1177 1115 ...
## $ Open : num 1175 1220 1161 1195 1097 ...
## $ High : num 1180 1274 1318 1200 1097 ...
## $ Low : num 1160 1183 1160 1095 1026 ...
## $ Last : num 1180 1274 1191 1106 1026 ...
## $ Close : num 1180 1261 1177 1115 1026 ...
## $ VWAP : num 1177 1228 1238 1135 1030 ...
## $ Volume : int 1261391 4616547 8763127 5164020 755129 3942813 6802005 2968833 2251046 2949092 ...
## $ Turnover : num 1.48e+14 5.67e+14 1.09e+15 5.86e+14 7.78e+13 ...
## $ Trades : num NA NA NA NA NA NA NA NA NA NA ...
## $ Deliverable.Volume: num NA NA NA NA NA NA NA NA NA NA ...
## $ X.Deliverble : num NA NA NA NA NA NA NA NA NA NA ...
##
## Summary statistic
## Date Symbol Series Prev.Close
## Length:5306 Length:5306 Length:5306 Min. : 62.3
## Class :character Class :character Class :character 1st Qu.: 143.2
## Mode :character Mode :character Mode :character Median : 238.2
## Mean : 273.4
## 3rd Qu.: 345.6
## Max. :1541.7
##
## Open High Low Last
## Min. : 62 Min. : 66.3 Min. : 60.1 Min. : 62.7
## 1st Qu.: 144 1st Qu.: 146.9 1st Qu.: 140.0 1st Qu.: 143.5
## Median : 238 Median : 244.0 Median : 231.4 Median : 237.7
## Mean : 274 Mean : 279.6 Mean : 267.6 Mean : 273.2
## 3rd Qu.: 346 3rd Qu.: 352.8 3rd Qu.: 338.4 3rd Qu.: 345.1
## Max. :1640 Max. :1645.0 Max. :1512.2 Max. :1564.0
##
## Close VWAP Volume Turnover
## Min. : 62.3 Min. : 63.08 Min. : 4415 Min. :7.021e+10
## 1st Qu.: 143.2 1st Qu.: 143.68 1st Qu.: 1218226 1st Qu.:2.595e+13
## Median : 238.1 Median : 238.90 Median : 2138807 Median :5.250e+13
## Mean : 273.2 Mean : 273.63 Mean : 4825422 Mean :1.249e+14
## 3rd Qu.: 345.6 3rd Qu.: 345.64 3rd Qu.: 4532904 3rd Qu.:1.137e+14
## Max. :1541.7 Max. :1578.11 Max. :165959680 Max. :4.286e+15
##
## Trades Deliverable.Volume X.Deliverble
## Min. : 296 Min. : 4415 Min. :0.0557
## 1st Qu.: 24579 1st Qu.: 513686 1st Qu.:0.3073
## Median : 41074 Median : 893532 Median :0.4635
## Mean : 62646 Mean : 1415718 Mean :0.4522
## 3rd Qu.: 71463 3rd Qu.: 1593444 3rd Qu.:0.5939
## Max. :1088460 Max. :42891428 Max. :1.0000
## NA's :2850 NA's :519 NA's :519
###Analyze trading volume trends and identify unusual volume spikes. Identify at least 3 significant market events visible in the data (e.g., COVID-19,…) and annotate them on chart.
# Extract TITAN and fix date
titan <- all_datasets$TITAN
titan$Date <- as.Date(titan$Date)
# Calculate z-score to detect spikes
titan$vol_mean <- mean(titan$Volume)
titan$vol_sd <- sd(titan$Volume)
titan$z_score <- (titan$Volume - titan$vol_mean) / titan$vol_sd
titan$is_spike <- titan$z_score > 2.5
# Define 3 market events to annotate
events <- data.frame(
date = as.Date(c("2020-03-23", "2021-04-01", "2022-02-24")),
label = c("COVID-19 Crash", "COVID 2nd Wave", "Russia-Ukraine War")
)
# Plot
ggplot(titan, aes(x = Date, y = Volume)) +
# draw volume bars, red if spike, blue if normal
geom_col(aes(fill = is_spike)) +
scale_fill_manual(values = c("FALSE" = "blue", "TRUE" = "red"),
labels = c("Normal", "Spike"),
name = "Volume Type") +
# vertical lines for market events
geom_vline(data = events, aes(xintercept = date),
color = "green", linetype = "dashed", linewidth = 0.8) +
# text labels for each event
geom_text(data = events,
aes(x = date, y = max(titan$Volume) * 0.95, label = label),
angle = 90, # rotate text vertically
vjust = -0.3, # shift text slightly left of line
color = "purple",
size = 3) +
# titles and labels
labs(
title = "TITAN Trading Volume Over Time",
subtitle = "Red = unusual volume spike | Dashed lines = major market events",
x = "Date",
y = "Volume"
)
###4.3 Visualization ###With ggplot2, use Zee Entertainment Enterprises
Ltd to come up with the visualization:– Closing price trend &
Corresponding percentage change– Sales volume trend– 15,30 and 45 days
moving average trend for the closing price.– Use a histogram to
represent the distribution of the percentage change for the Prev
Close.
zeel <- all_datasets$ZEEL
names(zeel)
## [1] "Date" "Symbol" "Series"
## [4] "Prev.Close" "Open" "High"
## [7] "Low" "Last" "Close"
## [10] "VWAP" "Volume" "Turnover"
## [13] "Trades" "Deliverable.Volume" "X.Deliverble"
# extract DATA
zeel <- all_datasets$ZEEL
zeel$Date <- as.Date(zeel$Date)
# calculate percentage change of closing price using Close and Prev.Close
zeel$pac_change <- (zeel$Close - zeel$Prev.Close) / zeel$Prev.Close * 100
# calculate 15, 30, 45 days moving averages
zeel$ma15 <- zoo::rollmean(zeel$Close, 15, fill = NA)
zeel$ma30 <- zoo::rollmean(zeel$Close, 30, fill = NA)
zeel$ma45 <- zoo::rollmean(zeel$Close, 45, fill = NA)
# calculate percentage change of Prev Close
zeel$prev_pac_change <- (zeel$Prev.Close - lag(zeel$Prev.Close)) / lag(zeel$Prev.Close) * 100
#CLOSING PRICE TREND
ggplot(zeel, aes(x = Date, y = Close)) +
geom_line(color = "skyblue") +
labs(
title = "ZEEL Closing Price Trend",
x = "Date",
y = "Closing Price"
)
# PERCENTAGE CHANGE OF CLOSING PRICE
ggplot(zeel, aes(x = Date, y = pac_change)) +
geom_line(color = "blue") +
labs(
title = "ZEEL Closing Price Percentage Change",
x = "Date",
y = "Percentage Change %"
)
# SALES VOLUME TREND
ggplot(zeel, aes(x = Date, y = Volume)) +
geom_col(fill = "blue") +
labs(
title = "Sales Volume Trend of ZEEL",
x = "Date",
y = "Volume"
)
# MOVING AVERAGES
ggplot(zeel, aes(x = Date)) +
geom_line(aes(y = Close), color = "gray") +
geom_line(aes(y = ma15), color = "blue") +
geom_line(aes(y = ma30), color = "green") +
geom_line(aes(y = ma45), color = "red") +
labs(
title = "ZEEL Moving Averages",
x = "Date",
y = "Price",
subtitle= "Gray = Actual Price | Blue = 15 days | Green = 30 days | Red = 45 days"
)
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 29 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_line()`).
# HISTOGRAM OF PREV CLOSE PERCENTAGE CHANGE
ggplot(zeel, aes(x = prev_pac_change)) +
geom_histogram(fill = "purple", color = "grey") +
labs(
title = "Distribution of ZEEL Prev Close Percentage Change",
x = "Percentage Change %",
y = "Count"
)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
#4.4 Correlation Analysis Using visualization and statistical analysis
of numerical values, determine the pattern of relationships between Prev
Close, Open, High, Low, volume and close by using By using Zee
Entertainment Enterprises Ltd dataset. Identify any highly correlated
pairs (|r| > 0.95) and explain what problem this could cause in your
regression model.Interpret your findings.
#Select columns needed and compute correlation matrix
choosen <- zeel[, c("Prev.Close", "Open", "High", "Low", "Volume", "Close")]
cor_matrix <- cor(choosen, use = "complete.obs") # save it
round(cor_matrix, 3) # print it
## Prev.Close Open High Low Volume Close
## Prev.Close 1.000 0.999 0.999 0.998 -0.049 0.998
## Open 0.999 1.000 0.999 0.999 -0.048 0.998
## High 0.999 0.999 1.000 0.998 -0.040 0.999
## Low 0.998 0.999 0.998 1.000 -0.057 0.999
## Volume -0.049 -0.048 -0.040 -0.057 1.000 -0.047
## Close 0.998 0.998 0.999 0.999 -0.047 1.000
# Reshape for ggplot2
cor_reshape <- as.data.frame(as.table(cor_matrix))
names(cor_reshape) <- c("Var1", "Var2", "Correlation")
#Heatmap
ggplot(cor_reshape, aes(x = Var1, y = Var2, fill = Correlation)) +
geom_tile(color = "gray") +
geom_text(aes(label = round(Correlation, 2)), size = 3.5) +
scale_fill_gradient2(
low = "red",
mid ="skyblue",
high = "darkgreen",
midpoint = 0,
limits = c(-1, 1)
) +
labs(title = "ZEEL Correlation Heatmap", x = "", y = "")
# high correlation pairs
cor_reshape %>%
filter(Var1 != Var2, abs(Correlation) > 0.95) %>%
arrange(desc(abs(Correlation)))
## Var1 Var2 Correlation
## 1 Open Prev.Close 0.9994227
## 2 Prev.Close Open 0.9994227
## 3 High Open 0.9991358
## 4 Open High 0.9991358
## 5 Close Low 0.9991182
## 6 Low Close 0.9991182
## 7 High Prev.Close 0.9988506
## 8 Prev.Close High 0.9988506
## 9 Close High 0.9988055
## 10 High Close 0.9988055
## 11 Low Open 0.9986937
## 12 Open Low 0.9986937
## 13 Low Prev.Close 0.9983843
## 14 Prev.Close Low 0.9983843
## 15 Low High 0.9981437
## 16 High Low 0.9981437
## 17 Close Open 0.9981131
## 18 Open Close 0.9981131
## 19 Close Prev.Close 0.9979015
## 20 Prev.Close Close 0.9979015
###when predictors are too strongly correlated, the model cannot separate their individual effects, coefficients become unstable, and standard errors can cause insignificant variables.
###All price variables (Prev.Close, Open, High, Low, Close) are correlated above 0.997 with each other → all qualify as highly correlated pairs Volume is the only independent variable with near-zero correlation (≈ -0.05) with everything else
###4.5 Regression Analysis: By considering close as a dependent variable and Prev Close, Open, High, Low, and volume as independent variables. Split the data into 80% training and 20% test sets. Print the sizes of each split.– Train the regression model and evaluate it’s performance by using 3 performance metrics of your choice (use Zee Entertainment Enterprises Ltd. data).
# select needed columns
reg_data <- zeel[, c("Close", "Prev.Close", "Open", "High", "Low", "Volume")]
# remove missing values
reg_data <- na.omit(reg_data)
# split data into 80% train and 20% test
set.seed(123)
train_rows <- sample(nrow(reg_data), 0.8 * nrow(reg_data))
train_data <- reg_data[train_rows, ]
test_data <- reg_data[-train_rows, ]
# print sizes
cat("Training set size :", nrow(train_data), "\n")
## Training set size : 4244
cat("Testing set size :", nrow(test_data), "\n")
## Testing set size : 1062
###Train the regression model and evaluate it’s performance by using 3 performance metrics of your choice (use Zee Entertainment Enterprises Ltd. data).
# train regression model
model <- lm(Close ~ Prev.Close + Open + High + Low + Volume,
data = train_data)
# make predictions
predictions <- predict(model, test_data)
# performance metrics
mae <- mean(abs(test_data$Close - predictions))
rmse <- sqrt(mean((test_data$Close - predictions)^2))
r2 <- cor(test_data$Close, predictions)^2
# print results
cat("MAE :", mae, "\n")
## MAE : 2.403604
cat("RMSE :", rmse, "\n")
## RMSE : 4.847351
cat("R2 :", r2, "\n")
## R2 : 0.9992598
# create dataframe for plotting
result <- data.frame(
Actual = test_data$Close,
Predicted = predictions
)
# scatter plot
ggplot(result, aes(x = Actual, y = Predicted)) +
geom_point(color = "blue") +
# 45 degree reference line
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(
title = "Actual vs Predicted Closing Prices",
x = "Actual Close Price",
y = "Predicted Close Price"
)
###standardised dataset: Standardise the independent variables using Stan dardScaler so that each variable have mean=0 and std=1. Print the mean and standard deviation of each column after scaling to verify.
# standardize independent variables
scaled_x <- scale(reg_data[, c("Prev.Close", "Open", "High", "Low", "Volume")])
# means after scaling
colMeans(scaled_x)
## Prev.Close Open High Low Volume
## 1.260117e-16 -1.713930e-17 3.974498e-17 -4.879196e-17 1.369993e-17
# standard deviations after scaling
apply(scaled_x, 2, sd)
## Prev.Close Open High Low Volume
## 1 1 1 1 1
# apply PCA on standardized variables
pca <- prcomp(scaled_x)
# keep only 2 principal components
pca_data <- as.data.frame(pca$x[, 1:2])
# add dependent variable
pca_data$Close <- reg_data$Close
# split data into train and test
set.seed(123)
train_rows <- sample(nrow(pca_data), 0.8 * nrow(pca_data))
train_pca <- pca_data[train_rows, ]
test_pca <- pca_data[-train_rows, ]
# train regression model
pca_model <- lm(Close ~ PC1 + PC2, data = train_pca)
# predictions
pred_pca <- predict(pca_model, test_pca)
# calculate R2
r2_pca <- cor(test_pca$Close, pred_pca)^2
# print R2
cat("R2 :", r2_pca)
## R2 : 0.9976517
###Interpretation :
###The raw regression model performs better if its R2 is higher than the PCA model.
###This happens because the raw model uses all original variables (Prev.Close, Open, High, Low, Volume), so it keeps all information from the dataset.
###The PCA model reduces the variables into only 2 principal components. During this reduction, some information may be lost, which can lower prediction accuracy.
###However, PCA helps reduce multicollinearity because the principal components are independent from each other.
###Raw model is usually higher accuracy ###PCA model is simpler model with less multicollinearity and reduced dimensions
###4.6 Sales volume comparisons: ###After computing the annual sales volume (total) of each of Zee Entertainment Enterprises Ltd and Vedanta Ltd., use a comparative bar chart (clustered bar chart) to represent that information and answer the following questions: – Which company sold the highest volume during 2018?
# ZEEL data
zeel <- all_datasets$ZEEL
zeel$Date <- as.Date(zeel$Date)
zeel$Year <- format(zeel$Date, "%Y")
# Vedanta data
vedl <- all_datasets$VEDL
vedl$Date <- as.Date(vedl$Date)
vedl$Year <- format(vedl$Date, "%Y")
###Filter 2018 and compute total Volume
# Filter 2018
zeel_2018 <- zeel %>% filter(Year == "2018")
vedl_2018 <- vedl %>% filter(Year == "2018")
# Compute total volume
volume_compare <- data.frame(
Company = c("ZEEL", "Vedanta"),
Volume = c(sum(zeel_2018$Volume, na.rm = TRUE),
sum(vedl_2018$Volume, na.rm = TRUE))
)
volume_compare
## Company Volume
## 1 ZEEL 598775408
## 2 Vedanta 3276296751
###Bar Chart
ggplot(volume_compare, aes(x = Company, y = Volume, fill = Company)) +
geom_col(position = "dodge") +
scale_fill_manual(values = c(
"ZEEL" = "darkred",
"Vedanta" = "darkgreen"
)) +
labs(
title = "2018 Annual Sales Volume Comparison",
x = "Company",
y = "Total Trading Volume"
)
###Vedanta sold the highest volume during 2018
###Which company sold the least volume during 2021?
###filter 2021
### Filter 2021 and compute total Volume
# 1. First, create the dataframes for 2021
zeel_2021 <- zeel[format(zeel$Date, "%Y") == "2021", ]
vedl_2021 <- vedl[format(vedl$Date, "%Y") == "2021", ]
# 2. FIX: Create the missing 'volume_2021' summary dataset
volume_2021 <- data.frame(
Company = c("ZEEL", "Vedanta"),
Volume = c(sum(zeel_2021$Volume, na.rm = TRUE),
sum(vedl_2021$Volume, na.rm = TRUE))
)
# Print it to make sure it works
volume_2021
## Company Volume
## 1 ZEEL 1302490317
## 2 Vedanta 1430971315
####Compute total volume
#### Compute total volume chart
# This will now work perfectly because 'volume_2021' exists!
ggplot(volume_2021, aes(x = Company, y = Volume, fill = Company)) +
geom_col() +
scale_fill_manual(values = c(
"ZEEL" = "darkred",
"Vedanta" = "gold"
)) +
labs(
title = "2021 Annual Sales Volume Comparison",
x = "Company",
y = "Total Trading Volume"
)
###ZEEL sold the least volume during 2021
ratio <- volume_2021$Volume[volume_2021$Company == "ZEEL"] /
volume_2021$Volume[volume_2021$Company == "Vedanta"]
ratio
## [1] 0.9102141