getwd()
## [1] "/Users/macbook/Desktop/AUCA"
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
###click here to download NIFTY-50 Stock Market Data (2000 - 2021). Load all the datasets to your R Stidio.
filestogether<-list.files(path = "/Users/macbook/Desktop/AUCA/archive",pattern = "*.csv",full.names = TRUE)
filesread <-lapply(filestogether, read.csv)
names(filesread) <- tools::file_path_sans_ext(basename(filestogether))
names(filesread)
## [1] "ADANIPORTS" "ASIANPAINT" "AXISBANK" "BAJAJ-AUTO" "BAJAJFINSV"
## [6] "BAJFINANCE" "BHARTIARTL" "BPCL" "BRITANNIA" "CIPLA"
## [11] "COALINDIA" "DRREDDY" "EICHERMOT" "GAIL" "GRASIM"
## [16] "HCLTECH" "HDFC" "HDFCBANK" "HEROMOTOCO" "HINDALCO"
## [21] "HINDUNILVR" "ICICIBANK" "INDUSINDBK" "INFRATEL" "INFY"
## [26] "IOC" "ITC" "JSWSTEEL" "KOTAKBANK" "LT"
## [31] "MARUTI" "MM" "NESTLEIND" "NTPC" "ONGC"
## [36] "POWERGRID" "RELIANCE" "SBIN" "SHREECEM" "SUNPHARMA"
## [41] "TATAMOTORS" "TATASTEEL" "TCS" "TECHM" "TITAN"
## [46] "ULTRACEMCO" "UPL" "VEDL" "WIPRO" "ZEEL"
view(filesread$TCS)
functionmid <- function(dset){
cat("Trading Date Range:\n")
cat(min(dset$Date), "to", max(dset$Date), "\n\n")
cat("Missing values:\n")
print(colSums(is.na(dset)))
cat("\nNumber of Outliers:\n")
Q1 <- quantile(dset$Close, 0.25)
Q3 <- quantile(dset$Close, 0.75)
outliers <- sum(dset$Close < Q1 - 1.5*(Q3-Q1) | dset$Close > Q3 + 1.5*(Q3-Q1))
cat("Close ->", outliers, "outliers\n")
cat("top 5 rows:\n")
dset%>%
head(5)
cat("data types:\n")
dset%>%
str()
cat("summary statistics:\n")
dset%>%
summary()
}
functionmid(filesread$ZEEL)
## Trading Date Range:
## 2000-01-03 to 2021-04-30
##
## Missing values:
## Date Symbol Series Prev.Close
## 0 0 0 0
## Open High Low Last
## 0 0 0 0
## Close VWAP Volume Turnover
## 0 0 0 0
## Trades Deliverable.Volume X.Deliverble
## 2850 519 519
##
## Number of Outliers:
## Close -> 84 outliers
## top 5 rows:
## data types:
## 'data.frame': 5306 obs. of 15 variables:
## $ Date : chr "2000-01-03" "2000-01-04" "2000-01-05" "2000-01-06" ...
## $ Symbol : chr "ZEETELE" "ZEETELE" "ZEETELE" "ZEETELE" ...
## $ Series : chr "EQ" "EQ" "EQ" "EQ" ...
## $ Prev.Close : num 1093 1180 1261 1177 1115 ...
## $ Open : num 1175 1220 1161 1195 1097 ...
## $ High : num 1180 1274 1318 1200 1097 ...
## $ Low : num 1160 1183 1160 1095 1026 ...
## $ Last : num 1180 1274 1191 1106 1026 ...
## $ Close : num 1180 1261 1177 1115 1026 ...
## $ VWAP : num 1177 1228 1238 1135 1030 ...
## $ Volume : int 1261391 4616547 8763127 5164020 755129 3942813 6802005 2968833 2251046 2949092 ...
## $ Turnover : num 1.48e+14 5.67e+14 1.09e+15 5.86e+14 7.78e+13 ...
## $ Trades : num NA NA NA NA NA NA NA NA NA NA ...
## $ Deliverable.Volume: num NA NA NA NA NA NA NA NA NA NA ...
## $ X.Deliverble : num NA NA NA NA NA NA NA NA NA NA ...
## summary statistics:
## Date Symbol Series Prev.Close
## Length :5306 Length :5306 Length :5306 Min. : 62.3
## N.unique :5306 N.unique : 2 N.unique : 1 1st Qu.: 143.2
## N.blank : 0 N.blank : 0 N.blank : 0 Median : 238.2
## Min.nchar: 10 Min.nchar: 4 Min.nchar: 2 Mean : 273.4
## Max.nchar: 10 Max.nchar: 7 Max.nchar: 2 3rd Qu.: 345.6
## Max. :1541.7
##
## Open High Low Last
## Min. : 62 Min. : 66.3 Min. : 60.1 Min. : 62.7
## 1st Qu.: 144 1st Qu.: 146.9 1st Qu.: 140.0 1st Qu.: 143.5
## Median : 238 Median : 244.0 Median : 231.4 Median : 237.7
## Mean : 274 Mean : 279.6 Mean : 267.6 Mean : 273.2
## 3rd Qu.: 346 3rd Qu.: 352.8 3rd Qu.: 338.4 3rd Qu.: 345.1
## Max. :1640 Max. :1645.0 Max. :1512.2 Max. :1564.0
##
## Close VWAP Volume Turnover
## Min. : 62.3 Min. : 63.08 Min. : 4415 Min. :7.021e+10
## 1st Qu.: 143.2 1st Qu.: 143.68 1st Qu.: 1218226 1st Qu.:2.595e+13
## Median : 238.1 Median : 238.90 Median : 2138807 Median :5.250e+13
## Mean : 273.2 Mean : 273.63 Mean : 4825422 Mean :1.249e+14
## 3rd Qu.: 345.6 3rd Qu.: 345.64 3rd Qu.: 4532904 3rd Qu.:1.137e+14
## Max. :1541.7 Max. :1578.11 Max. :165959680 Max. :4.286e+15
##
## Trades Deliverable.Volume X.Deliverble
## Min. : 296 Min. : 4415 Min. :0.0557
## 1st Qu.: 24579 1st Qu.: 513686 1st Qu.:0.3073
## Median : 41074 Median : 893532 Median :0.4635
## Mean : 62646 Mean : 1415718 Mean :0.4522
## 3rd Qu.: 71462 3rd Qu.: 1593444 3rd Qu.:0.5939
## Max. :1088460 Max. :42891428 Max. :1.0000
## NAs :2850 NAs :519 NAs :519
###• Analyze trading volume trends and identify unusual volume spikes. Identify at least 3 significant market events visible in the data (e.g., COVID-19,…) and annotate them on chart.
### First step ,let's pick one random (TCS)stock.
TCS<- filesread$TCS
TCS$Date <- as.Date(TCS$Date)
### Second step,let's find the spike beginning
beginning <- mean(TCS$Volume) + 2 * sd(TCS$Volume)
# Third step,Let's plot this using ggplot2
ggplot(TCS, aes(x = Date, y = Volume)) +
geom_line(color = "steelblue") +
#Fourth step, let's put a " Beginning line "
geom_hline(yintercept = beginning, color = "orange", linetype = "dotted") +
#Firth step, An event lines.
geom_vline(xintercept = as.Date("2020-03-23"), color = "red", linetype = "dotdash") +
geom_vline(xintercept = as.Date("2021-04-01"), color = "blue", linetype ="dotdash") +
geom_vline(xintercept = as.Date("2022-02-24"), color = "black", linetype = "dotdash") +
# Sixth step, event labels
annotate("text", x = as.Date("2020-03-23"), y = max(TCS$Volume), label = "COVID Crash", color = "red", size = 3) +
annotate("text", x = as.Date("2021-04-01"), y = max(TCS$Volume)*0.9, label = "2nd Wave", color = "blue", size = 3) +
annotate("text", x = as.Date("2022-02-24"), y = max(TCS$Volume)*0.8, label = "Russia-Ukraine", color = "black", size = 3) +
labs(title = "TCS Trading Volume Over Time", x = "Date", y = "Volume")
###the visualization: ###– Closing price trend & Corresponding percentage change ###– Sales volume trend ###– 15,30 and 45 days moving average trend for the closing price. ###– Use a histogram to represent the distribution of the percentage change for the Prev Close.
### Closing price trend & Corresponding percentage change
ZEEL <- filesread$ZEEL
ZEEL$Date <- as.Date(ZEEL$Date)
ZEEL$ma15 <-rollmean(ZEEL$Close, k = 15, fill = NA, align = "right")
ZEEL$ma30 <- rollmean(ZEEL$Close, k = 30, fill = NA, align = "right")
ZEEL$ma45 <- rollmean(ZEEL$Close, k = 45, fill = NA, align = "right")
ggplot(ZEEL, aes(x = Date)) +
geom_line(aes(y = Close, color = "Actual Close"), na.rm = TRUE) +
geom_line(aes(y = ma15, color = "15-Day MA"), na.rm = TRUE) +
geom_line(aes(y = ma30, color = "30-Day MA"), na.rm = TRUE) +
geom_line(aes(y = ma45, color = "45-Day MA"), na.rm = TRUE) +
scale_color_manual(values = c("Actual Close" = "black",
"15-Day MA" = "darkgreen",
"30-Day MA" = "blue",
"45-Day MA" = "red")) +
labs(title = "ZEEL Moving Averages 15, 30 and 45 days",
x = "Date",
y = "Price",
color = "Legend")
### Sales volume trend.
ggplot(ZEEL, aes(x = Date, y = Volume)) +
geom_line(color = "blue", na.rm = TRUE) +
labs(title = "ZEEL Sales Volume Trend", x = "Date", y = "Volume")
### 15,30 and 45 days moving average trend for the closing price
ggplot(ZEEL, aes(x = Date)) +
geom_line(aes(y = Close), color = "white", na.rm = TRUE) +
geom_line(aes(y = ma15), color = "darkgreen", na.rm = TRUE) +
geom_line(aes(y = ma30), color = "blue", na.rm = TRUE) +
geom_line(aes(y = ma45), color = "red", na.rm = TRUE) +
labs(title = "ZEEL Moving Averages 15, 30 and 45 days", x = "Date", y = "Price")
###Use a histogram to represent the distribution of the percentage change for the Prev Close
ZEEL$prevclose.pct <- (ZEEL$Close - ZEEL$Prev.Close) / ZEEL$Prev.Close * 100
ggplot(ZEEL, aes(x = prevclose.pct)) +
geom_histogram(bins = 30, fill = "brown", color = "yellow", na.rm = TRUE) +
labs(title = "Prev Close % Change Distribution", x = "% Change", y = "Count")
###Using visualization and statistical analysis of numerical values, determine the pattern of relationships between Prev Close, Open, High, Low, volume and close by using By using Zee Entertainment Enterprises Ltd dataset. Identify any highly correlated pairs (|r| > 0.95) and explain what problem this could cause in your regression model.Interpret your findings.
ZEEL <- filesread$ZEEL
num_cols <- ZEEL[, c("Prev.Close", "Open", "High", "Low", "Volume", "Close")]
cor_df <- as.data.frame(as.table(cor(num_cols, use = "complete.obs")))
ggplot(cor_df, aes(x = Var1, y = Var2, fill = Freq)) +
geom_tile() +
geom_text(aes(label = round(Freq, 2)), size = 3) +
scale_fill_gradient2(low = "white", mid = "skyblue", high = "lightpink", midpoint = 0.5) +
labs(title = "ZEEL Correlation Matrix", x = "", y = "")
## Find pairs above 0.95
pairmatrix <- cor(ZEEL[, c("Prev.Close", "Open", "High", "Low", "Volume", "Close")], use = "complete.obs")
# Printing pairs above 0.95
pairdf <- as.data.frame(as.table(pairmatrix))
highpair <- subset(cor_df, abs(Freq) > 0.95 & Var1 != Var2)
print(highpair)
## Var1 Var2 Freq
## 2 Open Prev.Close 0.9994227
## 3 High Prev.Close 0.9988506
## 4 Low Prev.Close 0.9983843
## 6 Close Prev.Close 0.9979015
## 7 Prev.Close Open 0.9994227
## 9 High Open 0.9991358
## 10 Low Open 0.9986937
## 12 Close Open 0.9981131
## 13 Prev.Close High 0.9988506
## 14 Open High 0.9991358
## 16 Low High 0.9981437
## 18 Close High 0.9988055
## 19 Prev.Close Low 0.9983843
## 20 Open Low 0.9986937
## 21 High Low 0.9981437
## 24 Close Low 0.9991182
## 31 Prev.Close Close 0.9979015
## 32 Open Close 0.9981131
## 33 High Close 0.9988055
## 34 Low Close 0.9991182
### Problem: Multicollinearity When independent variables are too strongly correlated with each other, it creates an issue called multicollinearity. As a result of this, the regression model cannot separate the individual effects of each predictor. This makes the model unstable and inflates the standard errors, which can cause important variables to look statistically insignificant.
### Interpretation: In the analysis of the Zee Entertainment Enterprises Ltd (ZEEL) dataset, I found that all the price variables (Prev.Close, Open, High, Low, and Close) are highly correlated with each other, with all correlation values standing above 0.997. Hence, every single one of these price features qualifies as a highly correlated pair. Additionally, Volume is the only independent variable that behaves differently, showing a near-zero correlation (approximately -0.05) with all other variables.
### By considering close as a dependent variable and Prev Close, Open,High, Low, and volume as independent variables. Split the data into 80% training and 20% test sets. Print the sizes of each split.
ZEEL <- filesread$ZEEL
# Here let's define depend and independent variables.
X <- ZEEL[, c("Prev.Close", "Open", "High", "Low", "Volume")]
y <- ZEEL$Close
# Split the data into 80% training and 20% test sets.
set.seed(123)
train_index <- sample(1:nrow(ZEEL), 0.8 * nrow(ZEEL))
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]
# Print the sizes of each split
cat("Training size:", nrow(X_train), "\n")
## Training size: 4244
cat("Test size:", nrow(X_test), "\n")
## Test size: 1062
##Train the regression model and evaluate it’s performance by using 3 performance metrics of your choice (use Zee Entertainment Enterprises Ltd data).
# combine for lm()
ZEEL<-filesread$ZEEL
train_ZEEL<- data.frame(X_train, Close = y_train)
test_ZEEL <- data.frame(X_test, Close = y_test)
# train model
model <- lm(Close ~ ., data = train_ZEEL)
# predict
predictions <- predict(model, newdata = test_ZEEL)
# 3 performance metrics
mae <- mean(abs(y_test - predictions))
rmse <- sqrt(mean((y_test - predictions)^2))
r2 <- 1 - sum((y_test - predictions)^2) / sum((y_test - mean(y_test))^2)
cat("MAE:", mae, "\n")
## MAE: 2.403604
cat("RMSE:", rmse, "\n")
## RMSE: 4.847351
cat("R2:", r2, "\n")
## R2: 0.9992589
##Plot the actual vs predicted closing prices on the test set (scatter plot) Add a 45-degree reference line (perfect prediction line). What does the spread around this line tell you?
outcome <- data.frame(Actual = y_test, Predicted = predictions)
ggplot(outcome, aes(x = Actual, y = Predicted)) +
geom_point(color = "darkblue") +
geom_abline(slope = 1, intercept = 0 ## this gives a 45 degree reference line
, color = "lightpink", linetype = "dotted") +
labs(title = "Actual vs Predicted Closing Price", x = "Actual", y = "Predicted")
##standardised dataset: Standardise the independent variables using StandardScaler so that each variable have mean=0 and std=1. Print the mean and standard deviation of each column after scaling to verify.
# standardise
X_train_scaled <- scale(X_train)
X_test_scaled <- scale(X_test,
center = attr(X_train_scaled, "scaled:center"),
scale = attr(X_train_scaled, "scaled:scale"))
# print mean and std of each column
cat("Mean:\n")
## Mean:
print(round(colMeans(X_train_scaled), 2))
## Prev.Close Open High Low Volume
## 0 0 0 0 0
cat("Standard Deviation:\n")
## Standard Deviation:
print(round(apply(X_train_scaled, 2, sd), 2))
## Prev.Close Open High Low Volume
## 1 1 1 1 1
### Apply PCA to reduce X to 2 components, then train a Linear Regression model on the PCA-transformed training data. Print the R2 on the test set.
pcafile <- prcomp(X_train_scaled, center = FALSE, scale. = FALSE)
Xtrain_pca <- pcafile$x[, 1:2]
Xtest_pca <- predict(pcafile, newdata = X_test_scaled)[, 1:2]
pcafiletrain_df <- data.frame(Xtrain_pca, Close = y_train)
pcafiletest_df <- data.frame(Xtest_pca, Close = y_test)
pcamodel <- lm(Close ~ ., data = pcafiletrain_df)
pca_predictions <- predict(pcamodel, newdata = pcafiletest_df)
# The below is the R2 of pca model
pca_r2 <- 1 - sum((y_test - pca_predictions)^2) / sum((y_test - mean(y_test))^2)
cat("PCA Model R2:", pca_r2, "\n")
## PCA Model R2: -2.38476
### Compare the R2 of the PCA model vs the raw model(Model developed on step 2). Which performs better? Why might this be the case?
cat("Raw Model R2:", r2, "\n")
## Raw Model R2: 0.9992589
cat("PCA Model R2:", pca_r2, "\n")
## PCA Model R2: -2.38476
#Interpretation: Referring on the results, the Raw Model has an R² value of 0.9992589, while the PCA Model has an R² value of the values. The Raw Model performs better because it uses all five variables: Prev.Close, Open, High, Low, and Volume. Putting in action all variables provides more information for prediction, allowing the model to explain about 99.92% of the variation in the closing price.
##The PCA Model performs slightly worse because PCA reduces the five variables into only two principal components. Even though these components capture most of the data variation, some of the information is lost during the reduction process, making the predictions slightly less accurate than the Raw Model.
#– Which company sold the highest volume during 2018? #– Which company sold the least volume during 2021? #– What is the ratio of Zee Entertainment Enterprises Ltd sales to Vedanta Ltd sales during 2021?
## #– Which company sold the highest volume during 2018?
zeel <-filesread$ZEEL
zeel$Date <- as.Date(zeel$Date)
zeel$Year <- format(zeel$Date, "%Y")
# Vedanta data
vedl <-filesread$VEDL
vedl$Date <- as.Date(vedl$Date)
vedl$Year <- format(vedl$Date, "%Y")
##Filter 2018
zeel_2018 <- zeel %>% filter(Year == "2018")
vedl_2018 <- vedl %>% filter(Year == "2018")
# Compute total volume
volume_compare <- data.frame(
Company = c("ZEEL", "Vedanta"),
Volume = c(sum(zeel_2018$Volume, na.rm = TRUE),
sum(vedl_2018$Volume, na.rm = TRUE))
)
volume_compare
## Company Volume
## 1 ZEEL 598775408
## 2 Vedanta 3276296751
ggplot(volume_compare, aes(x = Company, y = Volume, fill = Company)) +
geom_col(position = "dodge") +
scale_fill_manual(values = c(
"ZEEL" = "skyblue",
"Vedanta" = "yellow"
)) +
labs(
title = "2018 Annual Sales Volume Comparison",
x = "Company",
y = "Total Trading Volume"
## The company that sold the highest volume in 2018 is VEDL(VENDATA).
)
## Which company sold the least volume during 2021?
zeel_2021 <- zeel[format(zeel$Date, "%Y") == "2021", ]
vedl_2021 <- vedl[format(vedl$Date, "%Y") == "2021", ]
volume_2021 <- data.frame(
Company = c("ZEEL", "Vedanta"),
Volume = c(sum(zeel_2021$Volume, na.rm = TRUE),
sum(vedl_2021$Volume, na.rm = TRUE))
)
# Print it to make sure it works
volume_2021
## Company Volume
## 1 ZEEL 1302490317
## 2 Vedanta 1430971315
## The company that sold the least volume in 2021 is ZEEL.
ggplot(volume_2021, aes(x = Company, y = Volume, fill = Company)) +
geom_col() +
scale_fill_manual(values = c(
"ZEEL" = "brown",
"Vedanta" = "black"
)) +
labs(
title = "2021 Annual Sales Volume Comparison",
x = "Company",
y = "Total Trading Volume"
)
### What is the ratio of Zee Entertainment Enterprises Ltd sales to Vedanta Ltd sales during 2021?
ratio <- volume_2021$Volume[volume_2021$Company == "ZEEL"] /
volume_2021$Volume[volume_2021$Company == "Vedanta"]
ratio
## [1] 0.9102141