getwd()
## [1] "/Users/macbook/Desktop/AUCA"
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

4.Task description

4.1 Downloading data and load them to R studio

###click here to download NIFTY-50 Stock Market Data (2000 - 2021). Load all the datasets to your R Stidio.

filestogether<-list.files(path = "/Users/macbook/Desktop/AUCA/archive",pattern = "*.csv",full.names = TRUE)

filesread <-lapply(filestogether, read.csv)
names(filesread) <- tools::file_path_sans_ext(basename(filestogether))
names(filesread)
##  [1] "ADANIPORTS" "ASIANPAINT" "AXISBANK"   "BAJAJ-AUTO" "BAJAJFINSV"
##  [6] "BAJFINANCE" "BHARTIARTL" "BPCL"       "BRITANNIA"  "CIPLA"     
## [11] "COALINDIA"  "DRREDDY"    "EICHERMOT"  "GAIL"       "GRASIM"    
## [16] "HCLTECH"    "HDFC"       "HDFCBANK"   "HEROMOTOCO" "HINDALCO"  
## [21] "HINDUNILVR" "ICICIBANK"  "INDUSINDBK" "INFRATEL"   "INFY"      
## [26] "IOC"        "ITC"        "JSWSTEEL"   "KOTAKBANK"  "LT"        
## [31] "MARUTI"     "MM"         "NESTLEIND"  "NTPC"       "ONGC"      
## [36] "POWERGRID"  "RELIANCE"   "SBIN"       "SHREECEM"   "SUNPHARMA" 
## [41] "TATAMOTORS" "TATASTEEL"  "TCS"        "TECHM"      "TITAN"     
## [46] "ULTRACEMCO" "UPL"        "VEDL"       "WIPRO"      "ZEEL"
view(filesread$TCS)

4.2 Exploratory data analysis.

• Create a function to profile each stock by displaying the following information: trading date range, missing values, number of outliers, top 5 rows, data types, and summary statistics. Then, apply the function to the Zee Entertainment Enterprises Ltd dataset and display the results.

functionmid <- function(dset){

  cat("Trading Date Range:\n")
  cat(min(dset$Date), "to", max(dset$Date), "\n\n")
  
  cat("Missing values:\n")
  print(colSums(is.na(dset)))
  
cat("\nNumber of Outliers:\n")
  Q1 <- quantile(dset$Close, 0.25)
  Q3 <- quantile(dset$Close, 0.75)
  outliers <- sum(dset$Close < Q1 - 1.5*(Q3-Q1) | dset$Close > Q3 + 1.5*(Q3-Q1))
  cat("Close ->", outliers, "outliers\n")
  
  
cat("top 5 rows:\n")
dset%>%
  head(5)

cat("data types:\n")
dset%>%
  str()

cat("summary statistics:\n")
dset%>%
  summary()

}

functionmid(filesread$ZEEL)
## Trading Date Range:
## 2000-01-03 to 2021-04-30 
## 
## Missing values:
##               Date             Symbol             Series         Prev.Close 
##                  0                  0                  0                  0 
##               Open               High                Low               Last 
##                  0                  0                  0                  0 
##              Close               VWAP             Volume           Turnover 
##                  0                  0                  0                  0 
##             Trades Deliverable.Volume       X.Deliverble 
##               2850                519                519 
## 
## Number of Outliers:
## Close -> 84 outliers
## top 5 rows:
## data types:
## 'data.frame':    5306 obs. of  15 variables:
##  $ Date              : chr  "2000-01-03" "2000-01-04" "2000-01-05" "2000-01-06" ...
##  $ Symbol            : chr  "ZEETELE" "ZEETELE" "ZEETELE" "ZEETELE" ...
##  $ Series            : chr  "EQ" "EQ" "EQ" "EQ" ...
##  $ Prev.Close        : num  1093 1180 1261 1177 1115 ...
##  $ Open              : num  1175 1220 1161 1195 1097 ...
##  $ High              : num  1180 1274 1318 1200 1097 ...
##  $ Low               : num  1160 1183 1160 1095 1026 ...
##  $ Last              : num  1180 1274 1191 1106 1026 ...
##  $ Close             : num  1180 1261 1177 1115 1026 ...
##  $ VWAP              : num  1177 1228 1238 1135 1030 ...
##  $ Volume            : int  1261391 4616547 8763127 5164020 755129 3942813 6802005 2968833 2251046 2949092 ...
##  $ Turnover          : num  1.48e+14 5.67e+14 1.09e+15 5.86e+14 7.78e+13 ...
##  $ Trades            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Deliverable.Volume: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ X.Deliverble      : num  NA NA NA NA NA NA NA NA NA NA ...
## summary statistics:
##         Date            Symbol           Series       Prev.Close    
##  Length   :5306   Length   :5306   Length   :5306   Min.   :  62.3  
##  N.unique :5306   N.unique :   2   N.unique :   1   1st Qu.: 143.2  
##  N.blank  :   0   N.blank  :   0   N.blank  :   0   Median : 238.2  
##  Min.nchar:  10   Min.nchar:   4   Min.nchar:   2   Mean   : 273.4  
##  Max.nchar:  10   Max.nchar:   7   Max.nchar:   2   3rd Qu.: 345.6  
##                                                     Max.   :1541.7  
##                                                                     
##       Open           High             Low              Last       
##  Min.   :  62   Min.   :  66.3   Min.   :  60.1   Min.   :  62.7  
##  1st Qu.: 144   1st Qu.: 146.9   1st Qu.: 140.0   1st Qu.: 143.5  
##  Median : 238   Median : 244.0   Median : 231.4   Median : 237.7  
##  Mean   : 274   Mean   : 279.6   Mean   : 267.6   Mean   : 273.2  
##  3rd Qu.: 346   3rd Qu.: 352.8   3rd Qu.: 338.4   3rd Qu.: 345.1  
##  Max.   :1640   Max.   :1645.0   Max.   :1512.2   Max.   :1564.0  
##                                                                   
##      Close             VWAP             Volume             Turnover        
##  Min.   :  62.3   Min.   :  63.08   Min.   :     4415   Min.   :7.021e+10  
##  1st Qu.: 143.2   1st Qu.: 143.68   1st Qu.:  1218226   1st Qu.:2.595e+13  
##  Median : 238.1   Median : 238.90   Median :  2138807   Median :5.250e+13  
##  Mean   : 273.2   Mean   : 273.63   Mean   :  4825422   Mean   :1.249e+14  
##  3rd Qu.: 345.6   3rd Qu.: 345.64   3rd Qu.:  4532904   3rd Qu.:1.137e+14  
##  Max.   :1541.7   Max.   :1578.11   Max.   :165959680   Max.   :4.286e+15  
##                                                                            
##      Trades        Deliverable.Volume  X.Deliverble   
##  Min.   :    296   Min.   :    4415   Min.   :0.0557  
##  1st Qu.:  24579   1st Qu.:  513686   1st Qu.:0.3073  
##  Median :  41074   Median :  893532   Median :0.4635  
##  Mean   :  62646   Mean   : 1415718   Mean   :0.4522  
##  3rd Qu.:  71462   3rd Qu.: 1593444   3rd Qu.:0.5939  
##  Max.   :1088460   Max.   :42891428   Max.   :1.0000  
##  NAs    :2850      NAs    :519        NAs    :519

###• Analyze trading volume trends and identify unusual volume spikes. Identify at least 3 significant market events visible in the data (e.g., COVID-19,…) and annotate them on chart.

### First step ,let's pick one random  (TCS)stock.
TCS<- filesread$TCS
TCS$Date <- as.Date(TCS$Date)

### Second step,let's find the spike beginning
beginning <- mean(TCS$Volume) + 2 * sd(TCS$Volume)

# Third step,Let's plot this using ggplot2
ggplot(TCS, aes(x = Date, y = Volume)) +
  geom_line(color = "steelblue") +

  #Fourth step, let's put a " Beginning line "
  geom_hline(yintercept = beginning, color = "orange", linetype = "dotted") +

  #Firth step, An event lines.
  geom_vline(xintercept = as.Date("2020-03-23"), color = "red", linetype = "dotdash") +
  geom_vline(xintercept = as.Date("2021-04-01"), color = "blue", linetype ="dotdash") +
  geom_vline(xintercept = as.Date("2022-02-24"), color = "black", linetype = "dotdash") +

  #  Sixth step, event labels
  annotate("text", x = as.Date("2020-03-23"), y = max(TCS$Volume), label = "COVID Crash", color = "red", size = 3) +
  annotate("text", x = as.Date("2021-04-01"), y = max(TCS$Volume)*0.9, label = "2nd Wave", color = "blue", size = 3) +
  annotate("text", x = as.Date("2022-02-24"), y = max(TCS$Volume)*0.8, label = "Russia-Ukraine", color = "black", size = 3) +

  labs(title = "TCS Trading Volume Over Time", x = "Date", y = "Volume")

4.3 Visualization

With ggplot2, use Zee Entertainment Enterprises Ltd to come up with

###the visualization: ###– Closing price trend & Corresponding percentage change ###– Sales volume trend ###– 15,30 and 45 days moving average trend for the closing price. ###– Use a histogram to represent the distribution of the percentage change for the Prev Close.

### Closing price trend & Corresponding percentage change

ZEEL <- filesread$ZEEL
ZEEL$Date <- as.Date(ZEEL$Date)

ZEEL$ma15 <-rollmean(ZEEL$Close, k = 15, fill = NA, align = "right") 
ZEEL$ma30 <- rollmean(ZEEL$Close, k = 30, fill = NA, align = "right") 
ZEEL$ma45 <- rollmean(ZEEL$Close, k = 45, fill = NA, align = "right")

ggplot(ZEEL, aes(x = Date)) +
  geom_line(aes(y = Close, color = "Actual Close"), na.rm = TRUE) +
  geom_line(aes(y = ma15, color = "15-Day MA"), na.rm = TRUE) +
  geom_line(aes(y = ma30, color = "30-Day MA"), na.rm = TRUE) +
  geom_line(aes(y = ma45, color = "45-Day MA"), na.rm = TRUE) +
  scale_color_manual(values = c("Actual Close" = "black", 
                                "15-Day MA" = "darkgreen", 
                                "30-Day MA" = "blue", 
                                "45-Day MA" = "red")) +
  labs(title = "ZEEL Moving Averages 15, 30 and 45 days", 
       x = "Date", 
       y = "Price",
       color = "Legend")

### Sales volume trend.

ggplot(ZEEL, aes(x = Date, y = Volume)) +
  geom_line(color = "blue", na.rm = TRUE) +
  labs(title = "ZEEL Sales Volume Trend", x = "Date", y = "Volume")

### 15,30 and 45 days moving average trend for the closing price

ggplot(ZEEL, aes(x = Date)) +
  geom_line(aes(y = Close), color = "white", na.rm = TRUE) +
  geom_line(aes(y = ma15), color = "darkgreen", na.rm = TRUE) +
  geom_line(aes(y = ma30), color = "blue", na.rm = TRUE) +
  geom_line(aes(y = ma45), color = "red", na.rm = TRUE) +
  labs(title = "ZEEL Moving Averages 15, 30 and 45 days", x = "Date", y = "Price")

###Use a histogram to represent the distribution of the percentage change for the Prev Close

ZEEL$prevclose.pct <- (ZEEL$Close - ZEEL$Prev.Close) / ZEEL$Prev.Close * 100

ggplot(ZEEL, aes(x = prevclose.pct)) +
  geom_histogram(bins = 30, fill = "brown", color = "yellow", na.rm = TRUE) +
  labs(title = "Prev Close % Change Distribution", x = "% Change", y = "Count")

4.4 Correlation Analysis.

###Using visualization and statistical analysis of numerical values, determine the pattern of relationships between Prev Close, Open, High, Low, volume and close by using By using Zee Entertainment Enterprises Ltd dataset. Identify any highly correlated pairs (|r| > 0.95) and explain what problem this could cause in your regression model.Interpret your findings.

ZEEL <- filesread$ZEEL

num_cols <- ZEEL[, c("Prev.Close", "Open", "High", "Low", "Volume", "Close")]
cor_df <- as.data.frame(as.table(cor(num_cols, use = "complete.obs")))

ggplot(cor_df, aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = round(Freq, 2)), size = 3) +
  scale_fill_gradient2(low = "white", mid = "skyblue", high = "lightpink", midpoint = 0.5) +
  labs(title = "ZEEL Correlation Matrix", x = "", y = "")

## Find pairs above 0.95
pairmatrix <- cor(ZEEL[, c("Prev.Close", "Open", "High", "Low", "Volume", "Close")], use = "complete.obs")

# Printing pairs above 0.95
pairdf <- as.data.frame(as.table(pairmatrix))
highpair <- subset(cor_df, abs(Freq) > 0.95 & Var1 != Var2)
print(highpair)
##          Var1       Var2      Freq
## 2        Open Prev.Close 0.9994227
## 3        High Prev.Close 0.9988506
## 4         Low Prev.Close 0.9983843
## 6       Close Prev.Close 0.9979015
## 7  Prev.Close       Open 0.9994227
## 9        High       Open 0.9991358
## 10        Low       Open 0.9986937
## 12      Close       Open 0.9981131
## 13 Prev.Close       High 0.9988506
## 14       Open       High 0.9991358
## 16        Low       High 0.9981437
## 18      Close       High 0.9988055
## 19 Prev.Close        Low 0.9983843
## 20       Open        Low 0.9986937
## 21       High        Low 0.9981437
## 24      Close        Low 0.9991182
## 31 Prev.Close      Close 0.9979015
## 32       Open      Close 0.9981131
## 33       High      Close 0.9988055
## 34        Low      Close 0.9991182
### Problem: Multicollinearity When independent variables are too strongly correlated with each other, it creates an issue called multicollinearity. As a result of this, the regression model cannot separate the individual effects of each predictor. This makes the model unstable and inflates the standard errors, which can cause important variables to look statistically insignificant.

### Interpretation: In the analysis of the Zee Entertainment Enterprises Ltd (ZEEL) dataset, I found that all the price variables (Prev.Close, Open, High, Low, and Close) are highly correlated with each other, with all correlation values standing above 0.997. Hence, every single one of these price features qualifies as a highly correlated pair. Additionally, Volume is the only independent variable that behaves differently, showing a near-zero correlation (approximately -0.05) with all other variables.

4.5 Regression Analysis

### By considering close as a dependent variable and Prev Close, Open,High, Low, and volume as independent variables. Split the data into 80% training and 20% test sets. Print the sizes of each split.

ZEEL <- filesread$ZEEL

# Here let's define depend and independent variables.
X <- ZEEL[, c("Prev.Close", "Open", "High", "Low", "Volume")]
y <- ZEEL$Close

# Split the data into 80% training  and 20% test sets.
set.seed(123)
train_index <- sample(1:nrow(ZEEL), 0.8 * nrow(ZEEL))

X_train <- X[train_index, ]
X_test  <- X[-train_index, ]
y_train <- y[train_index]
y_test  <- y[-train_index]

# Print the sizes of each split

cat("Training size:", nrow(X_train), "\n")
## Training size: 4244
cat("Test size:", nrow(X_test), "\n")
## Test size: 1062
##Train the regression model and evaluate it’s performance by using 3 performance metrics of your choice (use Zee Entertainment Enterprises Ltd data).

# combine for lm()

ZEEL<-filesread$ZEEL
train_ZEEL<- data.frame(X_train, Close = y_train)
test_ZEEL <- data.frame(X_test, Close = y_test)

# train model
model <- lm(Close ~ ., data = train_ZEEL)

# predict
predictions <- predict(model, newdata = test_ZEEL)

# 3 performance metrics
mae  <- mean(abs(y_test - predictions))
rmse <- sqrt(mean((y_test - predictions)^2))
r2   <- 1 - sum((y_test - predictions)^2) / sum((y_test - mean(y_test))^2)

cat("MAE:", mae, "\n")
## MAE: 2.403604
cat("RMSE:", rmse, "\n")
## RMSE: 4.847351
cat("R2:", r2, "\n")
## R2: 0.9992589
##Plot the actual vs predicted closing prices on the test set (scatter plot) Add a 45-degree reference line (perfect prediction line). What does the spread around this line tell you?

outcome <- data.frame(Actual = y_test, Predicted = predictions)

ggplot(outcome, aes(x = Actual, y = Predicted)) +
  geom_point(color = "darkblue") +
  geom_abline(slope = 1, intercept = 0 ## this gives a 45 degree reference line
              , color = "lightpink", linetype = "dotted") +
  labs(title = "Actual vs Predicted Closing Price", x = "Actual", y = "Predicted")

##standardised dataset: Standardise the independent variables using StandardScaler so that each variable have mean=0 and std=1. Print the mean and standard deviation of each column after scaling to verify.

# standardise
X_train_scaled <- scale(X_train)
X_test_scaled  <- scale(X_test,
                        center = attr(X_train_scaled, "scaled:center"),
                        scale  = attr(X_train_scaled, "scaled:scale"))

# print mean and std of each column
cat("Mean:\n")
## Mean:
print(round(colMeans(X_train_scaled), 2))
## Prev.Close       Open       High        Low     Volume 
##          0          0          0          0          0
cat("Standard Deviation:\n")
## Standard Deviation:
print(round(apply(X_train_scaled, 2, sd), 2))
## Prev.Close       Open       High        Low     Volume 
##          1          1          1          1          1
### Apply PCA to reduce X to 2 components, then train a Linear Regression model on the PCA-transformed training data. Print the R2 on the test set.

pcafile <- prcomp(X_train_scaled, center = FALSE, scale. = FALSE)

Xtrain_pca <- pcafile$x[, 1:2]
Xtest_pca  <- predict(pcafile, newdata = X_test_scaled)[, 1:2]

pcafiletrain_df <- data.frame(Xtrain_pca, Close = y_train)
pcafiletest_df  <- data.frame(Xtest_pca,  Close = y_test)

pcamodel <- lm(Close ~ ., data = pcafiletrain_df)
pca_predictions <- predict(pcamodel, newdata = pcafiletest_df)

# The below is the R2 of pca model

pca_r2 <- 1 - sum((y_test - pca_predictions)^2) / sum((y_test - mean(y_test))^2)
cat("PCA Model R2:", pca_r2, "\n")
## PCA Model R2: -2.38476
### Compare the R2 of the PCA model vs the raw model(Model developed on step 2). Which performs better? Why might this be the case?

cat("Raw Model R2:", r2, "\n")
## Raw Model R2: 0.9992589
cat("PCA Model R2:", pca_r2, "\n")
## PCA Model R2: -2.38476
#Interpretation: Referring on the results, the Raw Model has an R² value of 0.9992589, while the PCA Model has an R² value of the values. The Raw Model performs better because it uses all five variables: Prev.Close, Open, High, Low, and Volume. Putting in action all variables provides more information for prediction, allowing the model to explain about 99.92% of the variation in the closing price.

##The PCA Model performs slightly worse because PCA reduces the five variables into only two principal components. Even though these components capture most of the data variation, some of the  information is lost during the reduction process, making the predictions slightly less accurate than the Raw Model.

4.6 Sales volume comparisons

After computing the annual sales volume (total) of each of Zee Entertainment Enterprises Ltd and Vedanta Ltd., use a comparative bar chart (clustered bar chart) to represent that information and answer the following questions:

#– Which company sold the highest volume during 2018? #– Which company sold the least volume during 2021? #– What is the ratio of Zee Entertainment Enterprises Ltd sales to Vedanta Ltd sales during 2021?

## #– Which company sold the highest volume during 2018?
zeel <-filesread$ZEEL
zeel$Date <- as.Date(zeel$Date)
zeel$Year <- format(zeel$Date, "%Y")

# Vedanta data
vedl <-filesread$VEDL
vedl$Date <- as.Date(vedl$Date)
vedl$Year <- format(vedl$Date, "%Y")
##Filter 2018

zeel_2018 <- zeel %>% filter(Year == "2018")
vedl_2018 <- vedl %>% filter(Year == "2018")

# Compute total volume
volume_compare <- data.frame(
  Company = c("ZEEL", "Vedanta"),
  Volume = c(sum(zeel_2018$Volume, na.rm = TRUE),
             sum(vedl_2018$Volume, na.rm = TRUE))
)

volume_compare
##   Company     Volume
## 1    ZEEL  598775408
## 2 Vedanta 3276296751
ggplot(volume_compare, aes(x = Company, y = Volume, fill = Company)) +
  geom_col(position = "dodge") +
  
  scale_fill_manual(values = c(
    "ZEEL" = "skyblue",
    "Vedanta" = "yellow"
  )) +
  
  labs(
    title = "2018 Annual Sales Volume Comparison",
    x = "Company",
    y = "Total Trading Volume"
    
    ## The company that sold the highest volume in 2018 is VEDL(VENDATA).
  )

## Which company sold the least volume during 2021?



zeel_2021 <- zeel[format(zeel$Date, "%Y") == "2021", ]
vedl_2021 <- vedl[format(vedl$Date, "%Y") == "2021", ]


volume_2021 <- data.frame(
  Company = c("ZEEL", "Vedanta"),
  Volume = c(sum(zeel_2021$Volume, na.rm = TRUE),
             sum(vedl_2021$Volume, na.rm = TRUE))
)

# Print it to make sure it works
volume_2021
##   Company     Volume
## 1    ZEEL 1302490317
## 2 Vedanta 1430971315
## The company that sold the least volume in 2021 is ZEEL.
ggplot(volume_2021, aes(x = Company, y = Volume, fill = Company)) +
  geom_col() +
  
  scale_fill_manual(values = c(
    "ZEEL" = "brown",
    "Vedanta" = "black"
  )) +
  
  labs(
    title = "2021 Annual Sales Volume Comparison",
    x = "Company",
    y = "Total Trading Volume"
  )

### What is the ratio of Zee Entertainment Enterprises Ltd sales to Vedanta Ltd sales during 2021?

ratio <- volume_2021$Volume[volume_2021$Company == "ZEEL"] /
         volume_2021$Volume[volume_2021$Company == "Vedanta"]

ratio
## [1] 0.9102141