1 Step 0 — Install and Load Packages

We first install any missing packages, then load them all.

# List of packages we need
packages_needed <- c("tidyverse", "lubridate", "ggplot2", "scales",
                     "corrplot", "GGally", "caret", "gridExtra",
                     "zoo", "ggrepel", "factoextra")

# Install only the ones that are not yet installed
for (pkg in packages_needed) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Load all packages
library(tidyverse)   # data manipulation and ggplot2
library(lubridate)   # working with dates
library(ggplot2)     # charts and visualisation
library(scales)      # number formatting (comma, percent)
library(corrplot)    # correlation heatmap
library(GGally)      # pair plots
library(caret)       # data pre-processing (scaling)
library(gridExtra)   # put two plots side by side / stacked
library(zoo)         # rolling (moving) averages
library(ggrepel)     # non-overlapping text labels on charts
library(factoextra)  # PCA visualisation helpers

2 Task 4.1 — Load the Data

Before running: Download the NIFTY-50 dataset from Kaggle and put all the CSV files inside a folder called nifty50_data that lives in the same directory as this .Rmd file.

# ── 1. Read every CSV file inside the nifty50_data folder ─────────────────
# Folder containing all NIFTY-50 CSV files
data_dir <- "nifty50_data"

# Get all CSV files from the folder
csv_files <- list.files(data_dir, pattern = "\\.csv$", full.names = TRUE)
# Read all CSV files into a list
stock_list <- lapply(csv_files, read_csv, show_col_types = FALSE)

# Use file names as stock names
names(stock_list) <- tools::file_path_sans_ext(basename(csv_files))
cat("Total stocks loaded:", length(stock_list), "\n")
## Total stocks loaded: 52
zee <- stock_list[["ZEEL"]]
View(zee)
# ── 2. Pull out the Zee Entertainment (ZEEL) dataset ──────────────────────
zee_key <- grep("ZEE|ZEEL", names(stock_list), ignore.case = TRUE, value = TRUE)[1]
# Find the ZEEL stock file
zee_name <- grep("ZEE|ZEEL", names(stock_list),ignore.case = TRUE, value = TRUE)[1]
cat("ZEEL file:", zee_name, "\n")
## ZEEL file: ZEEL
# Load ZEEL data
zee <- stock_list[[zee_name]]
names(zee) <- make.names(tolower(names(zee)))
# Convert date column and sort data
zee <- zee %>% mutate(date = as.Date(date)) %>% arrange(date)
# Display dataset information
cat("Rows:", nrow(zee), "| Columns:", ncol(zee), "\n")
## Rows: 5306 | Columns: 15
glimpse(zee)
## Rows: 5,306
## Columns: 15
## $ date               <date> 2000-01-03, 2000-01-04, 2000-01-05, 2000-01-06, 20…
## $ symbol             <chr> "ZEETELE", "ZEETELE", "ZEETELE", "ZEETELE", "ZEETEL…
## $ series             <chr> "EQ", "EQ", "EQ", "EQ", "EQ", "EQ", "EQ", "EQ", "EQ…
## $ prev.close         <dbl> 1092.55, 1179.95, 1260.65, 1176.55, 1115.45, 1026.2…
## $ open               <dbl> 1175.00, 1220.00, 1160.55, 1195.00, 1097.10, 1026.2…
## $ high               <dbl> 1179.95, 1274.35, 1317.70, 1200.00, 1097.10, 1026.2…
## $ low                <dbl> 1160.00, 1183.10, 1159.80, 1095.00, 1026.25, 944.30…
## $ last               <dbl> 1179.95, 1274.35, 1190.95, 1106.00, 1026.25, 962.00…
## $ close              <dbl> 1179.95, 1260.65, 1176.55, 1115.45, 1026.25, 966.70…
## $ vwap               <dbl> 1177.03, 1228.02, 1238.35, 1135.04, 1029.94, 980.49…
## $ volume             <dbl> 1261391, 4616547, 8763127, 5164020, 755129, 3942813…
## $ turnover           <dbl> 1.484690e+14, 5.669220e+14, 1.085178e+15, 5.861353e…
## $ trades             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ deliverable.volume <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ X.deliverble       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
View(zee)

3 Task 4.2 — Exploratory Data Analysis (EDA)

3.1 4.2.1 — Stock Profile Function

We write a function that takes any stock data frame and prints a clean summary.

# ── Define the function ────────────────────────────────────────────────────
stock_profile <- function(df, stock_name = "Stock") {
  
  cat("\n======================================================\n")
  cat(" PROFILE:", toupper(stock_name), "\n")
  cat("======================================================\n\n")
  
  # 1. Date range
  cat("Date range:\n")
  cat("  First trading day :", as.character(min(df$date, na.rm = TRUE)), "\n")
  cat("  Last  trading day :", as.character(max(df$date, na.rm = TRUE)), "\n")
  cat("  Total trading days:", nrow(df), "\n\n")
  
  # 2. Missing values per column
  cat("Missing values per column:\n")
  print(colSums(is.na(df)))
  cat("\n")
  
  # 3. Count outliers using the IQR rule for each numeric column
  # Outlier = value below Q1 - 1.5*IQR  OR  above Q3 + 1.5*IQR
  cat("Outlier counts (IQR rule) per numeric column:\n")
  numeric_cols <- names(df)[sapply(df, is.numeric)]
  
  outlier_counts <- sapply(numeric_cols, function(col) {
    x   <- df[[col]]
    Q1  <- quantile(x, 0.25, na.rm = TRUE)
    Q3  <- quantile(x, 0.75, na.rm = TRUE)
    IQR <- Q3 - Q1
    sum(x < Q1 - 1.5 * IQR | x > Q3 + 1.5 * IQR, na.rm = TRUE)
  })
  print(outlier_counts)
  cat("\n")
  
  # 4. First 5 rows
  cat("Top 5 rows:\n")
  print(head(df, 5))
  cat("\n")
  
  # 5. Data types
  cat("Column types:\n")
  print(sapply(df, class))
  cat("\n")
  
  # 6. Summary statistics
  cat("Summary statistics:\n")
  print(summary(df))
  cat("\n")
}

# ── Apply to Zee Entertainment ─────────────────────────────────────────────
stock_profile(zee, stock_name = "Zee Entertainment Enterprises Ltd (ZEEL)")
## 
## ======================================================
##  PROFILE: ZEE ENTERTAINMENT ENTERPRISES LTD (ZEEL) 
## ======================================================
## 
## Date range:
##   First trading day : 2000-01-03 
##   Last  trading day : 2021-04-30 
##   Total trading days: 5306 
## 
## Missing values per column:
##               date             symbol             series         prev.close 
##                  0                  0                  0                  0 
##               open               high                low               last 
##                  0                  0                  0                  0 
##              close               vwap             volume           turnover 
##                  0                  0                  0                  0 
##             trades deliverable.volume       X.deliverble 
##               2850                519                519 
## 
## Outlier counts (IQR rule) per numeric column:
##         prev.close               open               high                low 
##                 85                 83                 87                 81 
##               last              close               vwap             volume 
##                 83                 84                 87                672 
##           turnover             trades deliverable.volume       X.deliverble 
##                744                227                382                  0 
## 
## Top 5 rows:
## # A tibble: 5 × 15
##   date       symbol series prev.close  open  high   low  last close  vwap volume
##   <date>     <chr>  <chr>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 2000-01-03 ZEETE… EQ          1093. 1175  1180. 1160  1180. 1180. 1177. 1.26e6
## 2 2000-01-04 ZEETE… EQ          1180. 1220  1274. 1183. 1274. 1261. 1228. 4.62e6
## 3 2000-01-05 ZEETE… EQ          1261. 1161. 1318. 1160. 1191. 1177. 1238. 8.76e6
## 4 2000-01-06 ZEETE… EQ          1177. 1195  1200  1095  1106  1115. 1135. 5.16e6
## 5 2000-01-07 ZEETE… EQ          1115. 1097. 1097. 1026. 1026. 1026. 1030. 7.55e5
## # ℹ 4 more variables: turnover <dbl>, trades <dbl>, deliverable.volume <dbl>,
## #   X.deliverble <dbl>
## 
## Column types:
##               date             symbol             series         prev.close 
##             "Date"        "character"        "character"          "numeric" 
##               open               high                low               last 
##          "numeric"          "numeric"          "numeric"          "numeric" 
##              close               vwap             volume           turnover 
##          "numeric"          "numeric"          "numeric"          "numeric" 
##             trades deliverable.volume       X.deliverble 
##          "numeric"          "numeric"          "numeric" 
## 
## Summary statistics:
##       date               symbol             series            prev.close    
##  Min.   :2000-01-03   Length:5306        Length:5306        Min.   :  62.3  
##  1st Qu.:2005-04-13   Class :character   Class :character   1st Qu.: 143.2  
##  Median :2010-08-17   Mode  :character   Mode  :character   Median : 238.2  
##  Mean   :2010-08-18                                         Mean   : 273.4  
##  3rd Qu.:2015-12-17                                         3rd Qu.: 345.6  
##  Max.   :2021-04-30                                         Max.   :1541.7  
##                                                                             
##       open           high             low              last       
##  Min.   :  62   Min.   :  66.3   Min.   :  60.1   Min.   :  62.7  
##  1st Qu.: 144   1st Qu.: 146.9   1st Qu.: 140.0   1st Qu.: 143.5  
##  Median : 238   Median : 244.0   Median : 231.4   Median : 237.7  
##  Mean   : 274   Mean   : 279.6   Mean   : 267.6   Mean   : 273.2  
##  3rd Qu.: 346   3rd Qu.: 352.8   3rd Qu.: 338.4   3rd Qu.: 345.1  
##  Max.   :1640   Max.   :1645.0   Max.   :1512.2   Max.   :1564.0  
##                                                                   
##      close             vwap             volume             turnover        
##  Min.   :  62.3   Min.   :  63.08   Min.   :     4415   Min.   :7.021e+10  
##  1st Qu.: 143.2   1st Qu.: 143.68   1st Qu.:  1218226   1st Qu.:2.595e+13  
##  Median : 238.1   Median : 238.90   Median :  2138807   Median :5.250e+13  
##  Mean   : 273.2   Mean   : 273.63   Mean   :  4825422   Mean   :1.249e+14  
##  3rd Qu.: 345.6   3rd Qu.: 345.64   3rd Qu.:  4532904   3rd Qu.:1.137e+14  
##  Max.   :1541.7   Max.   :1578.11   Max.   :165959680   Max.   :4.286e+15  
##                                                                            
##      trades        deliverable.volume  X.deliverble   
##  Min.   :    296   Min.   :    4415   Min.   :0.0557  
##  1st Qu.:  24579   1st Qu.:  513686   1st Qu.:0.3073  
##  Median :  41074   Median :  893532   Median :0.4635  
##  Mean   :  62646   Mean   : 1415718   Mean   :0.4522  
##  3rd Qu.:  71463   3rd Qu.: 1593444   3rd Qu.:0.5939  
##  Max.   :1088460   Max.   :42891428   Max.   :1.0000  
##  NA's   :2850      NA's   :519        NA's   :519

4 Task 4.3 — Visualisation with ggplot2

# ── Calculate daily % change in closing price and in prev close ────────────
zee <- zee %>%
  mutate(
    pct_close     = (close     / lag(close)     - 1) * 100,
    pct_prevclose = (prev.close / lag(prev.close) - 1) * 100
  )

4.1 4.3.1 — Closing Price Trend & Daily % Change

# Top chart: the raw closing price line
chart_price <- ggplot(zee, aes(x = date, y = close)) +
  geom_line(colour = "steelblue", linewidth = 0.7) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(labels = comma) +
  labs(title = "Closing Price Over Time", x = NULL, y = "Price (INR)") +
  theme_minimal()

# Bottom chart: green bar for positive day, red for negative
chart_pct <- ggplot(zee %>% filter(!is.na(pct_close)),
                    aes(x = date, y = pct_close, fill = pct_close > 0)) +
  geom_col(width = 1, alpha = 0.8) +
  scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
                    guide  = "none") +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  labs(title = "Daily % Change in Closing Price",
       x = "Year", y = "% Change") +
  theme_minimal()

# Stack the two charts vertically
grid.arrange(chart_price, chart_pct, ncol = 1)

4.2 4.3.2 — Sales Volume Trend

ggplot(zee, aes(x = date, y = volume)) +
  geom_area(fill = "purple", alpha = 0.35) +
  geom_line(colour = "purple", linewidth = 0.5) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(labels = comma) +
  labs(title = "Zee Entertainment — Sales Volume 2000-2021",
       x = "Year", y = "Volume (shares)") +
  theme_minimal()

4.3 4.3.3 — 15, 30 and 45-Day Moving Averages

# Calculate the three moving averages
zee <- zee %>%
  mutate(
    ma15 = rollmean(close, k = 15, fill = NA, align = "right"),
    ma30 = rollmean(close, k = 30, fill = NA, align = "right"),
    ma45 = rollmean(close, k = 45, fill = NA, align = "right")
  )

# Reshape to long format so ggplot can colour each line separately
zee %>%
  select(date, close, ma15, ma30, ma45) %>%
  pivot_longer(cols = -date, names_to = "line", values_to = "price") %>%
  mutate(line = factor(line,
                       levels = c("close", "ma15", "ma30", "ma45"),
                       labels = c("Daily Close", "15-day MA", "30-day MA", "45-day MA"))) %>%
  ggplot(aes(x = date, y = price, colour = line)) +
  geom_line(alpha = 0.8) +
  scale_colour_manual(values = c("grey70", "tomato", "darkgreen", "steelblue")) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(labels = comma) +
  labs(title    = "Closing Price with 15, 30 and 45-Day Moving Averages",
       x = "Year", y = "Price (INR)", colour = NULL) +
  theme_minimal() +
  theme(legend.position = "top")