We first install any missing packages, then load them all.
# List of packages we need
packages_needed <- c("tidyverse", "lubridate", "ggplot2", "scales",
"corrplot", "GGally", "caret", "gridExtra",
"zoo", "ggrepel", "factoextra")
# Install only the ones that are not yet installed
for (pkg in packages_needed) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg)
}
}
# Load all packages
library(tidyverse) # data manipulation and ggplot2
library(lubridate) # working with dates
library(ggplot2) # charts and visualisation
library(scales) # number formatting (comma, percent)
library(corrplot) # correlation heatmap
library(GGally) # pair plots
library(caret) # data pre-processing (scaling)
library(gridExtra) # put two plots side by side / stacked
library(zoo) # rolling (moving) averages
library(ggrepel) # non-overlapping text labels on charts
library(factoextra) # PCA visualisation helpers
Before running: Download the NIFTY-50 dataset from Kaggle and put all the CSV files inside a folder called
nifty50_datathat lives in the same directory as this.Rmdfile.
# ── 1. Read every CSV file inside the nifty50_data folder ─────────────────
# Folder containing all NIFTY-50 CSV files
data_dir <- "nifty50_data"
# Get all CSV files from the folder
csv_files <- list.files(data_dir, pattern = "\\.csv$", full.names = TRUE)
# Read all CSV files into a list
stock_list <- lapply(csv_files, read_csv, show_col_types = FALSE)
# Use file names as stock names
names(stock_list) <- tools::file_path_sans_ext(basename(csv_files))
cat("Total stocks loaded:", length(stock_list), "\n")
## Total stocks loaded: 52
zee <- stock_list[["ZEEL"]]
View(zee)
# ── 2. Pull out the Zee Entertainment (ZEEL) dataset ──────────────────────
zee_key <- grep("ZEE|ZEEL", names(stock_list), ignore.case = TRUE, value = TRUE)[1]
# Find the ZEEL stock file
zee_name <- grep("ZEE|ZEEL", names(stock_list),ignore.case = TRUE, value = TRUE)[1]
cat("ZEEL file:", zee_name, "\n")
## ZEEL file: ZEEL
# Load ZEEL data
zee <- stock_list[[zee_name]]
names(zee) <- make.names(tolower(names(zee)))
# Convert date column and sort data
zee <- zee %>% mutate(date = as.Date(date)) %>% arrange(date)
# Display dataset information
cat("Rows:", nrow(zee), "| Columns:", ncol(zee), "\n")
## Rows: 5306 | Columns: 15
glimpse(zee)
## Rows: 5,306
## Columns: 15
## $ date <date> 2000-01-03, 2000-01-04, 2000-01-05, 2000-01-06, 20…
## $ symbol <chr> "ZEETELE", "ZEETELE", "ZEETELE", "ZEETELE", "ZEETEL…
## $ series <chr> "EQ", "EQ", "EQ", "EQ", "EQ", "EQ", "EQ", "EQ", "EQ…
## $ prev.close <dbl> 1092.55, 1179.95, 1260.65, 1176.55, 1115.45, 1026.2…
## $ open <dbl> 1175.00, 1220.00, 1160.55, 1195.00, 1097.10, 1026.2…
## $ high <dbl> 1179.95, 1274.35, 1317.70, 1200.00, 1097.10, 1026.2…
## $ low <dbl> 1160.00, 1183.10, 1159.80, 1095.00, 1026.25, 944.30…
## $ last <dbl> 1179.95, 1274.35, 1190.95, 1106.00, 1026.25, 962.00…
## $ close <dbl> 1179.95, 1260.65, 1176.55, 1115.45, 1026.25, 966.70…
## $ vwap <dbl> 1177.03, 1228.02, 1238.35, 1135.04, 1029.94, 980.49…
## $ volume <dbl> 1261391, 4616547, 8763127, 5164020, 755129, 3942813…
## $ turnover <dbl> 1.484690e+14, 5.669220e+14, 1.085178e+15, 5.861353e…
## $ trades <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ deliverable.volume <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ X.deliverble <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
View(zee)
We write a function that takes any stock data frame and prints a clean summary.
# ── Define the function ────────────────────────────────────────────────────
stock_profile <- function(df, stock_name = "Stock") {
cat("\n======================================================\n")
cat(" PROFILE:", toupper(stock_name), "\n")
cat("======================================================\n\n")
# 1. Date range
cat("Date range:\n")
cat(" First trading day :", as.character(min(df$date, na.rm = TRUE)), "\n")
cat(" Last trading day :", as.character(max(df$date, na.rm = TRUE)), "\n")
cat(" Total trading days:", nrow(df), "\n\n")
# 2. Missing values per column
cat("Missing values per column:\n")
print(colSums(is.na(df)))
cat("\n")
# 3. Count outliers using the IQR rule for each numeric column
# Outlier = value below Q1 - 1.5*IQR OR above Q3 + 1.5*IQR
cat("Outlier counts (IQR rule) per numeric column:\n")
numeric_cols <- names(df)[sapply(df, is.numeric)]
outlier_counts <- sapply(numeric_cols, function(col) {
x <- df[[col]]
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
sum(x < Q1 - 1.5 * IQR | x > Q3 + 1.5 * IQR, na.rm = TRUE)
})
print(outlier_counts)
cat("\n")
# 4. First 5 rows
cat("Top 5 rows:\n")
print(head(df, 5))
cat("\n")
# 5. Data types
cat("Column types:\n")
print(sapply(df, class))
cat("\n")
# 6. Summary statistics
cat("Summary statistics:\n")
print(summary(df))
cat("\n")
}
# ── Apply to Zee Entertainment ─────────────────────────────────────────────
stock_profile(zee, stock_name = "Zee Entertainment Enterprises Ltd (ZEEL)")
##
## ======================================================
## PROFILE: ZEE ENTERTAINMENT ENTERPRISES LTD (ZEEL)
## ======================================================
##
## Date range:
## First trading day : 2000-01-03
## Last trading day : 2021-04-30
## Total trading days: 5306
##
## Missing values per column:
## date symbol series prev.close
## 0 0 0 0
## open high low last
## 0 0 0 0
## close vwap volume turnover
## 0 0 0 0
## trades deliverable.volume X.deliverble
## 2850 519 519
##
## Outlier counts (IQR rule) per numeric column:
## prev.close open high low
## 85 83 87 81
## last close vwap volume
## 83 84 87 672
## turnover trades deliverable.volume X.deliverble
## 744 227 382 0
##
## Top 5 rows:
## # A tibble: 5 × 15
## date symbol series prev.close open high low last close vwap volume
## <date> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2000-01-03 ZEETE… EQ 1093. 1175 1180. 1160 1180. 1180. 1177. 1.26e6
## 2 2000-01-04 ZEETE… EQ 1180. 1220 1274. 1183. 1274. 1261. 1228. 4.62e6
## 3 2000-01-05 ZEETE… EQ 1261. 1161. 1318. 1160. 1191. 1177. 1238. 8.76e6
## 4 2000-01-06 ZEETE… EQ 1177. 1195 1200 1095 1106 1115. 1135. 5.16e6
## 5 2000-01-07 ZEETE… EQ 1115. 1097. 1097. 1026. 1026. 1026. 1030. 7.55e5
## # ℹ 4 more variables: turnover <dbl>, trades <dbl>, deliverable.volume <dbl>,
## # X.deliverble <dbl>
##
## Column types:
## date symbol series prev.close
## "Date" "character" "character" "numeric"
## open high low last
## "numeric" "numeric" "numeric" "numeric"
## close vwap volume turnover
## "numeric" "numeric" "numeric" "numeric"
## trades deliverable.volume X.deliverble
## "numeric" "numeric" "numeric"
##
## Summary statistics:
## date symbol series prev.close
## Min. :2000-01-03 Length:5306 Length:5306 Min. : 62.3
## 1st Qu.:2005-04-13 Class :character Class :character 1st Qu.: 143.2
## Median :2010-08-17 Mode :character Mode :character Median : 238.2
## Mean :2010-08-18 Mean : 273.4
## 3rd Qu.:2015-12-17 3rd Qu.: 345.6
## Max. :2021-04-30 Max. :1541.7
##
## open high low last
## Min. : 62 Min. : 66.3 Min. : 60.1 Min. : 62.7
## 1st Qu.: 144 1st Qu.: 146.9 1st Qu.: 140.0 1st Qu.: 143.5
## Median : 238 Median : 244.0 Median : 231.4 Median : 237.7
## Mean : 274 Mean : 279.6 Mean : 267.6 Mean : 273.2
## 3rd Qu.: 346 3rd Qu.: 352.8 3rd Qu.: 338.4 3rd Qu.: 345.1
## Max. :1640 Max. :1645.0 Max. :1512.2 Max. :1564.0
##
## close vwap volume turnover
## Min. : 62.3 Min. : 63.08 Min. : 4415 Min. :7.021e+10
## 1st Qu.: 143.2 1st Qu.: 143.68 1st Qu.: 1218226 1st Qu.:2.595e+13
## Median : 238.1 Median : 238.90 Median : 2138807 Median :5.250e+13
## Mean : 273.2 Mean : 273.63 Mean : 4825422 Mean :1.249e+14
## 3rd Qu.: 345.6 3rd Qu.: 345.64 3rd Qu.: 4532904 3rd Qu.:1.137e+14
## Max. :1541.7 Max. :1578.11 Max. :165959680 Max. :4.286e+15
##
## trades deliverable.volume X.deliverble
## Min. : 296 Min. : 4415 Min. :0.0557
## 1st Qu.: 24579 1st Qu.: 513686 1st Qu.:0.3073
## Median : 41074 Median : 893532 Median :0.4635
## Mean : 62646 Mean : 1415718 Mean :0.4522
## 3rd Qu.: 71463 3rd Qu.: 1593444 3rd Qu.:0.5939
## Max. :1088460 Max. :42891428 Max. :1.0000
## NA's :2850 NA's :519 NA's :519
# ── Add a 30-day moving average and a z-score for volume ──────────────────
zee <- zee %>%
mutate(
vol_30day_avg = rollmean(volume, k = 30, fill = NA, align = "right"),
vol_zscore = (volume - mean(volume, na.rm = TRUE)) / sd(volume, na.rm = TRUE)
)
# ── Define 3 important market events ──────────────────────────────────────
key_events <- tibble(
date = as.Date(c("2008-09-15", "2016-11-08", "2020-03-23")),
label = c("Lehman\nCollapse\n2008",
"India\nDemonetisation\n2016",
"COVID-19\nCrash\n2020")
)
# Match each event to the closest actual trading day in our data
key_events <- key_events %>%
rowwise() %>%
mutate(date = zee$date[which.min(abs(zee$date - date))]) %>%
ungroup() %>%
left_join(zee %>% select(date, volume), by = "date")
# ── Draw the chart ─────────────────────────────────────────────────────────
ggplot(zee, aes(x = date, y = volume)) +
# Bar for every day; orange if z-score > 2 (unusual spike), blue otherwise
geom_col(aes(fill = vol_zscore > 2), alpha = 0.65, width = 1) +
# 30-day moving average line
geom_line(aes(y = vol_30day_avg), colour = "red", linewidth = 0.8) +
# Blue dot on the event days
geom_point(data = key_events, aes(x = date, y = volume),
colour = "darkblue", size = 3) +
# Labels that avoid overlapping each other
geom_label_repel(data = key_events,
aes(x = date, y = volume, label = label),
size = 3, fill = "lightyellow", box.padding = 0.6) +
scale_fill_manual(values = c("FALSE" = "steelblue", "TRUE" = "orange"),
labels = c("Normal volume", "Volume spike (z > 2)"),
name = "") +
scale_y_continuous(labels = comma) +
scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
labs(title = "Zee Entertainment — Daily Trading Volume 2000-2021",
subtitle = "Orange = unusual spike | Red line = 30-day average",
x = "Year", y = "Volume (shares)") +
theme_minimal(base_size = 12) +
theme(legend.position = "top")
Key observations:
# ── Calculate daily % change in closing price and in prev close ────────────
zee <- zee %>%
mutate(
pct_close = (close / lag(close) - 1) * 100,
pct_prevclose = (prev.close / lag(prev.close) - 1) * 100
)
# Top chart: the raw closing price line
chart_price <- ggplot(zee, aes(x = date, y = close)) +
geom_line(colour = "steelblue", linewidth = 0.7) +
scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
scale_y_continuous(labels = comma) +
labs(title = "Closing Price Over Time", x = NULL, y = "Price (INR)") +
theme_minimal()
# Bottom chart: green bar for positive day, red for negative
chart_pct <- ggplot(zee %>% filter(!is.na(pct_close)),
aes(x = date, y = pct_close, fill = pct_close > 0)) +
geom_col(width = 1, alpha = 0.8) +
scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
guide = "none") +
scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
labs(title = "Daily % Change in Closing Price",
x = "Year", y = "% Change") +
theme_minimal()
# Stack the two charts vertically
grid.arrange(chart_price, chart_pct, ncol = 1)
ggplot(zee, aes(x = date, y = volume)) +
geom_area(fill = "purple", alpha = 0.35) +
geom_line(colour = "purple", linewidth = 0.5) +
scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
scale_y_continuous(labels = comma) +
labs(title = "Zee Entertainment — Sales Volume 2000-2021",
x = "Year", y = "Volume (shares)") +
theme_minimal()
# Calculate the three moving averages
zee <- zee %>%
mutate(
ma15 = rollmean(close, k = 15, fill = NA, align = "right"),
ma30 = rollmean(close, k = 30, fill = NA, align = "right"),
ma45 = rollmean(close, k = 45, fill = NA, align = "right")
)
# Reshape to long format so ggplot can colour each line separately
zee %>%
select(date, close, ma15, ma30, ma45) %>%
pivot_longer(cols = -date, names_to = "line", values_to = "price") %>%
mutate(line = factor(line,
levels = c("close", "ma15", "ma30", "ma45"),
labels = c("Daily Close", "15-day MA", "30-day MA", "45-day MA"))) %>%
ggplot(aes(x = date, y = price, colour = line)) +
geom_line(alpha = 0.8) +
scale_colour_manual(values = c("grey70", "tomato", "darkgreen", "steelblue")) +
scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
scale_y_continuous(labels = comma) +
labs(title = "Closing Price with 15, 30 and 45-Day Moving Averages",
x = "Year", y = "Price (INR)", colour = NULL) +
theme_minimal() +
theme(legend.position = "top")