Research Questions

  1. How strong is the long-term bullish trend of Nifty 50?
  2. How does the moving average compare with actual prices?
  3. What is the relationship between return and risk?
  4. How does volatility change over time?
  5. Is there evidence of volatility clustering?
  6. What is the distribution of returns?
  7. Where do extreme return events occur?
  8. Can the closing price be predicted using Open, High, and Low?
  9. How accurate is the regression model?
  10. How are Open, High, Low, and Close related?
  11. What proportion of days are positive vs negative?
  12. How are returns distributed across categories?
  13. What is the maximum drawdown?
  14. How does a ₹1 investment grow over time?

Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(GGally)
library(tidyr)
# Load the Nifty 50 OHLCV history. The file location can be overridden with the
# NIFTY50_CSV environment variable; when it is unset the original local path is
# used, so existing runs behave as before.
data_path <- Sys.getenv(
  "NIFTY50_CSV",
  unset = "/Users/akxhat879/Downloads/nifty50_25years_ohlcv_1999_2026.csv"
)
data <- read.csv(data_path)

# Parse the date column exactly once. Re-parsing a column that is already a
# Date cannot recover failed values, so report unparsed rows instead.
data$Date <- as.Date(data$Date)
if (anyNA(data$Date)) {
  warning(
    sum(is.na(data$Date)), " row(s) have a Date that could not be parsed",
    call. = FALSE
  )
}

# Sort chronologically so that lag() refers to the previous row in time, then
# derive the simple return between consecutive rows and the calendar year.
# dplyr::lag is spelled out because stats::lag has different semantics.
data <- data %>%
  arrange(Date) %>%
  mutate(
    Return = (Close - dplyr::lag(Close)) / dplyr::lag(Close),
    Year = format(Date, "%Y")
  )
# Q1: Long-Term Trend
# Closing price across the whole sample, drawn as a single line.

ggplot(data, aes(Date, Close)) +
  # `linewidth` replaces the `size` aesthetic for lines (deprecated in
  # ggplot2 3.4.0).
  geom_line(color = "darkblue", linewidth = 1) +
  labs(title = "Nifty 50 Long-Term Trend", y = "Closing Price") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Q2: Moving Average
# Trailing 50-row mean of Close. sides = 1 uses only the current and earlier
# rows, so the first 49 values are NA. stats::filter returns a `ts` object;
# as.numeric() stores a plain numeric column in the data frame instead.
data$MA50 <- as.numeric(stats::filter(data$Close, rep(1 / 50, 50), sides = 1))

# Colours are mapped through a legend so the two series can be told apart.
ggplot(data, aes(Date)) +
  geom_line(aes(y = Close, color = "Close")) +
  geom_line(aes(y = MA50, color = "50-day MA")) +
  scale_color_manual(
    values = c("Close" = "blue", "50-day MA" = "red"),
    name = NULL
  ) +
  labs(y = "Price") +
  theme_minimal()
## Warning: Removed 49 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Q3: Risk vs Return
# `Return` and `Year` already exist on `data` from the preparation step, so
# they are not recomputed here.

# Row-level column: every row carries the standard deviation of the returns of
# its calendar year. Kept on `data` because the Q4 plot reads it.
data$Volatility <- ave(
  data$Return, data$Year,
  FUN = function(x) sd(x, na.rm = TRUE)
)

# Volatility is constant within a year, so the risk/return comparison is made
# at the yearly level: one point per year (mean return vs. sd of returns)
# rather than pairing each daily return with a per-year constant.
risk_return <- data %>%
  select(Year, Return) %>%
  dplyr::filter(is.finite(Return)) %>%
  group_by(Year) %>%
  summarise(
    Mean_Return = mean(Return),
    Volatility = sd(Return),
    .groups = "drop"
  )

ggplot(risk_return, aes(Volatility, Mean_Return)) +
  geom_point(color = "purple") +
  geom_smooth(method = "lm", color = "orange") +
  labs(
    x = "Yearly volatility (sd of daily returns)",
    y = "Mean daily return"
  ) +
  theme_light()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Q4: Volatility Over Time
# Line chart of the per-year volatility column against the date axis.
ggplot(data = data, mapping = aes(x = Date, y = Volatility)) +
  geom_line(colour = "darkgreen") +
  theme_minimal()

# Q5: Volatility Clustering
# Magnitude of each return through time; the abs() call stays inside aes() so
# the axis label is unchanged.
ggplot(data = data, mapping = aes(x = Date, y = abs(Return))) +
  geom_line(colour = "brown") +
  theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# Q6: Return Distribution
# Kernel density estimate of the return series, drawn as a filled curve.
ggplot(data = data, mapping = aes(x = Return)) +
  geom_density(alpha = 0.6, fill = "skyblue") +
  theme_minimal()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

# Q7: Extreme Events
# Return series in grey, with every row whose absolute return exceeds 5%
# overlaid as a red point. which() discards rows where the comparison is NA,
# matching what subset() does.
ggplot(data = data, mapping = aes(x = Date, y = Return)) +
  geom_line(colour = "grey") +
  geom_point(
    data = data[which(abs(data$Return) > 0.05), ],
    colour = "red"
  ) +
  theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# Q8: Regression Model
# Ordinary least-squares fit of Close on the Open, High and Low values of the
# same row. The fitted object is kept in `model` for the prediction step.
# NOTE(review): High and Low come from the same row as Close, so this looks
# like a description of the within-row relationship rather than a forecast of
# a future close - confirm the intended interpretation.
model <- lm(Close ~ Open + High + Low, data=data)
# Print the coefficient table, residual quantiles and overall fit statistics.
summary(model)
## 
## Call:
## lm(formula = Close ~ Open + High + Low, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -310.72  -11.95    1.02   12.47  609.67 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.930330   0.721102  -4.064 4.89e-05 ***
## Open        -0.632574   0.008981 -70.434  < 2e-16 ***
## High         0.905325   0.008120 111.499  < 2e-16 ***
## Low          0.727027   0.006584 110.420  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.51 on 6282 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 8.384e+07 on 3 and 6282 DF,  p-value: < 2.2e-16
# Q9: Prediction Accuracy
# Predictions from the fitted model for the same rows it was estimated on,
# stored next to the observed values.
data[["Pred"]] <- predict(model, newdata = data)

# Observed vs predicted close; the red 45-degree line marks perfect agreement.
ggplot(data = data, mapping = aes(x = Close, y = Pred)) +
  geom_point(colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  theme_light()

# Q10: Pair Plot
# Scatterplot matrix of the four price columns, in this column order.
ggpairs(data[, c("Open", "High", "Low", "Close")])

# Q11: Positive vs Negative Returns
# Count rows by the sign of the return. A return of exactly zero falls in
# "Negative" because the test is `Return > 0`. The first row has no previous
# close, so its NA category is dropped instead of becoming a pie slice.
pie_data1 <- data %>%
  mutate(
    Return = (Close - dplyr::lag(Close)) / dplyr::lag(Close),
    Category = ifelse(Return > 0, "Positive", "Negative")
  ) %>%
  count(Category) %>%
  dplyr::filter(!is.na(Category))

# Colours are named explicitly: unnamed values are assigned in alphabetical
# level order, which would paint "Negative" green and "Positive" red.
ggplot(pie_data1, aes("", n, fill = Category)) +
  geom_bar(stat = "identity") +
  coord_polar("y") +
  scale_fill_manual(values = c("Positive" = "green", "Negative" = "red")) +
  theme_void()

# Q12: Return Categories
# Bucket each return: above +2% is "High", below -2% is "Low", everything in
# between is "Medium". The lower bound must be written `Return < -0.02`;
# `Return <- 0.02` is an assignment and always evaluates as true, which left
# the "Medium" bucket empty. The NA category of the first row is dropped.
pie_data2 <- data %>%
  mutate(
    Return = (Close - dplyr::lag(Close)) / dplyr::lag(Close),
    Category = ifelse(
      Return > 0.02, "High",
      ifelse(Return < -0.02, "Low", "Medium")
    )
  ) %>%
  count(Category) %>%
  dplyr::filter(!is.na(Category))

# Named colours keep the mapping independent of alphabetical level order;
# `breaks` fixes the legend order from best to worst.
ggplot(pie_data2, aes("", n, fill = Category)) +
  geom_bar(stat = "identity") +
  coord_polar("y") +
  scale_fill_manual(
    values = c("High" = "green", "Medium" = "yellow", "Low" = "red"),
    breaks = c("High", "Medium", "Low")
  ) +
  theme_void()

# Q13: Maximum Drawdown
# Running peak of the close, then the fractional distance of each close below
# that peak (0 at a new high, negative otherwise).
data[["Cum_Max"]] <- cummax(data[["Close"]])
data[["Drawdown"]] <- with(data, (Close - Cum_Max) / Cum_Max)

ggplot(data = data, mapping = aes(x = Date, y = Drawdown)) +
  geom_line(colour = "red") +
  theme_minimal()

# Q14: Growth of ₹1 Investment

# Simple return between consecutive rows. dplyr::lag is named explicitly: a
# bare lag() only shifts the vector while dplyr masks stats::lag, and
# stats::lag would leave the values in place and make every return zero.
data$Return <- (data$Close - dplyr::lag(data$Close)) / dplyr::lag(data$Close)
# Missing returns (including the first row, which has no previous close) are
# treated as 0 so they leave the running product unchanged.
data$Return[is.na(data$Return)] <- 0

# Compounded value of one unit invested at the first row.
data$Cum_Return <- cumprod(1 + data$Return)

ggplot(data, aes(Date, Cum_Return)) +
  # `linewidth` replaces the `size` aesthetic for lines (deprecated in
  # ggplot2 3.4.0).
  geom_line(color = "darkgreen", linewidth = 1) +
  labs(title = "Growth of ₹1 Investment", y = "Value") +
  theme_minimal()