# Install the packages this script needs.
# The original line used typographic quotes and placed three calls on one
# line without separators, which R cannot parse. Only packages that are not
# already installed are requested, so re-running the script is cheap.
required_pkgs <- c(
  "dplyr", "ggplot2", "readr", "lubridate", "corrplot", "caret", "scales",
  "zoo", "packrat", "rsconnect"
)
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(corrplot)
## corrplot 0.95 loaded
library(caret)
## Loading required package: lattice
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Data directory. Defaults to the original location, but can be overridden by
# setting the STOCK_DATA_DIR environment variable so the script can run on
# another machine without editing the code.
data_dir <- Sys.getenv("STOCK_DATA_DIR", unset = NA)
if (is.na(data_dir) || !nzchar(data_dir)) {
  data_dir <- "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/MidTermExam/archive"
}
# Set working directory (kept so that relative paths resolve as before)
setwd(data_dir)
# Load datasets
zee_data <- read.csv(file.path(data_dir, "ZEEL.csv"))
# View zee_data
head(zee_data)
## Date Symbol Series Prev.Close Open High Low Last Close
## 1 2000-01-03 ZEETELE EQ 1092.55 1175.00 1179.95 1160.00 1179.95 1179.95
## 2 2000-01-04 ZEETELE EQ 1179.95 1220.00 1274.35 1183.10 1274.35 1260.65
## 3 2000-01-05 ZEETELE EQ 1260.65 1160.55 1317.70 1159.80 1190.95 1176.55
## 4 2000-01-06 ZEETELE EQ 1176.55 1195.00 1200.00 1095.00 1106.00 1115.45
## 5 2000-01-07 ZEETELE EQ 1115.45 1097.10 1097.10 1026.25 1026.25 1026.25
## 6 2000-01-10 ZEETELE EQ 1026.25 1026.25 1026.25 944.30 962.00 966.70
## VWAP Volume Turnover Trades Deliverable.Volume X.Deliverble
## 1 1177.03 1261391 1.484690e+14 NA NA NA
## 2 1228.02 4616547 5.669220e+14 NA NA NA
## 3 1238.35 8763127 1.085178e+15 NA NA NA
## 4 1135.04 5164020 5.861353e+14 NA NA NA
## 5 1029.94 755129 7.777374e+13 NA NA NA
## 6 980.49 3942813 3.865885e+14 NA NA NA
# Read VEDL.csv into vedanta_data and preview the first six rows
vedanta_data <- read.csv(
  file = "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/MidTermExam/archive/VEDL.csv"
)
head(vedanta_data, n = 6L)
## Date Symbol Series Prev.Close Open High Low Last Close
## 1 2000-01-03 SESAGOA EQ 107.70 111.00 116.35 108.00 116.35 116.35
## 2 2000-01-04 SESAGOA EQ 116.35 113.75 116.00 108.00 116.00 114.70
## 3 2000-01-05 SESAGOA EQ 114.70 107.55 115.85 107.55 114.50 114.00
## 4 2000-01-06 SESAGOA EQ 114.00 112.00 123.10 112.00 118.80 119.30
## 5 2000-01-07 SESAGOA EQ 119.30 119.85 120.00 114.05 116.50 116.50
## 6 2000-01-10 SESAGOA EQ 116.50 120.50 120.85 116.00 119.00 119.00
## VWAP Volume Turnover Trades Deliverable.Volume X.Deliverble
## 1 114.80 20371 233859660000 NA NA NA
## 2 113.34 22366 253499440000 NA NA NA
## 3 112.78 18305 206436075000 NA NA NA
## 4 119.89 25800 309313325000 NA NA NA
## 5 116.84 17361 202840260000 NA NA NA
## 6 118.69 20707 245767815000 NA NA NA
# Read ADANIPORTS.csv into ADANI_PORTS and preview the first six rows
ADANI_PORTS <- read.csv(
  file = "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/MidTermExam/archive/ADANIPORTS.csv"
)
head(ADANI_PORTS, n = 6L)
## Date Symbol Series Prev.Close Open High Low Last Close
## 1 2007-11-27 MUNDRAPORT EQ 440.00 770.00 1050.00 770 959 962.90
## 2 2007-11-28 MUNDRAPORT EQ 962.90 984.00 990.00 874 885 893.90
## 3 2007-11-29 MUNDRAPORT EQ 893.90 909.00 914.75 841 887 884.20
## 4 2007-11-30 MUNDRAPORT EQ 884.20 890.00 958.00 890 929 921.55
## 5 2007-12-03 MUNDRAPORT EQ 921.55 939.75 995.00 922 980 969.30
## 6 2007-12-04 MUNDRAPORT EQ 969.30 985.00 1056.00 976 1049 1041.45
## VWAP Volume Turnover Trades Deliverable.Volume X.Deliverble
## 1 984.72 27294366 2.687719e+15 NA 9859619 0.3612
## 2 941.38 4581338 4.312765e+14 NA 1453278 0.3172
## 3 888.09 5124121 4.550658e+14 NA 1069678 0.2088
## 4 929.17 4609762 4.283257e+14 NA 1260913 0.2735
## 5 965.65 2977470 2.875200e+14 NA 816123 0.2741
## 6 1015.39 4849250 4.923867e+14 NA 1537667 0.3171
# Read ASIANPAINT.csv into ASIAN_PAINT and preview the first six rows
ASIAN_PAINT <- read.csv(
  file = "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/MidTermExam/archive/ASIANPAINT.csv"
)
head(ASIAN_PAINT, n = 6L)
## Date Symbol Series Prev.Close Open High Low Last Close VWAP
## 1 2000-01-03 ASIANPAINT EQ 361.20 370.0 390.0 370.0 385.0 381.65 380.54
## 2 2000-01-04 ASIANPAINT EQ 381.65 380.0 392.0 375.0 390.0 385.55 383.50
## 3 2000-01-05 ASIANPAINT EQ 385.55 371.5 390.0 371.5 383.0 383.00 379.81
## 4 2000-01-06 ASIANPAINT EQ 383.00 384.9 384.9 374.5 375.1 377.50 379.88
## 5 2000-01-07 ASIANPAINT EQ 377.50 376.0 390.0 370.0 389.0 385.70 383.38
## 6 2000-01-10 ASIANPAINT EQ 385.70 415.0 416.6 409.0 416.6 415.00 414.97
## Volume Turnover Trades Deliverable.Volume X.Deliverble
## 1 3318 1.262617e+11 NA NA NA
## 2 4818 1.847699e+11 NA NA NA
## 3 2628 9.981384e+10 NA NA NA
## 4 3354 1.274114e+11 NA NA NA
## 5 9589 3.676275e+11 NA NA NA
## 6 60313 2.502823e+12 NA NA NA
# Exploratory Data Analysis (EDA): create the stock profiling function
#' Print a quick profile of a stock price data frame
#'
#' Prints, in order: the trading date range, the number of missing values per
#' column, the column structure, the first five rows, summary statistics, and
#' the number of IQR-rule outliers in every numeric column.
#'
#' @param data A data frame with a `Date` column that `as.Date()` can parse.
#' @return Invisibly, a named integer vector with the outlier count of each
#'   numeric column (values outside Q1 - 1.5 * IQR .. Q3 + 1.5 * IQR).
profile_stock <- function(data) {
  # Trading date range
  cat("Trading Date Range:\n")
  print(range(as.Date(data$Date)))

  # Missing values
  cat("\nMissing Values:\n")
  print(colSums(is.na(data)))

  # Data types. str() prints by itself and returns NULL, so it must not be
  # wrapped in print(): doing so emitted a stray "NULL" line.
  cat("\nData Types:\n")
  str(data)

  # First 5 rows
  cat("\nTop 5 Rows:\n")
  print(head(data, 5))

  # Summary statistics
  cat("\nSummary Statistics:\n")
  print(summary(data))

  # Outliers using IQR
  numeric_cols <- vapply(data, is.numeric, logical(1))
  outlier_count <- vapply(
    # drop = FALSE keeps a data frame even when only one column is numeric;
    # otherwise the column collapses to a vector and the count is computed
    # per element instead of per column.
    data[, numeric_cols, drop = FALSE],
    function(x) {
      q1 <- quantile(x, 0.25, na.rm = TRUE, names = FALSE)
      q3 <- quantile(x, 0.75, na.rm = TRUE, names = FALSE)
      iqr_value <- q3 - q1
      lower <- q1 - 1.5 * iqr_value
      upper <- q3 + 1.5 * iqr_value
      sum(x < lower | x > upper, na.rm = TRUE)
    },
    integer(1)
  )
  cat("\nNumber of Outliers:\n")
  print(outlier_count)

  invisible(outlier_count)
}
# Print the profile (date range, missing values, structure, first rows,
# summary statistics and IQR outlier counts) for the data held in zee_data
profile_stock(zee_data)
## Trading Date Range:
## [1] "2000-01-03" "2021-04-30"
##
## Missing Values:
## Date Symbol Series Prev.Close
## 0 0 0 0
## Open High Low Last
## 0 0 0 0
## Close VWAP Volume Turnover
## 0 0 0 0
## Trades Deliverable.Volume X.Deliverble
## 2850 519 519
##
## Data Types:
## 'data.frame': 5306 obs. of 15 variables:
## $ Date : chr "2000-01-03" "2000-01-04" "2000-01-05" "2000-01-06" ...
## $ Symbol : chr "ZEETELE" "ZEETELE" "ZEETELE" "ZEETELE" ...
## $ Series : chr "EQ" "EQ" "EQ" "EQ" ...
## $ Prev.Close : num 1093 1180 1261 1177 1115 ...
## $ Open : num 1175 1220 1161 1195 1097 ...
## $ High : num 1180 1274 1318 1200 1097 ...
## $ Low : num 1160 1183 1160 1095 1026 ...
## $ Last : num 1180 1274 1191 1106 1026 ...
## $ Close : num 1180 1261 1177 1115 1026 ...
## $ VWAP : num 1177 1228 1238 1135 1030 ...
## $ Volume : int 1261391 4616547 8763127 5164020 755129 3942813 6802005 2968833 2251046 2949092 ...
## $ Turnover : num 1.48e+14 5.67e+14 1.09e+15 5.86e+14 7.78e+13 ...
## $ Trades : num NA NA NA NA NA NA NA NA NA NA ...
## $ Deliverable.Volume: num NA NA NA NA NA NA NA NA NA NA ...
## $ X.Deliverble : num NA NA NA NA NA NA NA NA NA NA ...
## NULL
##
## Top 5 Rows:
## Date Symbol Series Prev.Close Open High Low Last Close
## 1 2000-01-03 ZEETELE EQ 1092.55 1175.00 1179.95 1160.00 1179.95 1179.95
## 2 2000-01-04 ZEETELE EQ 1179.95 1220.00 1274.35 1183.10 1274.35 1260.65
## 3 2000-01-05 ZEETELE EQ 1260.65 1160.55 1317.70 1159.80 1190.95 1176.55
## 4 2000-01-06 ZEETELE EQ 1176.55 1195.00 1200.00 1095.00 1106.00 1115.45
## 5 2000-01-07 ZEETELE EQ 1115.45 1097.10 1097.10 1026.25 1026.25 1026.25
## VWAP Volume Turnover Trades Deliverable.Volume X.Deliverble
## 1 1177.03 1261391 1.484690e+14 NA NA NA
## 2 1228.02 4616547 5.669220e+14 NA NA NA
## 3 1238.35 8763127 1.085178e+15 NA NA NA
## 4 1135.04 5164020 5.861353e+14 NA NA NA
## 5 1029.94 755129 7.777374e+13 NA NA NA
##
## Summary Statistics:
## Date Symbol Series Prev.Close
## Length :5306 Length :5306 Length :5306 Min. : 62.3
## N.unique :5306 N.unique : 2 N.unique : 1 1st Qu.: 143.2
## N.blank : 0 N.blank : 0 N.blank : 0 Median : 238.2
## Min.nchar: 10 Min.nchar: 4 Min.nchar: 2 Mean : 273.4
## Max.nchar: 10 Max.nchar: 7 Max.nchar: 2 3rd Qu.: 345.6
## Max. :1541.7
##
## Open High Low Last
## Min. : 62 Min. : 66.3 Min. : 60.1 Min. : 62.7
## 1st Qu.: 144 1st Qu.: 146.9 1st Qu.: 140.0 1st Qu.: 143.5
## Median : 238 Median : 244.0 Median : 231.4 Median : 237.7
## Mean : 274 Mean : 279.6 Mean : 267.6 Mean : 273.2
## 3rd Qu.: 346 3rd Qu.: 352.8 3rd Qu.: 338.4 3rd Qu.: 345.1
## Max. :1640 Max. :1645.0 Max. :1512.2 Max. :1564.0
##
## Close VWAP Volume Turnover
## Min. : 62.3 Min. : 63.08 Min. : 4415 Min. :7.021e+10
## 1st Qu.: 143.2 1st Qu.: 143.68 1st Qu.: 1218226 1st Qu.:2.595e+13
## Median : 238.1 Median : 238.90 Median : 2138807 Median :5.250e+13
## Mean : 273.2 Mean : 273.63 Mean : 4825422 Mean :1.249e+14
## 3rd Qu.: 345.6 3rd Qu.: 345.64 3rd Qu.: 4532904 3rd Qu.:1.137e+14
## Max. :1541.7 Max. :1578.11 Max. :165959680 Max. :4.286e+15
##
## Trades Deliverable.Volume X.Deliverble
## Min. : 296 Min. : 4415 Min. :0.0557
## 1st Qu.: 24579 1st Qu.: 513686 1st Qu.:0.3073
## Median : 41074 Median : 893532 Median :0.4635
## Mean : 62646 Mean : 1415718 Mean :0.4522
## 3rd Qu.: 71463 3rd Qu.: 1593444 3rd Qu.:0.5939
## Max. :1088460 Max. :42891428 Max. :1.0000
## NAs :2850 NAs :519 NAs :519
##
## Number of Outliers:
## Prev.Close Open High Low
## 85 83 87 81
## Last Close VWAP Volume
## 83 84 87 672
## Turnover Trades Deliverable.Volume X.Deliverble
## 744 227 382 0
# Parse the Date column (read in as text) into Date objects
zee_data[["Date"]] <- as.Date(zee_data[["Date"]])

# Line chart of traded volume over time
ggplot(data = zee_data, mapping = aes(x = Date, y = Volume)) +
  geom_line(colour = "blue") +
  ggtitle("Trading Volume Trend") +
  xlab("Date") +
  ylab("Volume") +
  theme_minimal()
# Visualization: closing price trend and percentage change

# Percentage change of Close relative to the previous close, stored as a new
# Percent_Change column
zee_data <- dplyr::mutate(
  zee_data,
  Percent_Change = (Close - Prev.Close) / Prev.Close * 100
)

# Closing price over time
p1 <- ggplot(data = zee_data, mapping = aes(x = Date, y = Close)) +
  geom_line(colour = "darkgreen") +
  ggtitle("Closing Price Trend") +
  xlab("Date") +
  ylab("Closing Price") +
  theme_minimal()
print(p1)

# Percentage change over time
p2 <- ggplot(data = zee_data, mapping = aes(x = Date, y = Percent_Change)) +
  geom_line(colour = "red") +
  ggtitle("Percentage Change Trend") +
  xlab("Date") +
  ylab("% Change") +
  theme_minimal()
print(p2)
# Coerce the Date column to Date class before plotting
zee_data[["Date"]] <- as.Date(zee_data[["Date"]])

# Trading volume over time (blue line)
ggplot(data = zee_data, mapping = aes(x = Date, y = Volume)) +
  geom_line(colour = "blue") +
  ggtitle("Trading Volume Trend") +
  xlab("Date") +
  ylab("Volume") +
  theme_minimal()

# Same Volume series, drawn in purple under the "Sales Volume Trend" title
ggplot(data = zee_data, mapping = aes(x = Date, y = Volume)) +
  geom_line(colour = "purple") +
  ggtitle("Sales Volume Trend") +
  xlab("Date") +
  ylab("Volume") +
  theme_minimal()
# Moving average trends
# Trailing (right-aligned) moving averages of Close over 15, 30 and 45 rows.
# zoo::rollmean() centres its window by default, which would average future
# closes into every point; align = "right" uses only the current and
# preceding rows. The first k - 1 values are NA (fill = NA).
zee_data$MA15 <- zoo::rollmean(zee_data$Close, k = 15, fill = NA, align = "right")
zee_data$MA30 <- zoo::rollmean(zee_data$Close, k = 30, fill = NA, align = "right")
zee_data$MA45 <- zoo::rollmean(zee_data$Close, k = 45, fill = NA, align = "right")

# Plot the closing price together with the three moving averages; the colour
# aesthetic is mapped to a label so each line gets a legend entry.
ggplot(zee_data, aes(x = Date)) +
  geom_line(aes(y = Close, color = "Close Price")) +
  geom_line(aes(y = MA15, color = "15 Days MA")) +
  geom_line(aes(y = MA30, color = "30 Days MA")) +
  geom_line(aes(y = MA45, color = "45 Days MA")) +
  labs(
    title = "Moving Average Trend",
    y = "Price"
  ) +
  theme_minimal()
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 29 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Histogram (30 bins) of the Percent_Change column
ggplot(data = zee_data, mapping = aes(x = Percent_Change)) +
  geom_histogram(bins = 30, fill = "orange") +
  ggtitle("Distribution of Percentage Change") +
  xlab("Percentage Change") +
  ylab("Frequency") +
  theme_minimal()
# Correlation analysis
# Keep only the numeric price and volume columns used in the matrix
correlation_data <- zee_data[
  ,
  c("Prev.Close", "Open", "High", "Low", "Volume", "Close")
]
# Pairwise correlations, computed on rows with no missing values
cor_matrix <- cor(x = correlation_data, use = "complete.obs")
print(cor_matrix)
## Prev.Close Open High Low Volume
## Prev.Close 1.00000000 0.99942268 0.99885060 0.99838428 -0.04934908
## Open 0.99942268 1.00000000 0.99913584 0.99869374 -0.04832402
## High 0.99885060 0.99913584 1.00000000 0.99814370 -0.03974465
## Low 0.99838428 0.99869374 0.99814370 1.00000000 -0.05694402
## Volume -0.04934908 -0.04832402 -0.03974465 -0.05694402 1.00000000
## Close 0.99790147 0.99811311 0.99880555 0.99911821 -0.04744036
## Close
## Prev.Close 0.99790147
## Open 0.99811311
## High 0.99880555
## Low 0.99911821
## Volume -0.04744036
## Close 1.00000000
# Correlation plot: colour-coded upper triangle of cor_matrix
corrplot(
  corr = cor_matrix,
  type = "upper",
  method = "color",
  tl.srt = 45,
  tl.col = "black"
)
# Highly correlated variables: the same colour-coded plot is drawn again
corrplot(
  corr = cor_matrix,
  type = "upper",
  method = "color",
  tl.srt = 45,
  tl.col = "black"
)
# Regression analysis
# Modelling frame: the response (Close) plus its predictors, with any rows
# containing missing values removed
regression_data <- na.omit(
  dplyr::select(zee_data, Close, Prev.Close, Open, High, Low, Volume)
)
# Split into training (80%) and testing sets; the seed makes the random
# partition reproducible
# NOTE(review): rows appear to be daily observations, so a random split mixes
# earlier and later days between the two sets - confirm this is intended
set.seed(123)
train_index <- createDataPartition(
  y = regression_data$Close,
  p = 0.8,
  list = FALSE
)
train_data <- regression_data[train_index, ]
test_data <- regression_data[-train_index, ]
cat("Training Set Size:", nrow(train_data), "\n")
## Training Set Size: 4246
cat("Testing Set Size:", nrow(test_data), "\n")
## Testing Set Size: 1060
# Fit a linear regression of Close on the price and volume predictors using
# the training rows only
model <- lm(
  formula = Close ~ Prev.Close + Open + High + Low + Volume,
  data = train_data
)
# Coefficient table, residual summary and fit statistics
summary(model)
##
## Call:
## lm(formula = Close ~ Prev.Close + Open + High + Low + Volume,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89.142 -1.524 -0.303 1.283 67.085
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.610e-01 1.524e-01 3.024 0.00251 **
## Prev.Close -8.569e-02 1.306e-02 -6.560 6.01e-11 ***
## Open -4.596e-01 1.608e-02 -28.577 < 2e-16 ***
## High 7.316e-01 1.059e-02 69.111 < 2e-16 ***
## Low 8.125e-01 8.928e-03 91.005 < 2e-16 ***
## Volume 3.001e-08 9.532e-09 3.148 0.00165 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.015 on 4240 degrees of freedom
## Multiple R-squared: 0.9992, Adjusted R-squared: 0.9992
## F-statistic: 1.071e+06 on 5 and 4240 DF, p-value: < 2.2e-16
# Predict Close for the held-out test rows
predictions <- predict(model, newdata = test_data)
# Evaluate model performance on the test set
# RMSE: root of the mean squared prediction error
rmse <- with(test_data, sqrt(mean((Close - predictions)^2)))
# MAE: mean absolute prediction error
mae <- with(test_data, mean(abs(Close - predictions)))
# R-squared, computed here as the squared correlation of actual vs predicted
r2 <- cor(test_data$Close, predictions)^2
cat("RMSE:", rmse, "\n")
## RMSE: 4.107454
cat("MAE:", mae, "\n")
## MAE: 2.25627
cat("R-squared:", r2, "\n")
## R-squared: 0.9993964
# Actual vs predicted scatter plot for the test set
plot(
  x = test_data$Close,
  y = predictions,
  pch = 16,
  col = "blue",
  xlab = "Actual Close",
  ylab = "Predicted Close",
  main = "Actual vs Predicted Closing Prices"
)
# Reference line y = x: points lying on it are predicted exactly
abline(0, 1, lwd = 2, col = "red")