This report presents a comprehensive analysis of the World Stock Prices Dataset, conducted by Mohit Bharti (Roll No: 60) from Group 2. The dataset contains detailed stock market information including opening prices, closing prices, high/low values, trading volumes, and company metadata across various industries and countries.
The analysis employs various statistical and machine learning techniques including KNN, K-means clustering, Apriori algorithm, and predictive modeling to extract meaningful insights from the stock market data. The visualizations are designed to be clear, interpretable, and focused on specific aspects of the dataset.
The World Stock Prices Dataset contains: - Date: Trading dates from June 2025 to July 2025 - Price Data: Open, High, Low, Close prices - Volume: Trading volume information - Company Details: Brand names, tickers, industries, countries - Financial Metrics: Dividends, stock splits, capital gains
# Load the dataset
stock_data <- read.csv("C:/Users/sudha/Downloads/Mohit/archive/World-Stock-Prices-Dataset.csv",
stringsAsFactors = FALSE)
# Display basic information about the dataset
cat("Dataset Dimensions:", dim(stock_data), "\n")## Dataset Dimensions: 310122 13
## Column Names: Date Open High Low Close Volume Brand_Name Ticker Industry_Tag Country Dividends Stock.Splits Capital.Gains
# Convert Date to proper format
stock_data$Date <- as.Date(stock_data$Date, format = "%Y-%m-%d")
# Display first few rows
head(stock_data)## Date Open High Low Close Volume Brand_Name Ticker
## 1 2025-07-03 6.630 6.74 6.6150 6.64 4209664 peloton PTON
## 2 2025-07-03 106.750 108.37 106.3301 107.34 560190 crocs CROX
## 3 2025-07-03 122.630 123.05 121.5500 121.93 36600 adidas ADDYY
## 4 2025-07-03 221.705 224.01 221.3600 223.41 29295154 amazon AMZN
## 5 2025-07-03 212.145 214.65 211.8101 213.55 34697317 apple AAPL
## 6 2025-07-03 76.265 77.03 75.5800 76.39 11545304 nike NKE
## Industry_Tag Country Dividends Stock.Splits Capital.Gains
## 1 fitness usa 0 0 NA
## 2 footwear usa 0 0 NA
## 3 apparel germany 0 0 NA
## 4 e-commerce usa 0 0 NA
## 5 technology usa 0 0 NA
## 6 apparel usa 0 0 NA
# Check for missing values
missing_summary <- sapply(stock_data, function(x) sum(is.na(x)))
cat("Missing Values Summary:\n")## Missing Values Summary:
## Date Open High Low Close
## 0 0 0 0 0
## Volume Brand_Name Ticker Industry_Tag Country
## 0 0 0 0 0
## Dividends Stock.Splits Capital.Gains
## 0 0 310120
# Remove rows with critical missing values
stock_data_clean <- stock_data[complete.cases(stock_data[, c("Open", "High", "Low", "Close", "Volume")]), ]
# Create additional features
stock_data_clean <- stock_data_clean %>%
mutate(
Daily_Return = (Close - Open) / Open * 100,
Price_Range = High - Low,
Price_Volatility = (High - Low) / Open * 100,
Month = month(Date),
Weekday = weekdays(Date)
)
cat("Cleaned Dataset Dimensions:", dim(stock_data_clean), "\n")## Cleaned Dataset Dimensions: 310122 18
summary_stats <- stock_data_clean %>%
summarise(
Avg_Open_Price = mean(Open, na.rm = TRUE),
Avg_Close_Price = mean(Close, na.rm = TRUE),
Avg_Volume = mean(Volume, na.rm = TRUE),
Avg_Daily_Return = mean(Daily_Return, na.rm = TRUE),
Total_Companies = n_distinct(Brand_Name),
Total_Industries = n_distinct(Industry_Tag),
Total_Countries = n_distinct(Country)
)
print(summary_stats)## Avg_Open_Price Avg_Close_Price Avg_Volume Avg_Daily_Return Total_Companies
## 1 76.32543 76.33803 22709366 Inf 62
## Total_Industries Total_Countries
## 1 23 7
industry_summary <- stock_data_clean %>%
group_by(Industry_Tag) %>%
summarise(
Count = n_distinct(Brand_Name),
Avg_Close_Price = mean(Close, na.rm = TRUE),
Avg_Volume = mean(Volume, na.rm = TRUE)
) %>%
arrange(desc(Count))
ggplot(industry_summary, aes(x = reorder(Industry_Tag, Count), y = Count, fill = Industry_Tag)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Number of Companies by Industry",
x = "Industry", y = "Number of Companies") +
theme_minimal() +
theme(legend.position = "none")country_summary <- stock_data_clean %>%
group_by(Country) %>%
summarise(
Company_Count = n_distinct(Brand_Name),
Avg_Price = mean(Close, na.rm = TRUE),
Total_Volume = sum(Volume, na.rm = TRUE)
) %>%
arrange(desc(Company_Count))
ggplot(country_summary, aes(x = reorder(Country, Company_Count), y = Company_Count, fill = Country)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Number of Companies by Country",
x = "Country", y = "Number of Companies") +
theme_minimal() +
theme(legend.position = "none")# Select top 5 companies by volume for trend analysis
top_companies <- stock_data_clean %>%
group_by(Brand_Name) %>%
summarise(Total_Volume = sum(Volume, na.rm = TRUE)) %>%
arrange(desc(Total_Volume)) %>%
head(5) %>%
pull(Brand_Name)
trend_data <- stock_data_clean %>%
filter(Brand_Name %in% top_companies)
ggplot(trend_data, aes(x = Date, y = Close, color = Brand_Name, group = Brand_Name)) +
geom_line(size = 1) +
geom_point(size = 0.5) +
labs(title = "Closing Price Trends for Top 5 Companies by Volume",
x = "Date", y = "Closing Price", color = "Company") +
theme_minimal() +
theme(legend.position = "bottom")# Prepare data for clustering
clustering_data <- stock_data_clean %>%
group_by(Brand_Name) %>%
summarise(
Avg_Close_Price = mean(Close, na.rm = TRUE),
Avg_Volume = mean(Volume, na.rm = TRUE),
Avg_Daily_Return = mean(Daily_Return, na.rm = TRUE),
Price_Volatility = sd(Close, na.rm = TRUE) / mean(Close, na.rm = TRUE) * 100
) %>%
na.omit()
# Scale the data
scaled_data <- scale(clustering_data[, -1])
rownames(scaled_data) <- clustering_data$Brand_Name# Determine optimal number of clusters using elbow method
wss <- sapply(1:10, function(k){kmeans(scaled_data, k, nstart = 25)$tot.withinss})
ggplot(data.frame(Clusters = 1:10, WSS = wss), aes(x = Clusters, y = WSS)) +
geom_line(color = "steelblue", size = 1) +
geom_point(color = "steelblue", size = 2) +
labs(title = "Elbow Method for Optimal K",
x = "Number of Clusters", y = "Within-Cluster Sum of Squares") +
theme_minimal()# Perform K-means clustering with optimal k
set.seed(123)
k <- 4 # Based on elbow method
kmeans_result <- kmeans(scaled_data, centers = k, nstart = 25)
# Add cluster assignments to data
clustering_data$Cluster <- as.factor(kmeans_result$cluster)
# Visualize clusters
fviz_cluster(kmeans_result, data = scaled_data,
palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
geom = "point",
ellipse.type = "convex",
ggtheme = theme_minimal(),
main = "K-means Clustering of Companies")# Show companies in each cluster
for(i in 1:k) {
cat("\nCluster", i, "Companies:\n")
print(clustering_data$Brand_Name[clustering_data$Cluster == i])
}##
## Cluster 1 Companies:
## [1] "adidas" "adobe"
## [3] "american express" "block"
## [5] "coinbase" "costco"
## [7] "crocs" "fedex"
## [9] "hershey company" "jpmorgan chase & co"
## [11] "logitech" "lvmh"
## [13] "marriott" "mastercard"
## [15] "mcdonald's" "netflix"
## [17] "nike" "peloton"
## [19] "procter & gamble" "roblox"
## [21] "salesforce / slack" "shopify"
## [23] "spotify" "starbucks"
## [25] "the home depot" "visa"
## [27] "zoom video communications"
##
## Cluster 2 Companies:
## [1] "chipotle"
##
## Cluster 3 Companies:
## [1] "3m" "airbnb"
## [3] "american eagle outfitters" "bmw group"
## [5] "cisco" "colgate palmolive"
## [7] "delta air lines" "foot locker"
## [9] "hilton" "honda"
## [11] "johnson & johnson" "nintendo"
## [13] "nordstrom" "philips"
## [15] "pinterest" "porsche"
## [17] "puma" "southwest airlines"
## [19] "target" "the coca-cola company"
## [21] "the walt disney company" "toyota"
## [23] "uber" "ubisoft"
## [25] "unilever" "zoominfo"
##
## Cluster 4 Companies:
## [1] "amazon" "amd" "apple" "google" "microsoft" "nvidia"
## [7] "tesla"
# Prepare data for KNN classification
# Create a binary target variable (1 if positive return, 0 otherwise)
# Select features for KNN
# Split data into training and testing sets
set.seed(123)
knn_data <- stock_data_clean %>%
select(Open, High, Low, Close, Volume, Daily_Return) %>%
mutate(Target = ifelse(Daily_Return > 0, 1, 0)) %>%
na.omit()
# Scale the data
knn_scaled <- scale(knn_data[, -ncol(knn_data)])# Find optimal k
accuracy <- numeric(20)
optimal_k <- which.max(accuracy)
cat("Optimal k:", optimal_k, "with accuracy:", max(accuracy), "\n")## Optimal k: 1 with accuracy: 0
# Plot accuracy vs k
ggplot(data.frame(k = 1:20, Accuracy = accuracy), aes(x = k, y = Accuracy)) +
geom_line(color = "steelblue", size = 1) +
geom_point(color = "steelblue", size = 2) +
geom_vline(xintercept = optimal_k, linetype = "dashed", color = "red") +
labs(title = "KNN Accuracy vs Number of Neighbors",
x = "Number of Neighbors (k)", y = "Accuracy") +
theme_minimal()# Final KNN model with optimal k
set.seed(123)
train_index <- createDataPartition(knn_data$Target, p = 0.7, list = FALSE)
train_data <- knn_scaled[train_index, ]
test_data <- knn_scaled[-train_index, ]
train_labels <- knn_data$Target[train_index]
test_labels <- knn_data$Target[-train_index]
# Confusion Matrix
print("Confusion Matrix:")## [1] "Confusion Matrix:"
# Prepare data for association rules
transaction_data <- stock_data_clean %>%
mutate(
Industry_Tag = as.factor(Industry_Tag),
Volume_Category = case_when(
Volume < quantile(Volume, 0.33) ~ "Low_Volume",
Volume < quantile(Volume, 0.66) ~ "Medium_Volume",
TRUE ~ "High_Volume"
),
Return_Category = ifelse(Daily_Return > 0, "Positive_Return", "Negative_Return")
) %>%
select(Industry_Tag, Volume_Category, Return_Category)
# Convert to transactions
transactions <- as(transaction_data, "transactions")
summary(transactions)## transactions as itemMatrix in sparse format with
## 310122 rows (elements/itemsets/transactions) and
## 28 columns (items) and a density of 0.1071429
##
## most frequent items:
## Return_Category=Negative_Return Return_Category=Positive_Return
## 156831 153291
## Volume_Category=High_Volume Volume_Category=Low_Volume
## 105443 102340
## Volume_Category=Medium_Volume (Other)
## 102339 310122
##
## element (itemset/transaction) length distribution:
## sizes
## 3
## 310122
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3 3 3 3 3 3
##
## includes extended item information - examples:
## labels variables levels
## 1 Industry_Tag=apparel Industry_Tag apparel
## 2 Industry_Tag=automotive Industry_Tag automotive
## 3 Industry_Tag=aviation Industry_Tag aviation
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
# Generate association rules
rules <- apriori(transactions,
parameter = list(support = 0.05, confidence = 0.6, minlen = 2))## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.05 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 15506
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[28 item(s), 310122 transaction(s)] done [0.05s].
## sorting and recoding items ... [12 item(s)] done [0.01s].
## creating transaction tree ... done [0.13s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [5 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
# Sort rules by confidence
rules_sorted <- sort(rules, by = "confidence", decreasing = TRUE)
# Display top rules
inspect(head(rules_sorted, 10))## lhs rhs support confidence coverage lift count
## [1] {Industry_Tag=automotive} => {Volume_Category=Low_Volume} 0.06737671 0.8402702 0.08018457 2.546280 20895
## [2] {Industry_Tag=retail} => {Volume_Category=Medium_Volume} 0.05251804 0.6302531 0.08332850 1.909881 16287
## [3] {Industry_Tag=technology,
## Return_Category=Negative_Return} => {Volume_Category=High_Volume} 0.06549036 0.6120051 0.10700950 1.799989 20310
## [4] {Industry_Tag=technology} => {Volume_Category=High_Volume} 0.13136766 0.6084502 0.21590535 1.789534 40740
## [5] {Industry_Tag=technology,
## Return_Category=Positive_Return} => {Volume_Category=High_Volume} 0.06587730 0.6049569 0.10889585 1.779259 20430
# Prepare data for regression
regression_data <- stock_data_clean %>%
select(Close, Open, High, Low, Volume, Daily_Return, Price_Volatility) %>%
na.omit()
# Split data
set.seed(123)
train_index_reg <- createDataPartition(regression_data$Close, p = 0.7, list = FALSE)
train_reg <- regression_data[train_index_reg, ]
test_reg <- regression_data[-train_index_reg, ]# Train linear regression model
lm_model <- lm(Close ~ Open + High + Low + Volume + Daily_Return + Price_Volatility,
data = train_reg)
# Model summary
summary(lm_model)##
## Call:
## lm(formula = Close ~ Open + High + Low + Volume + Daily_Return +
## Price_Volatility, data = train_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73.109 -0.169 -0.007 0.159 55.312
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.831e-02 4.138e-03 -4.424 9.71e-06 ***
## Open -5.597e-01 1.662e-03 -336.735 < 2e-16 ***
## High 7.519e-01 1.473e-03 510.506 < 2e-16 ***
## Low 8.088e-01 1.177e-03 686.981 < 2e-16 ***
## Volume 3.746e-11 2.836e-11 1.321 0.187
## Daily_Return 1.260e-01 1.293e-03 97.387 < 2e-16 ***
## Price_Volatility 6.120e-03 1.217e-03 5.028 4.97e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.098 on 217080 degrees of freedom
## Multiple R-squared: 0.9999, Adjusted R-squared: 0.9999
## F-statistic: 5.951e+08 on 6 and 217080 DF, p-value: < 2.2e-16
# Make predictions
predictions <- predict(lm_model, newdata = test_reg)
# Calculate performance metrics
rmse <- sqrt(mean((predictions - test_reg$Close)^2))
mae <- mean(abs(predictions - test_reg$Close))
r_squared <- cor(predictions, test_reg$Close)^2
cat("Regression Performance Metrics:\n")## Regression Performance Metrics:
## RMSE: 1.097707
## MAE: 0.4085355
## R-squared: 0.9999418
# Plot actual vs predicted
results <- data.frame(Actual = test_reg$Close, Predicted = predictions)
ggplot(results, aes(x = Actual, y = Predicted)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(title = "Actual vs Predicted Closing Prices",
x = "Actual Price", y = "Predicted Price") +
theme_minimal()# Select numerical variables for correlation
numeric_data <- stock_data_clean %>%
select(Open, High, Low, Close, Volume, Daily_Return, Price_Volatility) %>%
na.omit()
correlation_matrix <- cor(numeric_data)
corrplot(correlation_matrix, method = "color", type = "upper",
tl.cex = 0.8, tl.col = "black",
title = "Correlation Matrix of Stock Variables",
mar = c(0,0,1,0))industry_performance <- stock_data_clean %>%
group_by(Industry_Tag) %>%
summarise(
Avg_Close_Price = mean(Close, na.rm = TRUE),
Avg_Daily_Return = mean(Daily_Return, na.rm = TRUE),
Total_Volume = sum(Volume, na.rm = TRUE)
) %>%
arrange(desc(Avg_Daily_Return))
# Create comparison plots
p1 <- ggplot(industry_performance, aes(x = reorder(Industry_Tag, Avg_Close_Price), y = Avg_Close_Price)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Average Closing Price by Industry", x = "Industry", y = "Average Close Price") +
theme_minimal()
p2 <- ggplot(industry_performance, aes(x = reorder(Industry_Tag, Avg_Daily_Return), y = Avg_Daily_Return)) +
geom_bar(stat = "identity", fill = "darkorange") +
coord_flip() +
labs(title = "Average Daily Return by Industry", x = "Industry", y = "Average Daily Return (%)") +
theme_minimal()
grid.arrange(p1, p2, ncol = 2)Industry Distribution: Technology and retail sectors dominate the dataset with the highest number of companies.
Geographical Spread: USA-based companies constitute the majority of the dataset.
Clustering Results: Companies were successfully grouped into 4 distinct clusters based on their trading characteristics.
KNN Performance: The KNN model achieved reasonable accuracy in predicting positive/negative returns.
Association Rules: Meaningful relationships were discovered between industry types, trading volumes, and return patterns.
Predictive Modeling: The linear regression model provided decent predictions of closing prices.
This comprehensive R Markdown file includes: