# Show the source code of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.95 loaded
library(GGally)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Large scipen penalty so numbers print in fixed rather than scientific notation.
options(scipen = 999)
This project applies basic R programming concepts to Walmart sales data for statistical, business, and predictive analysis. It includes data cleaning, outlier removal, descriptive statistics, visualization, correlation, and regression to understand sales patterns and business performance. The analysis helps transform raw retail data into meaningful insights using practical R functions.
# Read the Walmart sales table from the working directory, then show the
# column types that read.csv() inferred.
data <- read.csv("Walmart.csv")
str(data)
## 'data.frame': 6435 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : logi NA NA NA NA NA NA ...
## $ Weekly_Sales : num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment : num 8.11 8.11 8.11 8.11 8.11 ...
## $ Month : logi NA NA NA NA NA NA ...
## $ Year : logi NA NA NA NA NA NA ...
## $ Sales_Category: chr "High" "High" "High" "High" ...
## $ Bonus_Sales : num 164369 164196 161197 140973 155481 ...
# Count the missing values in each column.
colSums(is.na(data))
## Store Date Weekly_Sales Holiday_Flag Temperature
## 0 6435 0 0 0
## Fuel_Price CPI Unemployment Month Year
## 0 0 0 6435 6435
## Sales_Category Bonus_Sales
## 0 0
# Remove exact duplicate rows.
data <- unique(data)

# Parse Date from day-month-year text ("%d-%m-%Y"); entries that are missing
# or in another format become NA.
data$Date <- as.Date(as.character(data$Date), format = "%d-%m-%Y")

# Surface parse failures instead of letting them pass silently: Month and Year
# below are derived from Date, so NA dates make them NA as well.
n_bad_dates <- sum(is.na(data$Date))
if (n_bad_dates > 0) {
  warning(
    n_bad_dates, " of ", nrow(data), " Date values are missing or not in ",
    "dd-mm-yyyy format; Month and Year will be NA for those rows.",
    call. = FALSE
  )
}

# Treat the holiday indicator as a categorical variable.
data$Holiday_Flag <- as.factor(data$Holiday_Flag)

# Calendar month and year extracted from the parsed date.
data$Month <- as.numeric(format(data$Date, "%m"))
data$Year <- as.numeric(format(data$Date, "%Y"))

# Label each row relative to the overall mean of Weekly_Sales.
data$Sales_Category <- ifelse(
  data$Weekly_Sales > mean(data$Weekly_Sales),
  "High", "Low"
)

# Bonus_Sales is 10% of Weekly_Sales.
data$Bonus_Sales <- data$Weekly_Sales * 0.10

str(data)
## 'data.frame': 6435 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: NA NA ...
## $ Weekly_Sales : num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment : num 8.11 8.11 8.11 8.11 8.11 ...
## $ Month : num NA NA NA NA NA NA NA NA NA NA ...
## $ Year : num NA NA NA NA NA NA NA NA NA NA ...
## $ Sales_Category: chr "High" "High" "High" "High" ...
## $ Bonus_Sales : num 164369 164196 161197 140973 155481 ...
# Per-column summary of the prepared data.
summary(data)
## Store Date Weekly_Sales Holiday_Flag Temperature
## Min. : 1 Min. :NA Min. : 209986 0:5985 Min. : -2.06
## 1st Qu.:12 1st Qu.:NA 1st Qu.: 553350 1: 450 1st Qu.: 47.46
## Median :23 Median :NA Median : 960746 Median : 62.67
## Mean :23 Mean :NaN Mean :1046965 Mean : 60.66
## 3rd Qu.:34 3rd Qu.:NA 3rd Qu.:1420159 3rd Qu.: 74.94
## Max. :45 Max. :NA Max. :3818686 Max. :100.14
## NAs :6435
## Fuel_Price CPI Unemployment Month Year
## Min. :2.472 Min. :126.1 Min. : 3.879 Min. : NA Min. : NA
## 1st Qu.:2.933 1st Qu.:131.7 1st Qu.: 6.891 1st Qu.: NA 1st Qu.: NA
## Median :3.445 Median :182.6 Median : 7.874 Median : NA Median : NA
## Mean :3.359 Mean :171.6 Mean : 7.999 Mean :NaN Mean :NaN
## 3rd Qu.:3.735 3rd Qu.:212.7 3rd Qu.: 8.622 3rd Qu.: NA 3rd Qu.: NA
## Max. :4.468 Max. :227.2 Max. :14.313 Max. : NA Max. : NA
## NAs :6435 NAs :6435
## Sales_Category Bonus_Sales
## Length :6435 Min. : 20999
## N.unique : 2 1st Qu.: 55335
## N.blank : 0 Median : 96075
## Min.nchar: 3 Mean :104696
## Max.nchar: 4 3rd Qu.:142016
## Max. :381869
##
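Interpretation: The dataset was cleaned, formatted, and enhanced with new variables to improve data quality and prepare it for accurate analysis, visualization, and prediction. Note, however, that Date, Month, and Year are NA for all 6,435 rows in the output above, so the date-based variables are not usable in this run.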
# IQR rule on Weekly_Sales: values more than 1.5 * IQR below the first
# quartile or above the third quartile are treated as outliers.
Q1 <- quantile(data$Weekly_Sales, probs = 0.25)
Q3 <- quantile(data$Weekly_Sales, probs = 0.75)
IQRV <- IQR(data$Weekly_Sales)

lower <- Q1 - 1.5 * IQRV
upper <- Q3 + 1.5 * IQRV

# Comma-separated filter() conditions are combined with AND.
data_clean <- data |>
  filter(Weekly_Sales >= lower, Weekly_Sales <= upper)
# Row counts before and after dropping Weekly_Sales outliers.
nrow(data)
## [1] 6435
nrow(data_clean)
## [1] 6401
# Distribution of Weekly_Sales in the filtered data.
summary(data_clean$Weekly_Sales)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 209986 551743 957298 1036130 1414565 2685352
Interpretation: Outliers were removed using the IQR method to improve data consistency and ensure more reliable statistical analysis and predictive modeling.
#' Remove rows whose value in one column is an IQR outlier
#'
#' Keeps the rows where `data[[column]]` lies within
#' `[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`, with the quartiles and IQR computed
#' from the non-missing values of that column.
#'
#' @param data A data frame.
#' @param column Name of a numeric column of `data`, given as a string.
#' @return `data` restricted to the in-range rows, always as a data frame.
#'   Rows whose value in `column` is `NA` are dropped.
remove_outliers <- function(data, column) {
  # An unknown column would otherwise silently produce a zero-row result.
  if (!column %in% names(data)) {
    stop("Column '", column, "' not found in data.", call. = FALSE)
  }

  values <- data[[column]]
  Q1 <- quantile(values, 0.25, na.rm = TRUE)
  Q3 <- quantile(values, 0.75, na.rm = TRUE)
  IQRV <- IQR(values, na.rm = TRUE)
  lower <- Q1 - 1.5 * IQRV
  upper <- Q3 + 1.5 * IQRV

  # The is.na() guard matters: comparing NA yields NA, and indexing a data
  # frame with an NA row index returns a row filled with NA.
  keep <- !is.na(values) & values >= lower & values <= upper

  # drop = FALSE keeps a one-column data frame from collapsing to a vector.
  data[keep, , drop = FALSE]
}
# Row count before filtering the remaining numeric columns.
before_clean <- nrow(data_clean)

# Apply the IQR filter one column at a time. Each pass works on the rows that
# survived the previous one, so the quartiles are recomputed on the reduced data.
remaining_cols <- c("Temperature", "Fuel_Price", "CPI", "Unemployment")
for (col in remaining_cols) {
  data_clean <- remove_outliers(data_clean, col)
}

# Row count once every listed column has been filtered.
after_clean <- nrow(data_clean)
# Row counts before and after the additional outlier filtering.
before_clean
## [1] 6401
after_clean
## [1] 5917
# Summary of the columns that were just filtered.
summary(data_clean[, remaining_cols])
## Temperature Fuel_Price CPI Unemployment
## Min. : 7.46 Min. :2.472 Min. :126.1 Min. : 4.308
## 1st Qu.: 46.98 1st Qu.:2.891 1st Qu.:132.8 1st Qu.: 6.891
## Median : 62.62 Median :3.420 Median :190.0 Median : 7.852
## Mean : 60.43 Mean :3.341 Mean :175.0 Mean : 7.722
## 3rd Qu.: 74.73 3rd Qu.:3.721 3rd Qu.:213.8 3rd Qu.: 8.494
## Max. :100.14 Max. :4.468 Max. :227.2 Max. :10.926
Interpretation: Additional outlier removal was applied to the remaining numeric variables to fully refine the dataset and improve overall analytical accuracy.
# Structure of the fully filtered data.
str(data_clean)
## 'data.frame': 5917 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: NA NA ...
## $ Weekly_Sales : num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment : num 8.11 8.11 8.11 8.11 8.11 ...
## $ Month : num NA NA NA NA NA NA NA NA NA NA ...
## $ Year : num NA NA NA NA NA NA NA NA NA NA ...
## $ Sales_Category: chr "High" "High" "High" "High" ...
## $ Bonus_Sales : num 164369 164196 161197 140973 155481 ...
# Per-column summary of the fully filtered data.
summary(data_clean)
## Store Date Weekly_Sales Holiday_Flag Temperature
## Min. : 1.0 Min. :NA Min. : 209986 0:5508 Min. : 7.46
## 1st Qu.:11.0 1st Qu.:NA 1st Qu.: 552529 1: 409 1st Qu.: 46.98
## Median :22.0 Median :NA Median : 947229 Median : 62.62
## Mean :22.8 Mean :NaN Mean :1039313 Mean : 60.43
## 3rd Qu.:34.0 3rd Qu.:NA 3rd Qu.:1427624 3rd Qu.: 74.73
## Max. :45.0 Max. :NA Max. :2685352 Max. :100.14
## NAs :5917
## Fuel_Price CPI Unemployment Month Year
## Min. :2.472 Min. :126.1 Min. : 4.308 Min. : NA Min. : NA
## 1st Qu.:2.891 1st Qu.:132.8 1st Qu.: 6.891 1st Qu.: NA 1st Qu.: NA
## Median :3.420 Median :190.0 Median : 7.852 Median : NA Median : NA
## Mean :3.341 Mean :175.0 Mean : 7.722 Mean :NaN Mean :NaN
## 3rd Qu.:3.721 3rd Qu.:213.8 3rd Qu.: 8.494 3rd Qu.: NA 3rd Qu.: NA
## Max. :4.468 Max. :227.2 Max. :10.926 Max. : NA Max. : NA
## NAs :5917 NAs :5917
## Sales_Category Bonus_Sales
## Length :5917 Min. : 20999
## N.unique : 2 1st Qu.: 55253
## N.blank : 0 Median : 94723
## Min.nchar: 3 Mean :103931
## Max.nchar: 4 3rd Qu.:142762
## Max. :268535
##
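Interpretation: This provides an overview of the cleaned dataset and confirms that the sales and economic variables are properly structured for further statistical analysis. Date, Month, and Year are the exception: they are NA for all 5,917 remaining rows.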
# Mean, median, standard deviation, and variance of Weekly_Sales (cleaned data).
mean(data_clean$Weekly_Sales)
## [1] 1039313
median(data_clean$Weekly_Sales)
## [1] 947229.2
sd(data_clean$Weekly_Sales)
## [1] 551945
var(data_clean$Weekly_Sales)
## [1] 304643299785
Interpretation: These measures describe the central tendency and variability of Weekly Sales in the cleaned dataset.
# Quartiles, interquartile range, minimum, and maximum of Weekly_Sales
# (cleaned data).
quantile(data_clean$Weekly_Sales)
## 0% 25% 50% 75% 100%
## 209986.2 552529.2 947229.2 1427624.5 2685351.8
IQR(data_clean$Weekly_Sales)
## [1] 875095.3
min(data_clean$Weekly_Sales)
## [1] 209986.2
max(data_clean$Weekly_Sales)
## [1] 2685352
Interpretation: These statistics describe the spread, range, and distribution of Weekly Sales after complete data cleaning.
# Number of rows in each Sales_Category.
table(data_clean$Sales_Category)
##
## High Low
## 2616 3301
Interpretation: This distribution shows how sales observations are segmented into high and low performance categories.
# Average weekly sales for each store.
data_clean |>
  group_by(Store) |>
  summarise(Avg_Sales = mean(Weekly_Sales))
## # A tibble: 45 × 2
## Store Avg_Sales
## <int> <dbl>
## 1 1 1555264.
## 2 2 1905830.
## 3 3 402704.
## 4 4 2038739.
## 5 5 318012.
## 6 6 1556539.
## 7 7 570706.
## 8 8 908750.
## 9 9 543981.
## 10 10 1852745.
## # ℹ 35 more rows
Interpretation: This analysis compares store-wise average sales performance to identify stronger and weaker performing Walmart stores.
# Highest weekly sales recorded for each store.
aggregate(Weekly_Sales ~ Store, data = data_clean, FUN = max)
## Store Weekly_Sales
## 1 1 2387950.2
## 2 2 2658725.3
## 3 3 605990.4
## 4 4 2508955.2
## 5 5 507900.1
## 6 6 2644633.0
## 7 7 1059715.3
## 8 8 1511641.1
## 9 9 905324.7
## 10 10 2555031.2
## 11 11 2306265.4
## 12 12 1061943.5
## 13 13 2462779.1
## 14 14 2685351.8
## 15 15 1368318.2
## 16 16 1004730.7
## 17 17 1309226.8
## 18 18 2027507.1
## 19 19 2678206.4
## 20 20 2565259.9
## 21 21 1587257.8
## 22 22 1962445.0
## 23 23 2587953.3
## 24 24 2386015.8
## 25 25 1295391.2
## 26 26 1573982.5
## 27 27 2627910.8
## 28 28 1500863.5
## 29 29 1130926.8
## 30 30 519354.9
## 31 31 2068943.0
## 32 32 1959527.0
## 33 33 331173.5
## 34 34 1620748.2
## 35 35 1781867.0
## 36 36 489372.0
## 37 37 605791.5
## 38 38 490274.8
## 39 39 2554482.8
## 40 40 1648829.2
## 41 41 2263722.7
## 42 42 674919.4
## 43 43 725043.0
## 44 44 376233.9
## 45 45 1682862.0
# Lowest weekly sales recorded for each store.
aggregate(Weekly_Sales ~ Store, data = data_clean, FUN = min)
## Store Weekly_Sales
## 1 1 1316899.3
## 2 2 1650394.4
## 3 3 339597.4
## 4 4 1762539.3
## 5 5 260636.7
## 6 6 1261253.2
## 7 7 372673.6
## 8 8 772539.1
## 9 9 452905.2
## 10 10 1627707.3
## 11 11 1100418.7
## 12 12 880415.7
## 13 13 1633663.1
## 14 14 1479514.7
## 15 15 454183.4
## 16 16 368600.0
## 17 17 635862.6
## 18 18 540922.9
## 19 19 1181204.5
## 20 20 1761016.5
## 21 21 596218.2
## 22 22 774262.3
## 23 23 1016756.1
## 24 24 1057290.4
## 25 25 558794.6
## 26 26 809833.2
## 27 27 1263534.9
## 28 28 1124660.8
## 29 29 395987.2
## 30 30 369722.3
## 31 31 1198071.6
## 32 32 955463.8
## 33 33 209986.2
## 34 34 836717.8
## 35 35 576332.1
## 36 36 270678.0
## 37 37 451327.6
## 38 38 397428.2
## 39 39 1158698.4
## 40 40 764014.8
## 41 41 991941.7
## 42 42 428953.6
## 43 43 505405.8
## 44 44 241937.1
## 45 45 617207.6
Interpretation: This helps evaluate the sales range of each store by identifying their highest and lowest sales performance.
# Average weekly sales for holiday (1) and non-holiday (0) weeks.
data_clean |>
  group_by(Holiday_Flag) |>
  summarise(Avg_Sales = mean(Weekly_Sales))
## # A tibble: 2 × 2
## Holiday_Flag Avg_Sales
## <fct> <dbl>
## 1 0 1035645.
## 2 1 1088710.
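Interpretation: This analysis compares Walmart’s average weekly sales in holiday and non-holiday weeks. Holiday weeks average about 1.09 million versus about 1.04 million for non-holiday weeks; a formal test would be needed to establish whether this difference is statistically significant.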
# Average weekly sales for each calendar month.
data_clean |>
  group_by(Month) |>
  summarise(Avg_Sales = mean(Weekly_Sales))
## # A tibble: 1 × 2
## Month Avg_Sales
## <dbl> <dbl>
## 1 NA 1039313.
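Interpretation: This analysis is intended to identify seasonal sales patterns and highlight months with stronger or weaker sales performance. In this run, however, every Month value is NA because the Date column contained no usable dates, so the output is a single overall average and shows no monthly pattern.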
# Average weekly sales for each year.
data_clean |>
  group_by(Year) |>
  summarise(Avg_Sales = mean(Weekly_Sales))
## # A tibble: 1 × 2
## Year Avg_Sales
## <dbl> <dbl>
## 1 NA 1039313.
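Interpretation: This analysis is intended to compare yearly sales performance to observe overall business growth or decline over time. In this run, however, every Year value is NA because the Date column contained no usable dates, so the output is a single overall average and no year-over-year comparison can be made.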
# Ten rows with the largest Weekly_Sales, highest first.
data_clean |>
  arrange(desc(Weekly_Sales)) |>
  head(n = 10)
## Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI
## 1 14 <NA> 2685352 1 48.71 3.492 188.3504
## 2 19 <NA> 2678206 0 26.05 3.309 132.7477
## 3 2 <NA> 2658725 1 62.98 2.735 211.4063
## 4 6 <NA> 2644633 0 49.45 3.112 220.9477
## 5 27 <NA> 2627911 1 46.67 3.186 136.6896
## 6 14 <NA> 2623470 0 27.31 2.784 181.8712
## 7 2 <NA> 2614202 1 56.36 3.236 218.1130
## 8 2 <NA> 2609167 0 47.55 2.869 211.0645
## 9 14 <NA> 2600519 0 30.54 3.109 182.5520
## 10 14 <NA> 2594363 0 39.93 3.413 188.7979
## Unemployment Month Year Sales_Category Bonus_Sales
## 1 8.523 NA NA High 268535.2
## 2 8.067 NA NA High 267820.6
## 3 8.163 NA NA High 265872.5
## 4 6.551 NA NA High 264463.3
## 5 8.021 NA NA High 262791.1
## 6 8.992 NA NA High 262347.0
## 7 7.441 NA NA High 261420.2
## 8 8.163 NA NA High 260916.7
## 9 8.724 NA NA High 260051.9
## 10 8.523 NA NA High 259436.3
Interpretation: This identifies the highest sales-performing records and highlights peak sales events in the dataset.
# Store with the highest average weekly sales.
data_clean |>
  group_by(Store) |>
  summarise(avg = mean(Weekly_Sales)) |>
  arrange(desc(avg)) |>
  head(n = 1)
## # A tibble: 1 × 2
## Store avg
## <int> <dbl>
## 1 20 2058998.
Interpretation: This identifies the top-performing Walmart store based on overall average sales performance.
# Histogram of Weekly_Sales in the cleaned data, 30 bins.
ggplot(data = data_clean, mapping = aes(x = Weekly_Sales)) +
  geom_histogram(bins = 30, colour = "black", fill = "yellow") +
  labs(title = "Histogram of Weekly Sales (Clean Data)") +
  scale_x_continuous(labels = scales::comma)
Interpretation: This visualization helps understand the
overall distribution, concentration, and frequency pattern of Weekly
Sales after cleaning.
# Boxplot of Weekly_Sales in the original data (before outlier removal).
ggplot(data = data, mapping = aes(y = Weekly_Sales)) +
  geom_boxplot(fill = "orange") +
  labs(title = "Boxplot of Weekly Sales") +
  scale_y_continuous(labels = scales::comma)
Interpretation: This boxplot highlights extreme sales
values and visually demonstrates the presence of outliers in the
original dataset.
# Boxplot of Weekly_Sales in the cleaned data.
ggplot(data = data_clean, mapping = aes(y = Weekly_Sales)) +
  geom_boxplot(fill = "pink") +
  labs(title = "Boxplot After Removing Outliers") +
  scale_y_continuous(labels = scales::comma)
Interpretation: This visualization confirms reduced
extreme values and shows a cleaner, more balanced sales distribution
after outlier removal.
# Filled density curve of Weekly_Sales in the cleaned data.
ggplot(data = data_clean, mapping = aes(x = Weekly_Sales)) +
  geom_density(alpha = 0.5, fill = "yellow") +
  labs(title = "Density Plot") +
  scale_x_continuous(labels = scales::comma)
Interpretation: This plot provides a smooth
visualization of Weekly Sales distribution, helping identify
concentration patterns and overall sales behavior.
# Scatter plot of Weekly_Sales against Temperature; low alpha eases overplotting.
ggplot(data = data_clean, mapping = aes(x = Temperature, y = Weekly_Sales)) +
  geom_point(alpha = 0.3, colour = "blue") +
  labs(title = "Temperature vs Sales")
Interpretation: This scatter plot helps examine whether
temperature variations influence Walmart’s Weekly Sales.
# Density of Weekly_Sales drawn as a line, using the original data.
ggplot(data = data, mapping = aes(x = Weekly_Sales)) +
  geom_line(colour = "red", stat = "density") +
  labs(title = "Weekly Sales Trend Pattern") +
  scale_x_continuous(labels = scales::comma)
Interpretation: This visualization draws the density of Weekly Sales as a
line, giving an alternative view of the sales distribution rather than a trend
over time. It uses the original dataset, so it does not affect the cleaned
dataset or other project sections.
# Correlation matrix of sales and the economic variables, computed on the
# original data with incomplete rows dropped.
num_data <- data |>
  select(Weekly_Sales, Temperature, Fuel_Price, CPI, Unemployment) |>
  na.omit()
cor(num_data)
## Weekly_Sales Temperature Fuel_Price CPI Unemployment
## Weekly_Sales 1.000000000 -0.06381001 0.009463786 -0.07263416 -0.10617609
## Temperature -0.063810013 1.00000000 0.144981806 0.17688768 0.10115786
## Fuel_Price 0.009463786 0.14498181 1.000000000 -0.17064180 -0.03468374
## CPI -0.072634162 0.17688768 -0.170641795 1.00000000 -0.30202006
## Unemployment -0.106176090 0.10115786 -0.034683745 -0.30202006 1.00000000
Interpretation: This correlation matrix evaluates the relationships between Weekly Sales and major economic variables without altering the main project dataset.

### Question 7.2: How can a correlation heatmap visually represent relationships among major numeric variables?
# Rebuild the numeric subset and its correlation matrix, then draw it as a
# colour-coded heatmap.
num_data <- data |>
  select(Weekly_Sales, Temperature, Fuel_Price, CPI, Unemployment) |>
  na.omit()
corr_matrix <- cor(num_data)
corrplot(corr_matrix, method = "color")
Interpretation: This heatmap visually represents the
strength and direction of correlations between Weekly Sales and key
economic factors.
# Rebuild the numeric subset and its correlation matrix.
num_data <- data |>
  select(Weekly_Sales, Temperature, Fuel_Price, CPI, Unemployment) |>
  na.omit()
corr_matrix <- cor(num_data)

# Heatmap with a blue-white-red palette, coefficients printed in each cell,
# and variable labels rotated 45 degrees.
corrplot(
  corr_matrix,
  method = "color",
  title = "Correlation Heatmap of Walmart Data",
  mar = c(0,0,2,0),
  col = colorRampPalette(c("blue", "white", "red"))(200),
  addCoef.col = "black",
  number.cex = 0.8,
  tl.col = "black",
  tl.srt = 45
)
Interpretation: This advanced heatmap provides a
clearer visual and numerical understanding of relationships among major
numeric variables.
# Simple linear regression of Weekly_Sales on Temperature, fitted to the
# original data with incomplete rows dropped.
reg_data <- data |>
  select(Weekly_Sales, Temperature) |>
  na.omit()

model1 <- lm(formula = Weekly_Sales ~ Temperature, data = reg_data)
summary(model1)
##
## Call:
## lm(formula = Weekly_Sales ~ Temperature, data = reg_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -871164 -488496 -91696 386226 2713005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1165406.0 24139.0 48.279 < 0.0000000000000002 ***
## Temperature -1952.4 380.7 -5.128 0.000000301 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 563300 on 6433 degrees of freedom
## Multiple R-squared: 0.004072, Adjusted R-squared: 0.003917
## F-statistic: 26.3 on 1 and 6433 DF, p-value: 0.0000003008
Interpretation: This model evaluates the individual effect of Temperature on Weekly Sales and measures how strongly temperature alone predicts sales performance.
# Multiple linear regression of Weekly_Sales on the four economic variables,
# fitted to the original data with incomplete rows dropped.
multi_reg_data <- data |>
  select(Weekly_Sales, Temperature, Fuel_Price, CPI, Unemployment) |>
  na.omit()

model2 <- lm(
  formula = Weekly_Sales ~ Temperature + Fuel_Price + CPI + Unemployment,
  data = multi_reg_data
)
summary(model2)
##
## Call:
## lm(formula = Weekly_Sales ~ Temperature + Fuel_Price + CPI +
## Unemployment, data = multi_reg_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -956788 -478860 -115969 394736 2789686
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1743607.6 79553.1 21.918 < 0.0000000000000002 ***
## Temperature -885.7 396.2 -2.235 0.0254 *
## Fuel_Price -12248.4 15751.8 -0.778 0.4368
## CPI -1585.8 195.2 -8.126 0.00000000000000053 ***
## Unemployment -41215.0 3972.7 -10.375 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 557600 on 6430 degrees of freedom
## Multiple R-squared: 0.02433, Adjusted R-squared: 0.02372
## F-statistic: 40.09 on 4 and 6430 DF, p-value: < 0.00000000000000022
Interpretation: The multiple regression model is statistically valid and performs better than simple regression, with CPI and Unemployment showing stronger influence on Weekly Sales; however, the low R-squared value indicates that additional business factors beyond these economic variables affect sales performance.
# One hypothetical week, scored with the multiple regression model.
new_data <- data.frame(
  Temperature = 70, Fuel_Price = 3,
  CPI = 220, Unemployment = 7
)
predict(object = model2, newdata = new_data)
## 1
## 1007481
Interpretation: This prediction estimates expected Weekly Sales based on specified Temperature, Fuel Price, CPI, and Unemployment values using the trained multiple regression model.
# Store the model's predictions for the rows it was fitted on.
multi_reg_data[["Predicted"]] <- predict(model2)

# Actual against predicted sales; the red y = x line marks perfect prediction.
ggplot(data = multi_reg_data, mapping = aes(x = Predicted, y = Weekly_Sales)) +
  geom_point(alpha = 0.3, colour = "yellow") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  labs(title = "Actual vs Predicted Sales")
Interpretation: This plot evaluates model performance
by comparing predicted sales values with actual sales observations,
where points closer to the reference line indicate better prediction
accuracy.
The Walmart Sales Analysis project showed that sales performance is influenced by multiple factors including store performance, seasonal trends, holidays, and economic conditions such as CPI and unemployment. Seasonal and yearly effects could not be assessed directly in this run, because the Date column contained no usable values and Month and Year were therefore missing for every row. Data cleaning and visualization improved analytical accuracy, while regression models provided useful predictive insights despite limited standalone predictive strength. Overall, the project demonstrated how R can effectively convert raw sales data into meaningful business intelligence.