## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.95 loaded
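# The import chunk itself is not echoed; a minimal sketch of the likely call that
# produced the read-in message below, assuming a hypothetical file name
# "ecommerce_data.csv":
ecommerce_csv <- read_csv("ecommerce_data.csv")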
## Rows: 1000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Gender, City, State, Region, Product, Category, Payment_Method
## dbl (11): Order_ID, Customer_ID, Age, Quantity, Unit_Price, Discount_%, Tot...
## date (2): Order_Date, Delivery_Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick interactive look at the imported data (View() opens the RStudio viewer
# and produces no output in the knitted document)
View(ecommerce_csv)
# Creating a placeholder dataframe for now so the document can knit
data <- data.frame(
  id = 1:100,
  category = sample(c("A", "B", "C"), 100, replace = TRUE),
  numeric_var1 = rnorm(100, mean = 50, sd = 10),
  numeric_var2 = rnorm(100, mean = 200, sd = 50),
  date_col = as.Date('2026-01-01') + 1:100
)
# Introducing some missing values and duplicates for demonstration
data$numeric_var1[c(10, 25)] <- NA
data <- rbind(data, data[1:5, ])

# Understanding vectors, lists, matrices, and data frames through our imported data.
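# The inspection call is not echoed; the output below matches dplyr's glimpse():
glimpse(data)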
## Rows: 105
## Columns: 5
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ category <chr> "B", "C", "B", "C", "A", "B", "C", "C", "C", "B", "A", "C…
## $ numeric_var1 <dbl> 39.85995, 48.98786, 50.36490, 52.99155, 34.65931, 55.4595…
## $ numeric_var2 <dbl> 98.9371, 249.3218, 198.4747, 162.2575, 176.2155, 169.7382…
## $ date_col <date> 2026-01-02, 2026-01-03, 2026-01-04, 2026-01-05, 2026-01-…
# Selecting specific columns of interest using select().
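# The select() call is not echoed; a minimal sketch matching the columns shown below:
data %>%
  select(id, category, numeric_var1, numeric_var2) %>%
  head()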
## id category numeric_var1 numeric_var2
## 1 1 B 39.85995 98.9371
## 2 2 C 48.98786 249.3218
## 3 3 B 50.36490 198.4747
## 4 4 C 52.99155 162.2575
## 5 5 A 34.65931 176.2155
## 6 6 B 55.45951 169.7382
# Filtering rows based on a condition and sorting the results using filter() and arrange().
filtered_data <- data %>%
  filter(category == "A") %>%
  arrange(desc(numeric_var1))
head(filtered_data)
## id category numeric_var1 numeric_var2 date_col
## 1 20 A 73.26981 170.5331 2026-01-21
## 2 89 A 72.71375 184.2594 2026-03-31
## 3 71 A 66.23843 152.2848 2026-03-13
## 4 55 A 65.63658 243.3121 2026-02-25
## 5 79 A 63.74478 188.9867 2026-03-21
## 6 54 A 63.23276 252.8675 2026-02-24
# Using mutate() to create new calculated columns.
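# The mutate() call is not echoed; judging by the output below, total_score appears
# to be the sum of the two numeric variables:
data <- data %>%
  mutate(total_score = numeric_var1 + numeric_var2)
head(data)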
## id category numeric_var1 numeric_var2 date_col total_score
## 1 1 B 39.85995 98.9371 2026-01-02 138.7970
## 2 2 C 48.98786 249.3218 2026-01-03 298.3096
## 3 3 B 50.36490 198.4747 2026-01-04 248.8396
## 4 4 C 52.99155 162.2575 2026-01-05 215.2490
## 5 5 A 34.65931 176.2155 2026-01-06 210.8748
## 6 6 B 55.45951 169.7382 2026-01-07 225.1977
# Grouping data by a categorical variable and summarizing it.
group_summary <- data %>%
  group_by(category) %>%
  summarise(avg_score = mean(numeric_var2, na.rm = TRUE),
            count = n())
print(group_summary)
## # A tibble: 3 × 3
## category avg_score count
## <chr> <dbl> <int>
## 1 A 191. 43
## 2 B 200. 30
## 3 C 210. 32
# Checking for missing data (NAs) in our dataset.
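# The chunk is not echoed; a typical way to produce the per-column NA counts below:
colSums(is.na(data))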
## id category numeric_var1 numeric_var2 date_col total_score
## 0 0 2 0 0 2
# Handling missing values by dropping them (deletion) or replacing them (imputation);
# the cleaned, de-duplicated result is stored as data_unique below.
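# The cleaning chunk is not echoed; a minimal sketch, assuming duplicate rows are
# dropped with distinct() and the two numeric_var1 NAs are replaced by the column mean:
data_unique <- data %>%
  distinct() %>%
  mutate(numeric_var1 = coalesce(numeric_var1, mean(numeric_var1, na.rm = TRUE)))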
# Calculating mean, median, standard deviation, percentiles, and quartiles.
desc_stats <- data_unique %>%
  summarise(
    mean_val = mean(numeric_var2),
    median_val = median(numeric_var2),
    sd_val = sd(numeric_var2),
    q1 = quantile(numeric_var2, 0.25),
    q3 = quantile(numeric_var2, 0.75),
    iqr_val = IQR(numeric_var2)
  )
print(desc_stats)
## mean_val median_val sd_val q1 q3 iqr_val
## 1 200.3784 201.3655 54.79817 168.2392 231.6115 63.37234
# Identifying outliers based on the Interquartile Range (IQR).
Q1 <- quantile(data_unique$numeric_var2, 0.25)
Q3 <- quantile(data_unique$numeric_var2, 0.75)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- data_unique %>%
  filter(numeric_var2 < lower_bound | numeric_var2 > upper_bound)
print(outliers)
## id category numeric_var1 numeric_var2 date_col total_score
## 1 21 A 39.23921 355.50969 2026-01-22 394.7489
## 2 44 A 39.57592 28.80489 2026-02-14 68.3808
## 3 100 C 62.65562 333.06507 2026-04-11 395.7207
# Capping the outliers to the boundary values instead of removing them.
data_treated <- data_unique %>%
  mutate(numeric_var2 = case_when(
    numeric_var2 < lower_bound ~ lower_bound,
    numeric_var2 > upper_bound ~ upper_bound,
    TRUE ~ numeric_var2
  ))
data_treated
## id category numeric_var1 numeric_var2 date_col total_score
## 1 1 B 39.85995 98.93710 2026-01-02 138.7970
## 2 2 C 48.98786 249.32177 2026-01-03 298.3096
## 3 3 B 50.36490 198.47475 2026-01-04 248.8396
## 4 4 C 52.99155 162.25748 2026-01-05 215.2490
## 5 5 A 34.65931 176.21549 2026-01-06 210.8748
## 6 6 B 55.45951 169.73824 2026-01-07 225.1977
## 7 7 C 56.70598 113.73791 2026-01-08 170.4439
## 8 8 C 46.91420 226.60533 2026-01-09 273.5195
## 9 9 C 50.42080 158.26636 2026-01-10 208.6872
## 10 10 B 49.04221 265.05636 2026-01-11 NA
## 11 11 A 34.32806 180.96633 2026-01-12 215.2944
## 12 12 C 32.71820 220.51275 2026-01-13 253.2310
## 13 13 B 56.42137 226.26129 2026-01-14 282.6827
## 14 14 C 51.49849 131.20701 2026-01-15 182.7055
## 15 15 B 52.63726 308.55053 2026-01-16 361.1878
## 16 16 A 50.45257 246.39870 2026-01-17 296.8513
## 17 17 B 63.51415 211.72606 2026-01-18 275.2402
## 18 18 B 55.60072 162.49053 2026-01-19 218.0912
## 19 19 A 51.58272 203.67153 2026-01-20 255.2543
## 20 20 A 73.26981 170.53306 2026-01-21 243.8029
## 21 21 A 39.23921 326.67005 2026-01-22 394.7489
## 22 22 A 46.42611 179.25899 2026-01-23 225.6851
## 23 23 A 62.31168 121.09582 2026-01-24 183.4075
## 24 24 A 48.90092 175.74931 2026-01-25 224.6502
## 25 25 C 49.04221 267.40506 2026-01-26 NA
## 26 26 B 46.87650 263.45259 2026-01-27 310.3291
## 27 27 B 43.09011 255.21540 2026-01-28 298.3055
## 28 28 C 52.58801 271.75538 2026-01-29 324.3434
## 29 29 A 46.91217 168.85035 2026-01-30 215.7625
## 30 30 A 37.58700 231.21649 2026-01-31 268.8035
## 31 31 C 34.16090 212.78577 2026-02-01 246.9467
## 32 32 C 54.60080 150.11601 2026-02-02 204.7168
## 33 33 B 67.02165 254.98503 2026-02-03 322.0067
## 34 34 B 40.81152 159.33973 2026-02-04 200.1512
## 35 35 A 44.70594 182.80998 2026-02-05 227.5159
## 36 36 A 26.07692 200.55865 2026-02-06 226.6356
## 37 37 C 65.56477 223.52177 2026-02-07 289.0865
## 38 38 B 48.56802 169.46934 2026-02-08 218.0374
## 39 39 C 36.41932 172.29022 2026-02-09 208.7095
## 40 40 C 45.34651 295.63363 2026-02-10 340.9801
## 41 41 B 55.29533 175.57176 2026-02-11 230.8671
## 42 42 C 47.42027 224.43262 2026-02-12 271.8529
## 43 43 A 32.06835 147.46822 2026-02-13 179.5366
## 44 44 A 39.57592 73.18067 2026-02-14 68.3808
## 45 45 A 48.95339 230.64140 2026-02-15 279.5948
## 46 46 B 50.23032 221.86367 2026-02-16 272.0940
## 47 47 C 54.22648 316.31125 2026-02-17 370.5377
## 48 48 C 51.70440 171.80343 2026-02-18 223.5078
## 49 49 C 48.38006 197.60050 2026-02-19 245.9806
## 50 50 B 35.75537 217.71664 2026-02-20 253.4720
## 51 51 C 47.91436 105.51448 2026-02-21 153.4288
## 52 52 A 60.06634 119.79176 2026-02-22 179.8581
## 53 53 B 48.94042 239.35558 2026-02-23 288.2960
## 54 54 A 63.23276 252.86748 2026-02-24 316.1002
## 55 55 A 65.63658 243.31207 2026-02-25 308.9486
## 56 56 A 45.93554 126.93304 2026-02-26 172.8686
## 57 57 A 44.02113 143.42979 2026-02-27 187.4509
## 58 58 B 52.94525 187.45893 2026-02-28 240.4042
## 59 59 B 44.92189 249.15144 2026-03-01 294.0733
## 60 60 C 43.70929 205.87222 2026-03-02 249.5815
## 61 61 A 48.24575 206.20189 2026-03-03 254.4476
## 62 62 A 46.23323 270.37539 2026-03-04 316.6086
## 63 63 A 41.84434 166.40570 2026-03-05 208.2500
## 64 64 B 57.67300 159.73859 2026-03-06 217.4116
## 65 65 B 38.44412 219.63020 2026-03-07 258.0743
## 66 66 A 42.84237 202.17243 2026-03-08 245.0148
## 67 67 A 52.66946 188.68933 2026-03-09 241.3588
## 68 68 A 38.76294 303.53483 2026-03-10 342.2978
## 69 69 A 52.16975 109.94417 2026-03-11 162.1139
## 70 70 A 60.30823 194.43387 2026-03-12 254.7421
## 71 71 A 66.23843 152.28479 2026-03-13 218.5232
## 72 72 B 39.15441 221.26416 2026-03-14 260.4186
## 73 73 A 57.32951 207.22794 2026-03-15 264.5574
## 74 74 A 52.96127 136.56567 2026-03-16 189.5269
## 75 75 C 38.20470 194.03696 2026-03-17 232.2417
## 76 76 C 55.42658 237.31522 2026-03-18 292.7418
## 77 77 A 36.80885 160.78113 2026-03-19 197.5900
## 78 78 A 47.35815 232.79665 2026-03-20 280.1548
## 79 79 A 63.74478 188.98671 2026-03-21 252.7315
## 80 80 B 53.89121 145.08385 2026-03-22 198.9751
## 81 81 B 52.42439 197.27036 2026-03-23 249.6947
## 82 82 A 42.23141 220.64185 2026-03-24 262.8733
## 83 83 B 55.50918 118.93522 2026-03-25 174.4444
## 84 84 C 67.03558 209.77708 2026-03-26 276.8127
## 85 85 A 51.61191 235.39655 2026-03-27 287.0085
## 86 86 B 62.88723 229.19803 2026-03-28 292.0853
## 87 87 A 49.65666 211.64789 2026-03-29 261.3046
## 88 88 C 45.88871 85.55198 2026-03-30 131.4407
## 89 89 A 72.71375 184.25944 2026-03-31 256.9732
## 90 90 A 46.55780 199.64657 2026-04-01 246.2044
## 91 91 C 37.72447 208.35729 2026-04-02 246.0818
## 92 92 C 59.29492 220.02876 2026-04-03 279.3237
## 93 93 A 52.72457 192.19643 2026-04-04 244.9210
## 94 94 B 41.88835 234.24719 2026-04-05 276.1355
## 95 95 C 44.93246 225.71638 2026-04-06 270.6488
## 96 96 C 30.33183 235.21114 2026-04-07 265.5430
## 97 97 B 50.62589 143.84534 2026-04-08 194.4712
## 98 98 A 44.70461 184.44824 2026-04-09 229.1528
## 99 99 C 36.24491 273.08086 2026-04-10 309.3258
## 100 100 C 62.65562 326.67005 2026-04-11 395.7207
# Visualizing categorical data frequencies.
ggplot(data_treated, aes(x = category, fill = category)) +
  geom_bar() +
  labs(title = "Count of Observations per Category", x = "Category", y = "Count") +
  theme_minimal()

# Visualizing the distribution of a continuous variable.
ggplot(data_treated, aes(x = numeric_var1)) +
  geom_histogram(bins = 15, fill = "steelblue", color = "white") +
  labs(title = "Distribution of Numeric Variable 1", x = "Value", y = "Frequency") +
  theme_minimal()

# Exploring the smooth distribution shape (symmetry and skewness).
ggplot(data_treated, aes(x = numeric_var2, fill = category)) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot by Category", x = "Numeric Variable 2", y = "Density") +
  theme_minimal()

# Visualizing distributions, medians, and potential outliers.
ggplot(data_treated, aes(x = category, y = numeric_var2, fill = category)) +
  geom_boxplot() +
  labs(title = "Boxplot of Numeric Variable 2 by Category", x = "Category", y = "Value") +
  theme_minimal()

# Exploring pairwise relationships between two numeric variables.
ggplot(data_treated, aes(x = numeric_var1, y = numeric_var2, color = category)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Scatter Plot: Var1 vs Var2", x = "Numeric Variable 1", y = "Numeric Variable 2") +
  theme_minimal()

# Using the ggplot2 grammar of graphics to create faceted subplots.
ggplot(data_treated, aes(x = numeric_var1, y = numeric_var2)) +
  geom_point(color = "darkred") +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ category) +
  labs(title = "Faceted Scatter Plot with Trend Lines", x = "Var 1", y = "Var 2") +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
# Applying extensive customizations: titles, labels, legends, color scales.
# (linewidth replaces the size aesthetic for lines, deprecated since ggplot2 3.4.0)
ggplot(data_treated, aes(x = date_col, y = total_score, color = category)) +
  geom_line(linewidth = 1) +
  scale_color_brewer(palette = "Set1") +
  labs(title = "Time Series Trend of Total Score",
       subtitle = "Grouped by Category",
       x = "Date",
       y = "Total Score",
       color = "Group Category") +
  theme_classic() +
  theme(legend.position = "bottom",
        plot.title = element_text(face = "bold", hjust = 0.5))
# Computing the correlation matrix for numeric variables.
numeric_data <- data_treated %>% select_if(is.numeric) %>% select(-id)
cor_matrix <- cor(numeric_data, use = "complete.obs")
print(cor_matrix)
## numeric_var1 numeric_var2 total_score
## numeric_var1 1.0000000 -0.0314806 0.1496947
## numeric_var2 -0.0314806 1.0000000 0.9793225
## total_score 0.1496947 0.9793225 1.0000000
corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", tl.col = "black", tl.srt = 45,
         title = "Correlation Heatmap", mar = c(0, 0, 1, 0))

# Basic pairwise exploration using facet_grid() or individual scatter plots;
# here we show a combined plot approach.
ggplot(data_treated, aes(x = numeric_var1, y = total_score)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Pairwise Exploration: Var1 vs Total Score") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
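# The simple regression chunk is not echoed; based on the Call shown in the summary
# below, it was presumably (object name assumed):
simple_model <- lm(numeric_var2 ~ numeric_var1, data = data_treated)
summary(simple_model)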
##
## Call:
## lm(formula = numeric_var2 ~ numeric_var1, data = data_treated)
##
## Residuals:
## Min 1Q Median 3Q Max
## -128.961 -32.851 -0.344 31.188 128.527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 208.9965 27.8678 7.500 2.92e-11 ***
## numeric_var1 -0.1732 0.5558 -0.312 0.756
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.87 on 98 degrees of freedom
## Multiple R-squared: 0.00099, Adjusted R-squared: -0.009204
## F-statistic: 0.09711 on 1 and 98 DF, p-value: 0.756
multi_model <- lm(total_score ~ numeric_var1 + numeric_var2 + category, data = data_treated)
summary(multi_model)
##
## Call:
## lm(formula = total_score ~ numeric_var1 + numeric_var2 + category,
## data = data_treated)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.179 -1.024 0.093 1.355 24.369
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.243502 3.423357 -2.700 0.00824 **
## numeric_var1 1.031660 0.054159 19.049 < 2e-16 ***
## numeric_var2 1.038180 0.010053 103.270 < 2e-16 ***
## categoryB -0.043201 1.271610 -0.034 0.97297
## categoryC 0.003071 1.250160 0.002 0.99805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.132 on 93 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.9917, Adjusted R-squared: 0.9913
## F-statistic: 2776 on 4 and 93 DF, p-value: < 2.2e-16
This project applies R programming concepts end to end, from basic data
manipulation and cleaning through visualization to regression analysis, using the
tidyverse and corrplot packages throughout, as requested.