## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.95 loaded
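# The import chunk itself is not echoed; a minimal sketch of the likely call that
# produced the read-in message below, assuming a hypothetical file name
# "ecommerce_data.csv":
ecommerce_csv <- read_csv("ecommerce_data.csv")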
## Rows: 1000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Gender, City, State, Region, Product, Category, Payment_Method
## dbl (11): Order_ID, Customer_ID, Age, Quantity, Unit_Price, Discount_%, Tot...
## date (2): Order_Date, Delivery_Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick interactive look at the imported data (View() opens the RStudio viewer
# and produces no output in the knitted document)
View(ecommerce_csv)
# Creating a placeholder dataframe for now so the document can knit
data <- data.frame(
  id = 1:100,
  category = sample(c("A", "B", "C"), 100, replace = TRUE),
  numeric_var1 = rnorm(100, mean = 50, sd = 10),
  numeric_var2 = rnorm(100, mean = 200, sd = 50),
  date_col = as.Date('2026-01-01') + 1:100
)
# Introducing some missing values and duplicates for demonstration
data$numeric_var1[c(10, 25)] <- NA
data <- rbind(data, data[1:5, ])

# Understanding vectors, lists, matrices, and data frames through our imported data.
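# The inspection call is not echoed; the output below matches dplyr's glimpse():
glimpse(data)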
## Rows: 105
## Columns: 5
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ category <chr> "B", "C", "B", "C", "A", "B", "C", "C", "C", "B", "A", "C…
## $ numeric_var1 <dbl> 39.85995, 48.98786, 50.36490, 52.99155, 34.65931, 55.4595…
## $ numeric_var2 <dbl> 98.9371, 249.3218, 198.4747, 162.2575, 176.2155, 169.7382…
## $ date_col <date> 2026-01-02, 2026-01-03, 2026-01-04, 2026-01-05, 2026-01-…
# Selecting specific columns of interest using select().
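# The select() call is not echoed; a minimal sketch matching the columns shown below:
data %>%
  select(id, category, numeric_var1, numeric_var2) %>%
  head()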
## id category numeric_var1 numeric_var2
## 1 1 B 39.85995 98.9371
## 2 2 C 48.98786 249.3218
## 3 3 B 50.36490 198.4747
## 4 4 C 52.99155 162.2575
## 5 5 A 34.65931 176.2155
## 6 6 B 55.45951 169.7382
# Filtering rows based on a condition and sorting the results using filter() and arrange().
filtered_data <- data %>%
  filter(category == "A") %>%
  arrange(desc(numeric_var1))
head(filtered_data)
## id category numeric_var1 numeric_var2 date_col
## 1 20 A 73.26981 170.5331 2026-01-21
## 2 89 A 72.71375 184.2594 2026-03-31
## 3 71 A 66.23843 152.2848 2026-03-13
## 4 55 A 65.63658 243.3121 2026-02-25
## 5 79 A 63.74478 188.9867 2026-03-21
## 6 54 A 63.23276 252.8675 2026-02-24
# Using mutate() to create new calculated columns.
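# The mutate() call is not echoed; judging by the output below, total_score appears
# to be the sum of the two numeric variables:
data <- data %>%
  mutate(total_score = numeric_var1 + numeric_var2)
head(data)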
## id category numeric_var1 numeric_var2 date_col total_score
## 1 1 B 39.85995 98.9371 2026-01-02 138.7970
## 2 2 C 48.98786 249.3218 2026-01-03 298.3096
## 3 3 B 50.36490 198.4747 2026-01-04 248.8396
## 4 4 C 52.99155 162.2575 2026-01-05 215.2490
## 5 5 A 34.65931 176.2155 2026-01-06 210.8748
## 6 6 B 55.45951 169.7382 2026-01-07 225.1977
# Grouping data by a categorical variable and summarizing it.
group_summary <- data %>%
  group_by(category) %>%
  summarise(avg_score = mean(numeric_var2, na.rm = TRUE),
            count = n())
print(group_summary)
## # A tibble: 3 × 3
## category avg_score count
## <chr> <dbl> <int>
## 1 A 191. 43
## 2 B 200. 30
## 3 C 210. 32
# Checking for missing data (NAs) in our dataset.
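# The chunk is not echoed; a typical way to produce the per-column NA counts below:
colSums(is.na(data))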
## id category numeric_var1 numeric_var2 date_col total_score
## 0 0 2 0 0 2
# Handling missing values by dropping them (deletion) or replacing them (imputation);
# the cleaned, de-duplicated result is stored as data_unique below.
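# The cleaning chunk is not echoed; a minimal sketch, assuming duplicate rows are
# dropped with distinct() and the two numeric_var1 NAs are replaced by the column mean:
data_unique <- data %>%
  distinct() %>%
  mutate(numeric_var1 = coalesce(numeric_var1, mean(numeric_var1, na.rm = TRUE)))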
# Calculating mean, median, standard deviation, percentiles, and quartiles.
desc_stats <- data_unique %>%
  summarise(
    mean_val = mean(numeric_var2),
    median_val = median(numeric_var2),
    sd_val = sd(numeric_var2),
    q1 = quantile(numeric_var2, 0.25),
    q3 = quantile(numeric_var2, 0.75),
    iqr_val = IQR(numeric_var2)
  )
print(desc_stats)
## mean_val median_val sd_val q1 q3 iqr_val
## 1 200.3784 201.3655 54.79817 168.2392 231.6115 63.37234
# Identifying outliers based on the Interquartile Range (IQR).
Q1 <- quantile(data_unique$numeric_var2, 0.25)
Q3 <- quantile(data_unique$numeric_var2, 0.75)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- data_unique %>%
  filter(numeric_var2 < lower_bound | numeric_var2 > upper_bound)
print(outliers)
## id category numeric_var1 numeric_var2 date_col total_score
## 1 21 A 39.23921 355.50969 2026-01-22 394.7489
## 2 44 A 39.57592 28.80489 2026-02-14 68.3808
## 3 100 C 62.65562 333.06507 2026-04-11 395.7207
# Capping the outliers to the boundary values instead of removing them.
data_treated <- data_unique %>%
  mutate(numeric_var2 = case_when(
    numeric_var2 < lower_bound ~ lower_bound,
    numeric_var2 > upper_bound ~ upper_bound,
    TRUE ~ numeric_var2
  ))
data_treated
## id category numeric_var1 numeric_var2 date_col total_score
## 1 1 B 39.85995 98.93710 2026-01-02 138.7970
## 2 2 C 48.98786 249.32177 2026-01-03 298.3096
## 3 3 B 50.36490 198.47475 2026-01-04 248.8396
## 4 4 C 52.99155 162.25748 2026-01-05 215.2490
## 5 5 A 34.65931 176.21549 2026-01-06 210.8748
## 6 6 B 55.45951 169.73824 2026-01-07 225.1977
## 7 7 C 56.70598 113.73791 2026-01-08 170.4439
## 8 8 C 46.91420 226.60533 2026-01-09 273.5195
## 9 9 C 50.42080 158.26636 2026-01-10 208.6872
## 10 10 B 49.04221 265.05636 2026-01-11 NA
## 11 11 A 34.32806 180.96633 2026-01-12 215.2944
## 12 12 C 32.71820 220.51275 2026-01-13 253.2310
## 13 13 B 56.42137 226.26129 2026-01-14 282.6827
## 14 14 C 51.49849 131.20701 2026-01-15 182.7055
## 15 15 B 52.63726 308.55053 2026-01-16 361.1878
## 16 16 A 50.45257 246.39870 2026-01-17 296.8513
## 17 17 B 63.51415 211.72606 2026-01-18 275.2402
## 18 18 B 55.60072 162.49053 2026-01-19 218.0912
## 19 19 A 51.58272 203.67153 2026-01-20 255.2543
## 20 20 A 73.26981 170.53306 2026-01-21 243.8029
## 21 21 A 39.23921 326.67005 2026-01-22 394.7489
## 22 22 A 46.42611 179.25899 2026-01-23 225.6851
## 23 23 A 62.31168 121.09582 2026-01-24 183.4075
## 24 24 A 48.90092 175.74931 2026-01-25 224.6502
## 25 25 C 49.04221 267.40506 2026-01-26 NA
## 26 26 B 46.87650 263.45259 2026-01-27 310.3291
## 27 27 B 43.09011 255.21540 2026-01-28 298.3055
## 28 28 C 52.58801 271.75538 2026-01-29 324.3434
## 29 29 A 46.91217 168.85035 2026-01-30 215.7625
## 30 30 A 37.58700 231.21649 2026-01-31 268.8035
## 31 31 C 34.16090 212.78577 2026-02-01 246.9467
## 32 32 C 54.60080 150.11601 2026-02-02 204.7168
## 33 33 B 67.02165 254.98503 2026-02-03 322.0067
## 34 34 B 40.81152 159.33973 2026-02-04 200.1512
## 35 35 A 44.70594 182.80998 2026-02-05 227.5159
## 36 36 A 26.07692 200.55865 2026-02-06 226.6356
## 37 37 C 65.56477 223.52177 2026-02-07 289.0865
## 38 38 B 48.56802 169.46934 2026-02-08 218.0374
## 39 39 C 36.41932 172.29022 2026-02-09 208.7095
## 40 40 C 45.34651 295.63363 2026-02-10 340.9801
## 41 41 B 55.29533 175.57176 2026-02-11 230.8671
## 42 42 C 47.42027 224.43262 2026-02-12 271.8529
## 43 43 A 32.06835 147.46822 2026-02-13 179.5366
## 44 44 A 39.57592 73.18067 2026-02-14 68.3808
## 45 45 A 48.95339 230.64140 2026-02-15 279.5948
## 46 46 B 50.23032 221.86367 2026-02-16 272.0940
## 47 47 C 54.22648 316.31125 2026-02-17 370.5377
## 48 48 C 51.70440 171.80343 2026-02-18 223.5078
## 49 49 C 48.38006 197.60050 2026-02-19 245.9806
## 50 50 B 35.75537 217.71664 2026-02-20 253.4720
## 51 51 C 47.91436 105.51448 2026-02-21 153.4288
## 52 52 A 60.06634 119.79176 2026-02-22 179.8581
## 53 53 B 48.94042 239.35558 2026-02-23 288.2960
## 54 54 A 63.23276 252.86748 2026-02-24 316.1002
## 55 55 A 65.63658 243.31207 2026-02-25 308.9486
## 56 56 A 45.93554 126.93304 2026-02-26 172.8686
## 57 57 A 44.02113 143.42979 2026-02-27 187.4509
## 58 58 B 52.94525 187.45893 2026-02-28 240.4042
## 59 59 B 44.92189 249.15144 2026-03-01 294.0733
## 60 60 C 43.70929 205.87222 2026-03-02 249.5815
## 61 61 A 48.24575 206.20189 2026-03-03 254.4476
## 62 62 A 46.23323 270.37539 2026-03-04 316.6086
## 63 63 A 41.84434 166.40570 2026-03-05 208.2500
## 64 64 B 57.67300 159.73859 2026-03-06 217.4116
## 65 65 B 38.44412 219.63020 2026-03-07 258.0743
## 66 66 A 42.84237 202.17243 2026-03-08 245.0148
## 67 67 A 52.66946 188.68933 2026-03-09 241.3588
## 68 68 A 38.76294 303.53483 2026-03-10 342.2978
## 69 69 A 52.16975 109.94417 2026-03-11 162.1139
## 70 70 A 60.30823 194.43387 2026-03-12 254.7421
## 71 71 A 66.23843 152.28479 2026-03-13 218.5232
## 72 72 B 39.15441 221.26416 2026-03-14 260.4186
## 73 73 A 57.32951 207.22794 2026-03-15 264.5574
## 74 74 A 52.96127 136.56567 2026-03-16 189.5269
## 75 75 C 38.20470 194.03696 2026-03-17 232.2417
## 76 76 C 55.42658 237.31522 2026-03-18 292.7418
## 77 77 A 36.80885 160.78113 2026-03-19 197.5900
## 78 78 A 47.35815 232.79665 2026-03-20 280.1548
## 79 79 A 63.74478 188.98671 2026-03-21 252.7315
## 80 80 B 53.89121 145.08385 2026-03-22 198.9751
## 81 81 B 52.42439 197.27036 2026-03-23 249.6947
## 82 82 A 42.23141 220.64185 2026-03-24 262.8733
## 83 83 B 55.50918 118.93522 2026-03-25 174.4444
## 84 84 C 67.03558 209.77708 2026-03-26 276.8127
## 85 85 A 51.61191 235.39655 2026-03-27 287.0085
## 86 86 B 62.88723 229.19803 2026-03-28 292.0853
## 87 87 A 49.65666 211.64789 2026-03-29 261.3046
## 88 88 C 45.88871 85.55198 2026-03-30 131.4407
## 89 89 A 72.71375 184.25944 2026-03-31 256.9732
## 90 90 A 46.55780 199.64657 2026-04-01 246.2044
## 91 91 C 37.72447 208.35729 2026-04-02 246.0818
## 92 92 C 59.29492 220.02876 2026-04-03 279.3237
## 93 93 A 52.72457 192.19643 2026-04-04 244.9210
## 94 94 B 41.88835 234.24719 2026-04-05 276.1355
## 95 95 C 44.93246 225.71638 2026-04-06 270.6488
## 96 96 C 30.33183 235.21114 2026-04-07 265.5430
## 97 97 B 50.62589 143.84534 2026-04-08 194.4712
## 98 98 A 44.70461 184.44824 2026-04-09 229.1528
## 99 99 C 36.24491 273.08086 2026-04-10 309.3258
## 100 100 C 62.65562 326.67005 2026-04-11 395.7207
# Visualizing categorical data frequencies.
ggplot(data_treated, aes(x = category, fill = category)) +
  geom_bar() +
  labs(title = "Count of Observations per Category", x = "Category", y = "Count") +
  theme_minimal()

# Visualizing the distribution of a continuous variable.
ggplot(data_treated, aes(x = numeric_var1)) +
  geom_histogram(bins = 15, fill = "steelblue", color = "white") +
  labs(title = "Distribution of Numeric Variable 1", x = "Value", y = "Frequency") +
  theme_minimal()

# Exploring the smooth distribution shape (symmetry and skewness).
ggplot(data_treated, aes(x = numeric_var2, fill = category)) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot by Category", x = "Numeric Variable 2", y = "Density") +
  theme_minimal()

# Visualizing distributions, medians, and potential outliers.
ggplot(data_treated, aes(x = category, y = numeric_var2, fill = category)) +
  geom_boxplot() +
  labs(title = "Boxplot of Numeric Variable 2 by Category", x = "Category", y = "Value") +
  theme_minimal()

# Exploring pairwise relationships between two numeric variables.
ggplot(data_treated, aes(x = numeric_var1, y = numeric_var2, color = category)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Scatter Plot: Var1 vs Var2", x = "Numeric Variable 1", y = "Numeric Variable 2") +
  theme_minimal()

# Using the ggplot2 grammar of graphics to create faceted subplots.
ggplot(data_treated, aes(x = numeric_var1, y = numeric_var2)) +
  geom_point(color = "darkred") +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ category) +
  labs(title = "Faceted Scatter Plot with Trend Lines", x = "Var 1", y = "Var 2") +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
# Applying extensive customizations: titles, labels, legends, color scales.
# (linewidth replaces the size aesthetic for lines, deprecated since ggplot2 3.4.0)
ggplot(data_treated, aes(x = date_col, y = total_score, color = category)) +
  geom_line(linewidth = 1) +
  scale_color_brewer(palette = "Set1") +
  labs(title = "Time Series Trend of Total Score",
       subtitle = "Grouped by Category",
       x = "Date",
       y = "Total Score",
       color = "Group Category") +
  theme_classic() +
  theme(legend.position = "bottom",
        plot.title = element_text(face = "bold", hjust = 0.5))
# Computing the correlation matrix for numeric variables.
numeric_data <- data_treated %>% select_if(is.numeric) %>% select(-id)
cor_matrix <- cor(numeric_data, use = "complete.obs")
print(cor_matrix)
## numeric_var1 numeric_var2 total_score
## numeric_var1 1.0000000 -0.0314806 0.1496947
## numeric_var2 -0.0314806 1.0000000 0.9793225
## total_score 0.1496947 0.9793225 1.0000000
corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", tl.col = "black", tl.srt = 45,
         title = "Correlation Heatmap", mar = c(0, 0, 1, 0))

# Basic pairwise exploration using facet_grid() or individual scatter plots;
# here we show a combined plot approach.
ggplot(data_treated, aes(x = numeric_var1, y = total_score)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Pairwise Exploration: Var1 vs Total Score") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
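# The simple regression chunk is not echoed; based on the Call shown in the summary
# below, it was presumably (object name assumed):
simple_model <- lm(numeric_var2 ~ numeric_var1, data = data_treated)
summary(simple_model)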
##
## Call:
## lm(formula = numeric_var2 ~ numeric_var1, data = data_treated)
##
## Residuals:
## Min 1Q Median 3Q Max
## -128.961 -32.851 -0.344 31.188 128.527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 208.9965 27.8678 7.500 2.92e-11 ***
## numeric_var1 -0.1732 0.5558 -0.312 0.756
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.87 on 98 degrees of freedom
## Multiple R-squared: 0.00099, Adjusted R-squared: -0.009204
## F-statistic: 0.09711 on 1 and 98 DF, p-value: 0.756
multi_model <- lm(total_score ~ numeric_var1 + numeric_var2 + category, data = data_treated)
summary(multi_model)
##
## Call:
## lm(formula = total_score ~ numeric_var1 + numeric_var2 + category,
## data = data_treated)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.179 -1.024 0.093 1.355 24.369
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.243502 3.423357 -2.700 0.00824 **
## numeric_var1 1.031660 0.054159 19.049 < 2e-16 ***
## numeric_var2 1.038180 0.010053 103.270 < 2e-16 ***
## categoryB -0.043201 1.271610 -0.034 0.97297
## categoryC 0.003071 1.250160 0.002 0.99805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.132 on 93 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.9917, Adjusted R-squared: 0.9913
## F-statistic: 2776 on 4 and 93 DF, p-value: < 2.2e-16
This project applies R programming concepts end to end, from basic data
manipulation and cleaning through visualization to regression analysis, using the
tidyverse and corrplot packages throughout, as requested.