library(readxl) # For reading Excel (.xls/.xlsx) files into R
library(dplyr) # For data manipulation (filter, select, mutate, group_by, summarise, etc.)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr) #  For tidying and reshaping data (e.g., handling missing values, pivoting)
library(ggplot2)# For creating powerful and customizable data visualizations
library(writexl)# For writing data frames to Excel (.xlsx) files
library(lubridate)# For easy handling of dates and times (e.g., extract year/month, parse dates)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Always ensure you are in the correct Directory when loading data
setwd("D:\\SCHOOL OF STATISTICIANS\\R ASSIGNMENTS")
getwd()
## [1] "D:/SCHOOL OF STATISTICIANS/R ASSIGNMENTS"
# Load the Excel Data set
Project_Data <-read_excel("Week2_R_Project_Data.xlsx")

Understanding the Dataset

# Question; What does each column in the data set represent?
# View the Structure
str(Project_Data)
## tibble [5,000 × 6] (S3: tbl_df/tbl/data.frame)
##  $ CustomerID: num [1:5000] 1001 1002 1003 1004 1005 ...
##  $ Region    : chr [1:5000] "North" "South" "East" "North" ...
##  $ Product   : chr [1:5000] "Widget C" "Widget C" "Widget C" "Widget C" ...
##  $ Quantity  : num [1:5000] 5 10 10 10 8 9 2 5 7 6 ...
##  $ Price     : num [1:5000] 30 30 30 30 30 20 30 30 20 20 ...
##  $ Date      : chr [1:5000] "2024-01-01" "2024-01-02" "2024-01-03" "2024-01-04" ...
# Show First 5 rows
# Display a sample of the data
head(Project_Data)
## # A tibble: 6 × 6
##   CustomerID Region Product  Quantity Price Date      
##        <dbl> <chr>  <chr>       <dbl> <dbl> <chr>     
## 1       1001 North  Widget C        5    30 2024-01-01
## 2       1002 South  Widget C       10    30 2024-01-02
## 3       1003 East   Widget C       10    30 2024-01-03
## 4       1004 North  Widget C       10    30 2024-01-04
## 5       1005 North  Widget C        8    30 2024-01-05
## 6       1006 South  Widget A        9    20 2024-01-06
# Loads the last 5 columns
tail(Project_Data)
## # A tibble: 6 × 6
##   CustomerID Region Product  Quantity Price Date      
##        <dbl> <chr>  <chr>       <dbl> <dbl> <chr>     
## 1       5995 North  Widget A        5    20 2037-09-03
## 2       5996 East   Widget C        4    30 2037-09-04
## 3       5997 West   Widget C        1    30 2037-09-05
## 4       5998 North  Widget B        7    15 2037-09-06
## 5       5999 West   Widget B        9    15 2037-09-07
## 6       6000 South  Widget A        6    20 2037-09-08
colnames(Project_Data)
## [1] "CustomerID" "Region"     "Product"    "Quantity"   "Price"     
## [6] "Date"

This shows the dataset columns.Date — transaction date Region — geographic market Product — item sold Quantity — units sold Revenue — total value of sale

 # Question 2; Are there any missing or inconsistent values in the data set?
colSums(is.na(Project_Data))
## CustomerID     Region    Product   Quantity      Price       Date 
##          0          0          0          0          0          0

Question 3; What is the range of dates in the Dataset?

# Question 3: What is the range of dates?
Project_Data$Date <- as.Date(Project_Data$Date) # Ensure it is in Date format.
range(Project_Data$Date, na.rm = TRUE)
## [1] "2024-01-01" "2037-09-08"

Data Cleaning with dplyr

# Question 4: Data Cleaning with dplyr
# Remove Rows with Missing Values

clean_Project_Data <- Project_Data %>% drop_na()

# Removes: Removes any rows that have NA in any column
nrow(Project_Data)  # Original number of rows
## [1] 5000
nrow(clean_Project_Data)  # After dropping NAs
## [1] 5000
nrow(Project_Data) - nrow(clean_Project_Data)
## [1] 0
# Question 5:  Do any columns have incorrect or unnecessary values?

  summary(clean_Project_Data)
##    CustomerID      Region            Product             Quantity     
##  Min.   :1001   Length:5000        Length:5000        Min.   : 1.000  
##  1st Qu.:2251   Class :character   Class :character   1st Qu.: 3.000  
##  Median :3500   Mode  :character   Mode  :character   Median : 6.000  
##  Mean   :3500                                         Mean   : 5.545  
##  3rd Qu.:4750                                         3rd Qu.: 8.000  
##  Max.   :6000                                         Max.   :10.000  
##      Price            Date           
##  Min.   :15.00   Min.   :2024-01-01  
##  1st Qu.:15.00   1st Qu.:2027-06-03  
##  Median :20.00   Median :2030-11-04  
##  Mean   :21.65   Mean   :2030-11-04  
##  3rd Qu.:30.00   3rd Qu.:2034-04-07  
##  Max.   :30.00   Max.   :2037-09-08
str(clean_Project_Data)
## tibble [5,000 × 6] (S3: tbl_df/tbl/data.frame)
##  $ CustomerID: num [1:5000] 1001 1002 1003 1004 1005 ...
##  $ Region    : chr [1:5000] "North" "South" "East" "North" ...
##  $ Product   : chr [1:5000] "Widget C" "Widget C" "Widget C" "Widget C" ...
##  $ Quantity  : num [1:5000] 5 10 10 10 8 9 2 5 7 6 ...
##  $ Price     : num [1:5000] 30 30 30 30 30 20 30 30 20 20 ...
##  $ Date      : Date[1:5000], format: "2024-01-01" "2024-01-02" ...
unique(Project_Data$Product)
## [1] "Widget C" "Widget A" "Widget B"
unique(Project_Data$Price)
## [1] 30 20 15
# Question 6: Are there duplicate rows?
sum(duplicated(clean_Project_Data))
## [1] 0
# To remove duplicates
clean_Project_Data <- clean_Project_Data %>% distinct()
# Confirm  that duplicates have been removed after running 
# Check how many duplicates were there before
sum(duplicated(clean_Project_Data))
## [1] 0
# Compare the number of rows before and after
# Check number of rows before removing duplicates
n_before <- nrow(clean_Project_Data)

# Remove duplicates
clean_Project_Data <- Project_Data %>% distinct()

# Check number of rows after
n_after <- nrow(clean_Project_Data)
colnames(clean_Project_Data)
## [1] "CustomerID" "Region"     "Product"    "Quantity"   "Price"     
## [6] "Date"

Grouping and Summarizing

# Q7.How can I group the data by Region and Product?

grouped_data <- clean_Project_Data %>%
  group_by(Region, Product)
# Returns the grouped variables
group_vars(grouped_data)
## [1] "Region"  "Product"
# To view 
str(clean_Project_Data)
## tibble [5,000 × 6] (S3: tbl_df/tbl/data.frame)
##  $ CustomerID: num [1:5000] 1001 1002 1003 1004 1005 ...
##  $ Region    : chr [1:5000] "North" "South" "East" "North" ...
##  $ Product   : chr [1:5000] "Widget C" "Widget C" "Widget C" "Widget C" ...
##  $ Quantity  : num [1:5000] 5 10 10 10 8 9 2 5 7 6 ...
##  $ Price     : num [1:5000] 30 30 30 30 30 20 30 30 20 20 ...
##  $ Date      : Date[1:5000], format: "2024-01-01" "2024-01-02" ...

Grouping and Summarizing

# Output after grouping
group_vars(grouped_data)
## [1] "Region"  "Product"
str(grouped_data)
## gropd_df [5,000 × 6] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ CustomerID: num [1:5000] 1001 1002 1003 1004 1005 ...
##  $ Region    : chr [1:5000] "North" "South" "East" "North" ...
##  $ Product   : chr [1:5000] "Widget C" "Widget C" "Widget C" "Widget C" ...
##  $ Quantity  : num [1:5000] 5 10 10 10 8 9 2 5 7 6 ...
##  $ Price     : num [1:5000] 30 30 30 30 30 20 30 30 20 20 ...
##  $ Date      : Date[1:5000], format: "2024-01-01" "2024-01-02" ...
##  - attr(*, "groups")= tibble [12 × 3] (S3: tbl_df/tbl/data.frame)
##   ..$ Region : chr [1:12] "East" "East" "East" "North" ...
##   ..$ Product: chr [1:12] "Widget A" "Widget B" "Widget C" "Widget A" ...
##   ..$ .rows  : list<int> [1:12] 
##   .. ..$ : int [1:436] 39 60 64 83 117 138 155 162 172 186 ...
##   .. ..$ : int [1:414] 36 38 73 75 88 100 125 132 133 134 ...
##   .. ..$ : int [1:445] 3 7 8 16 22 31 32 58 63 68 ...
##   .. ..$ : int [1:408] 9 11 12 13 14 20 42 55 59 61 ...
##   .. ..$ : int [1:381] 41 48 51 79 135 146 151 152 161 169 ...
##   .. ..$ : int [1:420] 1 4 5 40 53 65 85 86 101 110 ...
##   .. ..$ : int [1:428] 6 24 29 37 54 56 57 69 80 84 ...
##   .. ..$ : int [1:432] 17 30 33 47 82 91 92 104 108 116 ...
##   .. ..$ : int [1:396] 2 15 18 19 25 44 45 46 67 77 ...
##   .. ..$ : int [1:386] 10 21 49 97 109 114 118 123 127 156 ...
##   .. ..$ : int [1:452] 23 27 28 34 35 43 66 70 71 72 ...
##   .. ..$ : int [1:402] 26 50 52 74 76 90 99 102 103 106 ...
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
# Q8. How do I calculate total quantity and total revenue for each group?

summary_table <- grouped_data %>%
  group_by(Region, Product) %>%
  summarise(
    total_quantity = sum(Quantity, na.rn = TRUE),
    total_revenue = sum(Quantity * Price, na.rn = TRUE),
    .groups = "drop"
  )
print(summary_table)
## # A tibble: 12 × 4
##    Region Product  total_quantity total_revenue
##    <chr>  <chr>             <dbl>         <dbl>
##  1 East   Widget A           2451         49001
##  2 East   Widget B           2291         34351
##  3 East   Widget C           2460         73771
##  4 North  Widget A           2346         46901
##  5 North  Widget B           2200         32986
##  6 North  Widget C           2350         70471
##  7 South  Widget A           2444         48861
##  8 South  Widget B           2398         35956
##  9 South  Widget C           2157         64681
## 10 West   Widget A           2041         40801
## 11 West   Widget B           2417         36241
## 12 West   Widget C           2180         65371
# Q9: Can I sort the summarized results in descending order of total revenue?

summary_sorted <- summary_table %>%
  arrange(desc(total_revenue))
# To view the summary descending order
print(summary_sorted)
## # A tibble: 12 × 4
##    Region Product  total_quantity total_revenue
##    <chr>  <chr>             <dbl>         <dbl>
##  1 East   Widget C           2460         73771
##  2 North  Widget C           2350         70471
##  3 West   Widget C           2180         65371
##  4 South  Widget C           2157         64681
##  5 East   Widget A           2451         49001
##  6 South  Widget A           2444         48861
##  7 North  Widget A           2346         46901
##  8 West   Widget A           2041         40801
##  9 West   Widget B           2417         36241
## 10 South  Widget B           2398         35956
## 11 East   Widget B           2291         34351
## 12 North  Widget B           2200         32986
# Q10: How can I export the summarized data to a CSV file?

write.csv(summary_sorted, "summary_output.csv", row.names = FALSE)

# summary_output.csv"   The name of the output file to save row.names = FALSE   Prevents R from adding row numbers to the CSV
# Q11. Where is the output file saved, and how can I access it?
getwd()
## [1] "D:/SCHOOL OF STATISTICIANS/R ASSIGNMENTS"

#Reflection & Insight Questions

Q12: What insights can you draw? Certain regions dominate sales for specific products. Some products have high revenue but low quantity — indicating higher price.

ggplot(summary_sorted, aes(x = reorder(Product, -total_revenue), y = total_revenue, fill = Region)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Total Revenue by Product and Region",
    x = "Product",
    y = "Total Revenue (USD)"
  ) +
  theme_minimal()

INSIGHTS FROM THE CHART 1. Widget C is the highest revenue generator overall All four regions (East, North, South, West) generated more revenue from Widget C than any other product.

East region leads, making it the top contributor for Widget C revenue.

2. Widget A performs moderately Generates medium-level revenue across all regions.

Revenue is fairly balanced across East, North, and South, with West slightly lower.

Suggests broad market acceptance but not a standout performer.

3. Widget B is the lowest revenue product in all regions Consistently under performs compared to Widget A and C.

Marginal difference between regions — none exceed 40,000 USD in revenue.

May need marketing push, product enhancement, or even re-evaluation

# Scatter Plot: Quantity vs Revenue per Product
ggplot(summary_sorted, aes(x = total_quantity, y = total_revenue, color = Region)) +
  geom_point(size = 4) +
  geom_text(aes(label = Product), vjust = -1, size = 3) +
  labs(
    title = "Quantity vs. Revenue by Product and Region",
    x = "Total Quantity Sold",
    y = "Total Revenue"
  ) +
  theme_minimal()

Q13: What if customer demographics were added? You could analyze who buys what, enabling targeted marketing.

You could compare purchasing patterns across age/gender segments.

Q14: How to reuse this process? Wrap it in a function or .Rmd file.

Replace only the file path when a new data_set arrives.

Automate cleaning, summary, and export.