library(readxl) # For reading Excel (.xls/.xlsx) files into R
library(dplyr) # For data manipulation (filter, select, mutate, group_by, summarise, etc.)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) # For tidying and reshaping data (e.g., handling missing values, pivoting)
library(ggplot2)# For creating powerful and customizable data visualizations
library(writexl)# For writing data frames to Excel (.xlsx) files
library(lubridate)# For easy handling of dates and times (e.g., extract year/month, parse dates)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Always ensure you are in the correct Directory when loading data
setwd("D:\\SCHOOL OF STATISTICIANS\\R ASSIGNMENTS")
getwd()
## [1] "D:/SCHOOL OF STATISTICIANS/R ASSIGNMENTS"
# Load the Excel Data set
Project_Data <-read_excel("Week2_R_Project_Data.xlsx")
# Question; What does each column in the data set represent?
# View the Structure
str(Project_Data)
## tibble [5,000 × 6] (S3: tbl_df/tbl/data.frame)
## $ CustomerID: num [1:5000] 1001 1002 1003 1004 1005 ...
## $ Region : chr [1:5000] "North" "South" "East" "North" ...
## $ Product : chr [1:5000] "Widget C" "Widget C" "Widget C" "Widget C" ...
## $ Quantity : num [1:5000] 5 10 10 10 8 9 2 5 7 6 ...
## $ Price : num [1:5000] 30 30 30 30 30 20 30 30 20 20 ...
## $ Date : chr [1:5000] "2024-01-01" "2024-01-02" "2024-01-03" "2024-01-04" ...
# Show First 5 rows
# Display a sample of the data
head(Project_Data)
## # A tibble: 6 × 6
## CustomerID Region Product Quantity Price Date
## <dbl> <chr> <chr> <dbl> <dbl> <chr>
## 1 1001 North Widget C 5 30 2024-01-01
## 2 1002 South Widget C 10 30 2024-01-02
## 3 1003 East Widget C 10 30 2024-01-03
## 4 1004 North Widget C 10 30 2024-01-04
## 5 1005 North Widget C 8 30 2024-01-05
## 6 1006 South Widget A 9 20 2024-01-06
# Loads the last 5 columns
tail(Project_Data)
## # A tibble: 6 × 6
## CustomerID Region Product Quantity Price Date
## <dbl> <chr> <chr> <dbl> <dbl> <chr>
## 1 5995 North Widget A 5 20 2037-09-03
## 2 5996 East Widget C 4 30 2037-09-04
## 3 5997 West Widget C 1 30 2037-09-05
## 4 5998 North Widget B 7 15 2037-09-06
## 5 5999 West Widget B 9 15 2037-09-07
## 6 6000 South Widget A 6 20 2037-09-08
colnames(Project_Data)
## [1] "CustomerID" "Region" "Product" "Quantity" "Price"
## [6] "Date"
This shows the dataset columns.Date — transaction date Region — geographic market Product — item sold Quantity — units sold Revenue — total value of sale
# Question 2; Are there any missing or inconsistent values in the data set?
colSums(is.na(Project_Data))
## CustomerID Region Product Quantity Price Date
## 0 0 0 0 0 0
# Question 3: What is the range of dates?
Project_Data$Date <- as.Date(Project_Data$Date) # Ensure it is in Date format.
range(Project_Data$Date, na.rm = TRUE)
## [1] "2024-01-01" "2037-09-08"
# Question 4: Data Cleaning with dplyr
# Remove Rows with Missing Values
clean_Project_Data <- Project_Data %>% drop_na()
# Removes: Removes any rows that have NA in any column
nrow(Project_Data) # Original number of rows
## [1] 5000
nrow(clean_Project_Data) # After dropping NAs
## [1] 5000
nrow(Project_Data) - nrow(clean_Project_Data)
## [1] 0
# Question 5: Do any columns have incorrect or unnecessary values?
summary(clean_Project_Data)
## CustomerID Region Product Quantity
## Min. :1001 Length:5000 Length:5000 Min. : 1.000
## 1st Qu.:2251 Class :character Class :character 1st Qu.: 3.000
## Median :3500 Mode :character Mode :character Median : 6.000
## Mean :3500 Mean : 5.545
## 3rd Qu.:4750 3rd Qu.: 8.000
## Max. :6000 Max. :10.000
## Price Date
## Min. :15.00 Min. :2024-01-01
## 1st Qu.:15.00 1st Qu.:2027-06-03
## Median :20.00 Median :2030-11-04
## Mean :21.65 Mean :2030-11-04
## 3rd Qu.:30.00 3rd Qu.:2034-04-07
## Max. :30.00 Max. :2037-09-08
str(clean_Project_Data)
## tibble [5,000 × 6] (S3: tbl_df/tbl/data.frame)
## $ CustomerID: num [1:5000] 1001 1002 1003 1004 1005 ...
## $ Region : chr [1:5000] "North" "South" "East" "North" ...
## $ Product : chr [1:5000] "Widget C" "Widget C" "Widget C" "Widget C" ...
## $ Quantity : num [1:5000] 5 10 10 10 8 9 2 5 7 6 ...
## $ Price : num [1:5000] 30 30 30 30 30 20 30 30 20 20 ...
## $ Date : Date[1:5000], format: "2024-01-01" "2024-01-02" ...
unique(Project_Data$Product)
## [1] "Widget C" "Widget A" "Widget B"
unique(Project_Data$Price)
## [1] 30 20 15
# Question 6: Are there duplicate rows?
sum(duplicated(clean_Project_Data))
## [1] 0
# To remove duplicates
clean_Project_Data <- clean_Project_Data %>% distinct()
# Confirm that duplicates have been removed after running
# Check how many duplicates were there before
sum(duplicated(clean_Project_Data))
## [1] 0
# Compare the number of rows before and after
# Check number of rows before removing duplicates
n_before <- nrow(clean_Project_Data)
# Remove duplicates
clean_Project_Data <- Project_Data %>% distinct()
# Check number of rows after
n_after <- nrow(clean_Project_Data)
colnames(clean_Project_Data)
## [1] "CustomerID" "Region" "Product" "Quantity" "Price"
## [6] "Date"
# Q7.How can I group the data by Region and Product?
grouped_data <- clean_Project_Data %>%
group_by(Region, Product)
# Returns the grouped variables
group_vars(grouped_data)
## [1] "Region" "Product"
# To view
str(clean_Project_Data)
## tibble [5,000 × 6] (S3: tbl_df/tbl/data.frame)
## $ CustomerID: num [1:5000] 1001 1002 1003 1004 1005 ...
## $ Region : chr [1:5000] "North" "South" "East" "North" ...
## $ Product : chr [1:5000] "Widget C" "Widget C" "Widget C" "Widget C" ...
## $ Quantity : num [1:5000] 5 10 10 10 8 9 2 5 7 6 ...
## $ Price : num [1:5000] 30 30 30 30 30 20 30 30 20 20 ...
## $ Date : Date[1:5000], format: "2024-01-01" "2024-01-02" ...
# Output after grouping
group_vars(grouped_data)
## [1] "Region" "Product"
str(grouped_data)
## gropd_df [5,000 × 6] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ CustomerID: num [1:5000] 1001 1002 1003 1004 1005 ...
## $ Region : chr [1:5000] "North" "South" "East" "North" ...
## $ Product : chr [1:5000] "Widget C" "Widget C" "Widget C" "Widget C" ...
## $ Quantity : num [1:5000] 5 10 10 10 8 9 2 5 7 6 ...
## $ Price : num [1:5000] 30 30 30 30 30 20 30 30 20 20 ...
## $ Date : Date[1:5000], format: "2024-01-01" "2024-01-02" ...
## - attr(*, "groups")= tibble [12 × 3] (S3: tbl_df/tbl/data.frame)
## ..$ Region : chr [1:12] "East" "East" "East" "North" ...
## ..$ Product: chr [1:12] "Widget A" "Widget B" "Widget C" "Widget A" ...
## ..$ .rows : list<int> [1:12]
## .. ..$ : int [1:436] 39 60 64 83 117 138 155 162 172 186 ...
## .. ..$ : int [1:414] 36 38 73 75 88 100 125 132 133 134 ...
## .. ..$ : int [1:445] 3 7 8 16 22 31 32 58 63 68 ...
## .. ..$ : int [1:408] 9 11 12 13 14 20 42 55 59 61 ...
## .. ..$ : int [1:381] 41 48 51 79 135 146 151 152 161 169 ...
## .. ..$ : int [1:420] 1 4 5 40 53 65 85 86 101 110 ...
## .. ..$ : int [1:428] 6 24 29 37 54 56 57 69 80 84 ...
## .. ..$ : int [1:432] 17 30 33 47 82 91 92 104 108 116 ...
## .. ..$ : int [1:396] 2 15 18 19 25 44 45 46 67 77 ...
## .. ..$ : int [1:386] 10 21 49 97 109 114 118 123 127 156 ...
## .. ..$ : int [1:452] 23 27 28 34 35 43 66 70 71 72 ...
## .. ..$ : int [1:402] 26 50 52 74 76 90 99 102 103 106 ...
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
# Q8. How do I calculate total quantity and total revenue for each group?
summary_table <- grouped_data %>%
group_by(Region, Product) %>%
summarise(
total_quantity = sum(Quantity, na.rn = TRUE),
total_revenue = sum(Quantity * Price, na.rn = TRUE),
.groups = "drop"
)
print(summary_table)
## # A tibble: 12 × 4
## Region Product total_quantity total_revenue
## <chr> <chr> <dbl> <dbl>
## 1 East Widget A 2451 49001
## 2 East Widget B 2291 34351
## 3 East Widget C 2460 73771
## 4 North Widget A 2346 46901
## 5 North Widget B 2200 32986
## 6 North Widget C 2350 70471
## 7 South Widget A 2444 48861
## 8 South Widget B 2398 35956
## 9 South Widget C 2157 64681
## 10 West Widget A 2041 40801
## 11 West Widget B 2417 36241
## 12 West Widget C 2180 65371
# Q9: Can I sort the summarized results in descending order of total revenue?
summary_sorted <- summary_table %>%
arrange(desc(total_revenue))
# To view the summary descending order
print(summary_sorted)
## # A tibble: 12 × 4
## Region Product total_quantity total_revenue
## <chr> <chr> <dbl> <dbl>
## 1 East Widget C 2460 73771
## 2 North Widget C 2350 70471
## 3 West Widget C 2180 65371
## 4 South Widget C 2157 64681
## 5 East Widget A 2451 49001
## 6 South Widget A 2444 48861
## 7 North Widget A 2346 46901
## 8 West Widget A 2041 40801
## 9 West Widget B 2417 36241
## 10 South Widget B 2398 35956
## 11 East Widget B 2291 34351
## 12 North Widget B 2200 32986
# Q10: How can I export the summarized data to a CSV file?
write.csv(summary_sorted, "summary_output.csv", row.names = FALSE)
# summary_output.csv" The name of the output file to save row.names = FALSE Prevents R from adding row numbers to the CSV
# Q11. Where is the output file saved, and how can I access it?
getwd()
## [1] "D:/SCHOOL OF STATISTICIANS/R ASSIGNMENTS"
#Reflection & Insight Questions
Q12: What insights can you draw? Certain regions dominate sales for specific products. Some products have high revenue but low quantity — indicating higher price.
ggplot(summary_sorted, aes(x = reorder(Product, -total_revenue), y = total_revenue, fill = Region)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Total Revenue by Product and Region",
x = "Product",
y = "Total Revenue (USD)"
) +
theme_minimal()
INSIGHTS FROM THE CHART 1. Widget C is the highest revenue generator overall All four regions (East, North, South, West) generated more revenue from Widget C than any other product.
East region leads, making it the top contributor for Widget C revenue.
2. Widget A performs moderately Generates medium-level revenue across all regions.
Revenue is fairly balanced across East, North, and South, with West slightly lower.
Suggests broad market acceptance but not a standout performer.
3. Widget B is the lowest revenue product in all regions Consistently under performs compared to Widget A and C.
Marginal difference between regions — none exceed 40,000 USD in revenue.
May need marketing push, product enhancement, or even re-evaluation
# Scatter Plot: Quantity vs Revenue per Product
ggplot(summary_sorted, aes(x = total_quantity, y = total_revenue, color = Region)) +
geom_point(size = 4) +
geom_text(aes(label = Product), vjust = -1, size = 3) +
labs(
title = "Quantity vs. Revenue by Product and Region",
x = "Total Quantity Sold",
y = "Total Revenue"
) +
theme_minimal()
Q13: What if customer demographics were added? You could analyze who buys what, enabling targeted marketing.
You could compare purchasing patterns across age/gender segments.
Q14: How to reuse this process? Wrap it in a function or .Rmd file.
Replace only the file path when a new data_set arrives.
Automate cleaning, summary, and export.