# Load tools library
library(tools)
# Load tidyverse library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr 1.1.4 âś” readr 2.1.5
## âś” forcats 1.0.0 âś” stringr 1.5.2
## âś” ggplot2 4.0.0 âś” tibble 3.3.0
## âś” lubridate 1.9.4 âś” tidyr 1.3.1
## âś” purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Expoloratory Data Analysis (EDA) Assn using R
Load a data set into R Studio
Describe the rows and columns
Data Cleaning (resolve the noise in the data)
Univariate & Multivariate Summary
Detect Outliers
Other things:
# Call the data table bus_data
# Upload from my Google Drive
# NO CHANGES NEEDED
#https://drive.google.com/file/d/18mVzAExO_toSlyZ3bqeFbg-5LdfMEYcc/view?usp=sharing
# Replace with actual file ID
file_id <- "18mVzAExO_toSlyZ3bqeFbg-5LdfMEYcc"
data_url <- paste0("https://docs.google.com/uc?export=download&id=", file_id)
#Read the data into R
bus_data <- read.csv(data_url, header = TRUE)
head(bus_data)
## Customer_ID Name Age Purchase_Date Product Quantity Price
## 1 1145 Timothy Murray 45 6/15/23 Headphones 1 500
## 2 1286 Kathryn Lawson 36 10/30/23 Headphones 1 300
## 3 1294 James Mitchell 36 6/7/23 Headphones 3 500
## 4 1415 Emily Davis 59 11/2/23 Headphones 2 200
## 5 1437 Kyle Santana 36 12/9/22 Headphones 2 300
## 6 1593 Kelly Dean 35 6/19/24 Headphones 1 300
## Total_Sale Payment_Method NameAge
## 1 500 cash Timothy Murray45
## 2 300 debit Kathryn Lawson36
## 3 1500 Cahs James Mitchell36
## 4 400 Cash Emily Davis59
## 5 600 cash Kyle Santana36
## 6 300 Cash Kelly Dean35
# View the first few lines of the data with head()
head(bus_data)
## Customer_ID Name Age Purchase_Date Product Quantity Price
## 1 1145 Timothy Murray 45 6/15/23 Headphones 1 500
## 2 1286 Kathryn Lawson 36 10/30/23 Headphones 1 300
## 3 1294 James Mitchell 36 6/7/23 Headphones 3 500
## 4 1415 Emily Davis 59 11/2/23 Headphones 2 200
## 5 1437 Kyle Santana 36 12/9/22 Headphones 2 300
## 6 1593 Kelly Dean 35 6/19/24 Headphones 1 300
## Total_Sale Payment_Method NameAge
## 1 500 cash Timothy Murray45
## 2 300 debit Kathryn Lawson36
## 3 1500 Cahs James Mitchell36
## 4 400 Cash Emily Davis59
## 5 600 cash Kyle Santana36
## 6 300 Cash Kelly Dean35
# View the last few lines of the data with tail()
### CHANGE THE COMAMAND FROM head TO tail AND REPLACE ****** WITH bus_data
### REMOVE THE # FROM THE COMMAND LINE
tail(bus_data)
## Customer_ID Name Age Purchase_Date Product Quantity Price
## 495 3572 Frances Moran 46 3/14/23 Tblet 3 300
## 496 6388 Anthony Hogan 54 3/29/23 Tblet 1 1000
## 497 6976 James Pittman 66 5/15/23 Tblet 2 150
## 498 7926 Charles Ward 52 9/12/23 Tblet 2 150
## 499 8327 Kimberly Hall 66 1/27/24 Tblet 2 1000
## 500 8588 Brenda Sanchez 59 9/10/23 Tblet 1 500
## Total_Sale Payment_Method NameAge
## 495 900 Cash Frances Moran46
## 496 1000 Cash Anthony Hogan54
## 497 300 Credit James Pittman66
## 498 300 Cash Charles Ward52
## 499 2000 Cash Kimberly Hall66
## 500 500 Debit Brenda Sanchez59
# Use summary() to summarize each of the columns (variables)
# REPLACE THE ******* WITH THE DATA TABLE NAME OF bus_data & REMOVE THE #
summary(bus_data)
## Customer_ID Name Age Purchase_Date
## Min. :1008 Length:500 Min. :18.00 Length:500
## 1st Qu.:3334 Class :character 1st Qu.:29.00 Class :character
## Median :5454 Mode :character Median :42.00 Mode :character
## Mean :5461 Mean :42.83
## 3rd Qu.:7629 3rd Qu.:56.00
## Max. :9999 Max. :70.00
## NA's :15
## Product Quantity Price Total_Sale
## Length:500 Min. :1.00 Min. : 150.0 Min. : 0.0
## Class :character 1st Qu.:1.00 1st Qu.: 300.0 1st Qu.: 300.0
## Mode :character Median :2.00 Median : 400.0 Median : 600.0
## Mean :1.89 Mean : 475.8 Mean : 878.3
## 3rd Qu.:3.00 3rd Qu.: 600.0 3rd Qu.:1000.0
## Max. :4.00 Max. :1000.0 Max. :3200.0
## NA's :8 NA's :2
## Payment_Method NameAge
## Length:500 Length:500
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
# NO CHANGES NEEDED
print("Count of total missing values for Name")
## [1] "Count of total missing values for Name"
sum(is.na(bus_data$Name))
## [1] 0
print("Count of total missing values for Purchase_Date")
## [1] "Count of total missing values for Purchase_Date"
sum(is.na(bus_data$Purchase_Date))
## [1] 0
print("Count of total missing values for Product")
## [1] "Count of total missing values for Product"
sum(is.na(bus_data$Product))
## [1] 0
print("Count of total missing values for Payment Method")
## [1] "Count of total missing values for Payment Method"
sum(is.na(bus_data$Payment_Method))
## [1] 62
print("Count of total missing values for NameAge")
## [1] "Count of total missing values for NameAge"
sum(is.na(bus_data$NameAge))
## [1] 0
# Create a new data table without the NameAge column
# Call the new data table bd
# REPLACE ********* WITH NameAge
# REMOVE THE #
bd <- subset(bus_data, select = -NameAge)
# remove all rows with NA's in any variables/columns
# REPLACE *** WITH bd IN TWO PLACES BELOW, REMOVE THE # ON THOSE LINES
bd <- na.omit(bd)
# summarize
summary(bd)
## Customer_ID Name Age Purchase_Date
## Min. :1008 Length:416 Min. :18.00 Length:416
## 1st Qu.:3334 Class :character 1st Qu.:29.00 Class :character
## Median :5388 Mode :character Median :42.00 Mode :character
## Mean :5484 Mean :42.85
## 3rd Qu.:7684 3rd Qu.:56.00
## Max. :9999 Max. :70.00
## Product Quantity Price Total_Sale
## Length:416 Min. :1.000 Min. : 150.0 Min. : 150
## Class :character 1st Qu.:1.000 1st Qu.: 300.0 1st Qu.: 300
## Mode :character Median :2.000 Median : 300.0 Median : 600
## Mean :1.935 Mean : 462.7 Mean : 887
## 3rd Qu.:3.000 3rd Qu.: 600.0 3rd Qu.:1000
## Max. :4.000 Max. :1000.0 Max. :3000
## Payment_Method
## Length:416
## Class :character
## Mode :character
##
##
##
# remove all rows with NA's in any variables/columns
bd <- na.omit(bd)
#summarize
summary(bd)
## Customer_ID Name Age Purchase_Date
## Min. :1008 Length:416 Min. :18.00 Length:416
## 1st Qu.:3334 Class :character 1st Qu.:29.00 Class :character
## Median :5388 Mode :character Median :42.00 Mode :character
## Mean :5484 Mean :42.85
## 3rd Qu.:7684 3rd Qu.:56.00
## Max. :9999 Max. :70.00
## Product Quantity Price Total_Sale
## Length:416 Min. :1.000 Min. : 150.0 Min. : 150
## Class :character 1st Qu.:1.000 1st Qu.: 300.0 1st Qu.: 300
## Mode :character Median :2.000 Median : 300.0 Median : 600
## Mean :1.935 Mean : 462.7 Mean : 887
## 3rd Qu.:3.000 3rd Qu.: 600.0 3rd Qu.:1000
## Max. :4.000 Max. :1000.0 Max. :3000
## Payment_Method
## Length:416
## Class :character
## Mode :character
##
##
##
# Find the number of unique Payment_Methods
# Create table for Payment_Method
# REPLACE *** WITH bd
# REPLACE ***** WITH the variable name Payment_Method,
# REMOVE THE # IN FRONT OF THE TWO LINES OF CODE
length(unique(bd$Payment_Method))
## [1] 12
table(bd$Payment_Method)
##
## Cahs Cahsh cash Cash Crdit credit Credit Creditt Debbit Debiit
## 6 6 13 124 3 14 119 3 11 2
## debit Debit
## 11 104
# mutate Payment_Method to be title case
# Find the number of unique Payment_Methods
# Create table for Payment_Method
## REPLACE *** WITH bd
## REPLACE ***** WITH the variable name
## REMOVE THE # FROM THOSE LINES
bd <- mutate(bd,Payment_Method = toTitleCase(bd$Payment_Method))
length(unique(bd$Payment_Method))
## [1] 9
table(bd$Payment_Method)
##
## Cahs Cahsh Cash Crdit Credit Creditt Debbit Debiit Debit
## 6 6 137 3 133 3 11 2 115
# Change Cahs & Cahsh to Cash
### NO CHANGES NECESSARY
bd <- mutate( bd ,Payment_Method = gsub('Cahsh', 'Cash', Payment_Method ))
bd <- mutate( bd ,Payment_Method = gsub('Cahs', 'Cash', Payment_Method ))
# Change Crdit & Creditt to Credit
### MAKE CHANGES TO THE LINES BELOW TO FIX THE MIS-SPELLINGS OF Credit
bd <- mutate( bd ,Payment_Method = gsub('Crdit', 'Credit', Payment_Method ))
bd <- mutate( bd ,Payment_Method = gsub('Creditt', 'Credit', Payment_Method ))
### Change Debbit & Debiit to Debit
### MAKE CHANGES TO THE LINES BELOW TO FIX THE MIS-SPELLINGS OF Debit
bd <- mutate( bd ,Payment_Method = gsub('Debbit', 'Debit', Payment_Method ))
bd <- mutate( bd ,Payment_Method = gsub('Debiit', 'Debit', Payment_Method ))
# Find the number of unique Payment_Methods
# Create table for Payment_Method
### NO CHANGES NECESSARY
length(unique(bd$Payment_Method))
## [1] 3
table(bd$Payment_Method)
##
## Cash Credit Debit
## 149 139 128
# Find the number of unique Product
# Create table for Product
### NO CHANGES NECESSARY
length(unique(bd$Product))
## [1] 13
table(bd$Product)
##
## headphones Headphones laptop Laptop Monitir monitor Monitor
## 20 68 12 86 7 10 67
## Monittor smartphone Smartphone Tablet Tablt Tblet
## 4 27 42 63 4 6
### NO CHANGES NECESSARY
# mutate Product to be title case
bd <- mutate(bd,Product = toTitleCase(bd$Product))
# Find the number of unique Product
# Create table for Product
length(unique(bd$Product))
## [1] 9
table(bd$Product)
##
## Headphones Laptop Monitir Monitor Monittor Smartphone Tablet
## 88 98 7 77 4 69 63
## Tablt Tblet
## 4 6
### Change Monitir & Monittor to Monitor
### NO CHANGES NECESSARY
bd <- mutate( bd ,Product = gsub('Monitir', 'Monitor', Product ))
bd <- mutate( bd ,Product = gsub('Monittor', 'Monitor', Product ))
### Change Tablt & Tblet to Tablet
### MAKE CHANGES TO THE LINES BELOW TO FIX THE MIS-SPELLINGS OF Tablet
bd <- mutate( bd ,Product = gsub('Tablt', 'Tablet', Product ))
bd <- mutate( bd ,Product = gsub('Tblet', 'Tablet', Product ))
# Find the number of unique Product
# Create table for Product
### NO CHANGES NECESSARY
length(unique(bd$Product))
## [1] 5
table(bd$Product)
##
## Headphones Laptop Monitor Smartphone Tablet
## 88 98 88 69 73
# Create summary statistics for Total_Sale
summary(bd$Total_Sale)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 150 300 600 887 1000 3000
# Create a histogram of Total_Sale
hist(bd$Total_Sale)
# Create a boxplot for Total_Sale
boxplot(bd$Total_Sale)
#### MAKE CHANGES NECESSARY TO CREATE SUMMARY STAT, HISTOGRAM AND BOXPLOT FOR QUANTITY
# Create summary statistics for Quantity
summary(bd$Quantity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.935 3.000 4.000
# Create a histogram of Quantity
hist(bd$Quantity)
# Create a boxplot for Quantity
boxplot(bd$Quantity)
# Total sales by Product
tapply(bd$Total_Sale, bd$Product,sum)
## Headphones Laptop Monitor Smartphone Tablet
## 49500 94850 77100 75200 72350
# Total quantity by Product
#### MAKE CHANGES NECESSARY TO CREATE SUMMARY of QUANTITY by product
tapply(bd$Quantity, bd$Product,sum)
## Headphones Laptop Monitor Smartphone Tablet
## 161 209 168 124 143
#MAKE CHANGES NECESSARY TO SUMMARIZE TOTAL SALES BY Payment_Method
# Total sales by Product
tapply(bd$Total_Sale, bd$Payment_Method,sum)
## Cash Credit Debit
## 126400 123950 118650
# Total quantity by Product
tapply(bd$Quantity, bd$Payment_Method,sum)
## Cash Credit Debit
## 279 272 254
### NO CHANGES NECESSARY
table(bd$Payment_Method,bd$Product)
##
## Headphones Laptop Monitor Smartphone Tablet
## Cash 33 44 26 21 25
## Credit 15 30 36 34 24
## Debit 40 24 26 14 24
### NO CHANGES NECESSARY
# Get the length of each string in the 'Purchase_Date' column
string_lengths <- nchar(bd$Purchase_Date)
# Extract the last two characters from each string
# The `start` argument is calculated as the total length minus 1
# The `stop` argument is the total length
year <- substr(bd$Purchase_Date, start = string_lengths - 1, stop = string_lengths)
# Add the new column to the data frame
bd$Year <- year
# Summarize Total Sales by Year
tapply(bd$Total_Sale,bd$Year, sum)
## 22 23 24 74 88
## 69050 164050 134600 1000 300
# remove rows with year of 74 or 88
### REPLACE THE ** WITH 74 AND 88, remove the # in the row
# Find all rows where the "Year" is "74" or "88'
rows_to_delete <- which(bd$Year == "74" | bd$Year =="88" )
# Keep all rows *except* the ones in the 'rows_to_delete' list
bd <- bd[-rows_to_delete, ]
# Summarize Total Sales by Year
tapply(bd$Total_Sale,bd$Year, sum)
## 22 23 24
## 69050 164050 134600
#Create a Bar Chart for total Sales by Year
### NO CHANGES NECESSARY
# Create the new data frame for the Bar Chart
SY <- data.frame(
Yr = unique(bd$Year),
TotalSales = tapply(bd$Total_Sale, bd$Year, sum)
)
# Create the bar chart using base R
barplot(
SY$TotalSales,
names.arg = SY$Year,
main = "Total Sales by Year",
xlab = "Year",
ylab = "Total Sales",
col = "skyblue"
)
library(ggplot2)
library(dplyr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Summarize the data
summary_table <- bd %>%
group_by(Year, Payment_Method) %>%
summarise(Count = n(), .groups = 'drop')
# Create the ggplot
p <- ggplot(summary_table, aes(x = Year, y = Count, fill = Payment_Method)) +
geom_bar(stat = "identity") +
labs(
title = "Payment Method Distribution by Year",
x = "Year",
y = "Number of Transactions",
fill = "Payment Method"
) +
theme_minimal()
ggplotly(p)
library(dplyr)
# Convert date if needed
bd$Purchase_Date <- as.Date(bd$Purchase_Date, format = "%m/%d/%y")
# Pick three random customers
cast <- bd %>% sample_n(3)
# Extract their info
hero <- cast[1, ]
rival <- cast[2, ]
wildcard <- cast[3, ]
# Build the story
story <- paste0(
"🎠On ", format(hero$Purchase_Date, "%B %d, %Y"), ", the store became a stage for high drama.\n\n",
hero$Name, ", age ", hero$Age, ", entered with purpose. Their eyes locked on a shelf stacked with ",
hero$Product, ". But just as they reached out, ", rival$Name, ", age ", rival$Age, ", swooped in.\n\n",
"“Back off,” hissed ", rival$Name, ", clutching ", rival$Quantity, " unit(s) of ", rival$Product,
". “I’ve waited all year for this.”\n\n",
"Before things could escalate, a third figure emerged—", wildcard$Name, ", age ", wildcard$Age,
", sipping a smoothie and radiating chaos. “Why fight?” they said. “I just bought the last one online.”\n\n",
"The crowd gasped. The manager fainted. But in a twist of fate, a hidden stash was revealed.\n\n",
"All three walked away with their treasures:\n",
"- ", hero$Name, " paid $", hero$Total_Sale, " via ", hero$Payment_Method, ".\n",
"- ", rival$Name, " paid $", rival$Total_Sale, " via ", rival$Payment_Method, ".\n",
"- ", wildcard$Name, " paid $", wildcard$Total_Sale, " via ", wildcard$Payment_Method, ".\n\n",
"And thus ended the Great Triple Checkout Clash of ", hero$Year, ". Legends were born. Receipts were printed."
)
cat(story)
## 🎠On September 22, 2023, the store became a stage for high drama.
##
## Kenneth Espinoza, age 56, entered with purpose. Their eyes locked on a shelf stacked with Tablet. But just as they reached out, Cheryl Franco, age 48, swooped in.
##
## “Back off,” hissed Cheryl Franco, clutching 1 unit(s) of Smartphone. “I’ve waited all year for this.”
##
## Before things could escalate, a third figure emerged—Barry Moore, age 38, sipping a smoothie and radiating chaos. “Why fight?” they said. “I just bought the last one online.”
##
## The crowd gasped. The manager fainted. But in a twist of fate, a hidden stash was revealed.
##
## All three walked away with their treasures:
## - Kenneth Espinoza paid $3000 via Credit.
## - Cheryl Franco paid $1000 via Cash.
## - Barry Moore paid $800 via Cash.
##
## And thus ended the Great Triple Checkout Clash of 23. Legends were born. Receipts were printed.
library(dplyr)
# Convert date if needed
bd$Purchase_Date <- as.Date(bd$Purchase_Date, format = "%m/%d/%y")
# Pick three random customers
cast <- bd %>% sample_n(3)
# Extract their info
planner <- cast[1, ]
saboteur <- cast[2, ]
underdog <- cast[3, ]
# Build the story
story <- paste0(
"⏳ *Retail Rush: Countdown to Chaos* — ", format(planner$Purchase_Date, "%B %d, %Y"), "\n\n",
"The store opened at 9:00 AM sharp. By 9:01, the air was electric.\n\n",
planner$Name, ", age ", planner$Age, ", had planned this purchase for weeks. They needed ", planner$Quantity,
" unit(s) of ", planner$Product, " to complete a secret project. But they weren’t alone.\n\n",
saboteur$Name, ", age ", saboteur$Age, ", had hacked the inventory system the night before. Their goal? Delay shipments and sow chaos. ",
"\"Nobody gets what they came for,\" they muttered, slipping through aisles.\n\n",
underdog$Name, ", age ", underdog$Age, ", had no plan—just hope. They’d saved for months to buy ", underdog$Product,
". But when they arrived, the shelves were bare.\n\n",
"Then came the twist: a secret stash hidden behind the clearance rack. The race was on.\n\n",
"With seconds to spare, each customer made their move:\n",
"- ", planner$Name, " paid $", planner$Total_Sale, " via ", planner$Payment_Method, ".\n",
"- ", saboteur$Name, " paid $", saboteur$Total_Sale, " via ", saboteur$Payment_Method, "—but left a trail of corrupted receipts.\n",
"- ", underdog$Name, " paid $", underdog$Total_Sale, " via ", underdog$Payment_Method, ", clutching their prize with tears in their eyes.\n\n",
"The store manager declared it the most intense morning in retail history. And somewhere in the shadows, the saboteur smiled.\n\n",
"*To be continued...*"
)
cat(story)
## ⏳ *Retail Rush: Countdown to Chaos* — January 28, 2023
##
## The store opened at 9:00 AM sharp. By 9:01, the air was electric.
##
## Robert Patel, age 58, had planned this purchase for weeks. They needed 2 unit(s) of Monitor to complete a secret project. But they weren’t alone.
##
## April Bryan, age 47, had hacked the inventory system the night before. Their goal? Delay shipments and sow chaos. "Nobody gets what they came for," they muttered, slipping through aisles.
##
## Michelle Gardner, age 23, had no plan—just hope. They’d saved for months to buy Tablet. But when they arrived, the shelves were bare.
##
## Then came the twist: a secret stash hidden behind the clearance rack. The race was on.
##
## With seconds to spare, each customer made their move:
## - Robert Patel paid $1000 via Credit.
## - April Bryan paid $900 via Credit—but left a trail of corrupted receipts.
## - Michelle Gardner paid $600 via Debit, clutching their prize with tears in their eyes.
##
## The store manager declared it the most intense morning in retail history. And somewhere in the shadows, the saboteur smiled.
##
## *To be continued...*
#generate_message <- function() {
# Create a string with a generalized message.
# Use paste() or paste0() to concatenate strings.
message <- paste0(
"---------------------------------------------------\n",
"🎉 Congratulations!\n\n",
"You have successfully completed the assignment. \n",
"Great work! \n\n",
"Keep up the fantastic effort.\n",
"---------------------------------------------------\n"
)
# Print the message
cat(message)
## ---------------------------------------------------
## 🎉 Congratulations!
##
## You have successfully completed the assignment.
## Great work!
##
## Keep up the fantastic effort.
## ---------------------------------------------------