# Load tools library
library(tools)
# Load tidyverse library
library(tidyverse)
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Expoloratory Data Analysis (EDA) Assn using R
Load a data set into R Studio
Describe the rows and columns
Data Cleaning (resolve the noise in the data)
Univariate & Multivariate Summary
Detect Outliers
Other things:
# Call the data table bus_data
# Upload from my Google Drive
# NO CHANGES NEEDED
#https://drive.google.com/file/d/18mVzAExO_toSlyZ3bqeFbg-5LdfMEYcc/view?usp=sharing
# Replace with actual file ID
file_id <- "18mVzAExO_toSlyZ3bqeFbg-5LdfMEYcc"
data_url <- paste0("https://docs.google.com/uc?export=download&id=", file_id)
#Read the data into R
bus_data <- read.csv(data_url, header = TRUE)
head(bus_data)
## Customer_ID Name Age Purchase_Date Product Quantity Price
## 1 1145 Timothy Murray 45 6/15/23 Headphones 1 500
## 2 1286 Kathryn Lawson 36 10/30/23 Headphones 1 300
## 3 1294 James Mitchell 36 6/7/23 Headphones 3 500
## 4 1415 Emily Davis 59 11/2/23 Headphones 2 200
## 5 1437 Kyle Santana 36 12/9/22 Headphones 2 300
## 6 1593 Kelly Dean 35 6/19/24 Headphones 1 300
## Total_Sale Payment_Method NameAge
## 1 500 cash Timothy Murray45
## 2 300 debit Kathryn Lawson36
## 3 1500 Cahs James Mitchell36
## 4 400 Cash Emily Davis59
## 5 600 cash Kyle Santana36
## 6 300 Cash Kelly Dean35
# View the first few lines of the data with head()
head(bus_data)
## Customer_ID Name Age Purchase_Date Product Quantity Price
## 1 1145 Timothy Murray 45 6/15/23 Headphones 1 500
## 2 1286 Kathryn Lawson 36 10/30/23 Headphones 1 300
## 3 1294 James Mitchell 36 6/7/23 Headphones 3 500
## 4 1415 Emily Davis 59 11/2/23 Headphones 2 200
## 5 1437 Kyle Santana 36 12/9/22 Headphones 2 300
## 6 1593 Kelly Dean 35 6/19/24 Headphones 1 300
## Total_Sale Payment_Method NameAge
## 1 500 cash Timothy Murray45
## 2 300 debit Kathryn Lawson36
## 3 1500 Cahs James Mitchell36
## 4 400 Cash Emily Davis59
## 5 600 cash Kyle Santana36
## 6 300 Cash Kelly Dean35
# View the last few lines of the data with tail()
### CHANGE THE COMAMAND FROM head TO tail AND REPLACE ****** WITH bus_data
### REMOVE THE # FROM THE COMMAND LINE
tail(bus_data)
## Customer_ID Name Age Purchase_Date Product Quantity Price
## 495 3572 Frances Moran 46 3/14/23 Tblet 3 300
## 496 6388 Anthony Hogan 54 3/29/23 Tblet 1 1000
## 497 6976 James Pittman 66 5/15/23 Tblet 2 150
## 498 7926 Charles Ward 52 9/12/23 Tblet 2 150
## 499 8327 Kimberly Hall 66 1/27/24 Tblet 2 1000
## 500 8588 Brenda Sanchez 59 9/10/23 Tblet 1 500
## Total_Sale Payment_Method NameAge
## 495 900 Cash Frances Moran46
## 496 1000 Cash Anthony Hogan54
## 497 300 Credit James Pittman66
## 498 300 Cash Charles Ward52
## 499 2000 Cash Kimberly Hall66
## 500 500 Debit Brenda Sanchez59
# Use summary() to summarize each of the columns (variables)
# REPLACE THE ******* WITH THE DATA TABLE NAME OF bus_data & REMOVE THE #
summary(bus_data)
## Customer_ID Name Age Purchase_Date
## Min. :1008 Length:500 Min. :18.00 Length:500
## 1st Qu.:3334 Class :character 1st Qu.:29.00 Class :character
## Median :5454 Mode :character Median :42.00 Mode :character
## Mean :5461 Mean :42.83
## 3rd Qu.:7629 3rd Qu.:56.00
## Max. :9999 Max. :70.00
## NA's :15
## Product Quantity Price Total_Sale
## Length:500 Min. :1.00 Min. : 150.0 Min. : 0.0
## Class :character 1st Qu.:1.00 1st Qu.: 300.0 1st Qu.: 300.0
## Mode :character Median :2.00 Median : 400.0 Median : 600.0
## Mean :1.89 Mean : 475.8 Mean : 878.3
## 3rd Qu.:3.00 3rd Qu.: 600.0 3rd Qu.:1000.0
## Max. :4.00 Max. :1000.0 Max. :3200.0
## NA's :8 NA's :2
## Payment_Method NameAge
## Length:500 Length:500
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
# NO CHANGES NEEDED
print("Count of total missing values for Name")
## [1] "Count of total missing values for Name"
sum(is.na(bus_data$Name))
## [1] 0
print("Count of total missing values for Purchase_Date")
## [1] "Count of total missing values for Purchase_Date"
sum(is.na(bus_data$Purchase_Date))
## [1] 0
print("Count of total missing values for Product")
## [1] "Count of total missing values for Product"
sum(is.na(bus_data$Product))
## [1] 0
print("Count of total missing values for Payment Method")
## [1] "Count of total missing values for Payment Method"
sum(is.na(bus_data$Payment_Method))
## [1] 62
print("Count of total missing values for NameAge")
## [1] "Count of total missing values for NameAge"
sum(is.na(bus_data$NameAge))
## [1] 0
# Create a new data table without the NameAge column
# Call the new data table bd
# REPLACE ********* WITH NameAge
# REMOVE THE #
bd <- subset(bus_data, select = -NameAge)
# remove all rows with NA's in any variables/columns
# REPLACE *** WITH bd IN TWO PLACES BELOW, REMOVE THE # ON THOSE LINES
bd <- na.omit(bd)
# summarize
summary(bd)
## Customer_ID Name Age Purchase_Date
## Min. :1008 Length:416 Min. :18.00 Length:416
## 1st Qu.:3334 Class :character 1st Qu.:29.00 Class :character
## Median :5388 Mode :character Median :42.00 Mode :character
## Mean :5484 Mean :42.85
## 3rd Qu.:7684 3rd Qu.:56.00
## Max. :9999 Max. :70.00
## Product Quantity Price Total_Sale
## Length:416 Min. :1.000 Min. : 150.0 Min. : 150
## Class :character 1st Qu.:1.000 1st Qu.: 300.0 1st Qu.: 300
## Mode :character Median :2.000 Median : 300.0 Median : 600
## Mean :1.935 Mean : 462.7 Mean : 887
## 3rd Qu.:3.000 3rd Qu.: 600.0 3rd Qu.:1000
## Max. :4.000 Max. :1000.0 Max. :3000
## Payment_Method
## Length:416
## Class :character
## Mode :character
##
##
##
# remove all rows with NA's in any variables/columns
bd <- na.omit(bd)
summarize
## function (.data, ..., .by = NULL, .groups = NULL)
## {
## by <- enquo(.by)
## if (!quo_is_null(by) && !is.null(.groups)) {
## abort("Can't supply both `.by` and `.groups`.")
## }
## UseMethod("summarise")
## }
## <bytecode: 0x1075fe160>
## <environment: namespace:dplyr>
summary(bd)
## Customer_ID Name Age Purchase_Date
## Min. :1008 Length:416 Min. :18.00 Length:416
## 1st Qu.:3334 Class :character 1st Qu.:29.00 Class :character
## Median :5388 Mode :character Median :42.00 Mode :character
## Mean :5484 Mean :42.85
## 3rd Qu.:7684 3rd Qu.:56.00
## Max. :9999 Max. :70.00
## Product Quantity Price Total_Sale
## Length:416 Min. :1.000 Min. : 150.0 Min. : 150
## Class :character 1st Qu.:1.000 1st Qu.: 300.0 1st Qu.: 300
## Mode :character Median :2.000 Median : 300.0 Median : 600
## Mean :1.935 Mean : 462.7 Mean : 887
## 3rd Qu.:3.000 3rd Qu.: 600.0 3rd Qu.:1000
## Max. :4.000 Max. :1000.0 Max. :3000
## Payment_Method
## Length:416
## Class :character
## Mode :character
##
##
##
# Find the number of unique Payment_Methods
# Create table for Payment_Method
# REPLACE *** WITH bd
# REPLACE ***** WITH the variable name Payment_Method,
# REMOVE THE # IN FRONT OF THE TWO LINES OF CODE
length(unique(bd$Payment_Method))
## [1] 12
table(bd$Payment_Method)
##
## Cahs Cahsh cash Cash Crdit credit Credit Creditt Debbit Debiit
## 6 6 13 124 3 14 119 3 11 2
## debit Debit
## 11 104
# mutate Payment_Method to be title case
# Find the number of unique Payment_Methods
# Create table for Payment_Method
## REPLACE *** WITH bd
## REPLACE ***** WITH the variable name
## REMOVE THE # FROM THOSE LINES
bd <- mutate(bd,Payment_Method = toTitleCase(bd$Payment_Method))
length(unique(bd$Payment_Method))
## [1] 9
table(bd$Payment_Method)
##
## Cahs Cahsh Cash Crdit Credit Creditt Debbit Debiit Debit
## 6 6 137 3 133 3 11 2 115
# Change Cahs & Cahsh to Cash
### NO CHANGES NECESSARY
bd <- mutate( bd ,Payment_Method = gsub('Cahsh', 'Cash', Payment_Method ))
bd <- mutate( bd ,Payment_Method = gsub('Cahs', 'Cash', Payment_Method ))
# Change Crdit & Creditt to Credit
### MAKE CHANGES TO THE LINES BELOW TO FIX THE MIS-SPELLINGS OF Credit
bd <- mutate( bd ,Payment_Method = gsub('Crdit', 'Credit', Payment_Method ))
bd <- mutate( bd ,Payment_Method = gsub('Creditt', 'Credit', Payment_Method ))
### Change Debbit & Debiit to Debit
### MAKE CHANGES TO THE LINES BELOW TO FIX THE MIS-SPELLINGS OF Debit
bd <- mutate( bd ,Payment_Method = gsub('Debbit', 'Debit', Payment_Method ))
bd <- mutate( bd ,Payment_Method = gsub('Debiit', 'Debit', Payment_Method ))
# Find the number of unique Payment_Methods
# Create table for Payment_Method
### NO CHANGES NECESSARY
length(unique(bd$Payment_Method))
## [1] 3
table(bd$Payment_Method)
##
## Cash Credit Debit
## 149 139 128
# Find the number of unique Product
# Create table for Product
### NO CHANGES NECESSARY
length(unique(bd$Product))
## [1] 13
table(bd$Product)
##
## headphones Headphones laptop Laptop Monitir monitor Monitor
## 20 68 12 86 7 10 67
## Monittor smartphone Smartphone Tablet Tablt Tblet
## 4 27 42 63 4 6
### NO CHANGES NECESSARY
# mutate Product to be title case
bd <- mutate(bd,Product = toTitleCase(bd$Product))
# Find the number of unique Product
# Create table for Product
length(unique(bd$Product))
## [1] 9
table(bd$Product)
##
## Headphones Laptop Monitir Monitor Monittor Smartphone Tablet
## 88 98 7 77 4 69 63
## Tablt Tblet
## 4 6
### Change Monitir & Monittor to Monitor
### NO CHANGES NECESSARY
bd <- mutate( bd ,Product = gsub('Monitir', 'Monitor', Product ))
bd <- mutate( bd ,Product = gsub('Monittor', 'Monitor', Product ))
### Change Tablt & Tblet to Tablet
### MAKE CHANGES TO THE LINES BELOW TO FIX THE MIS-SPELLINGS OF Tablet
bd <- mutate( bd ,Product = gsub('Tablt', 'Tablet', Product ))
bd <- mutate( bd ,Product = gsub('Tblet', 'Tablet', Product ))
# Find the number of unique Product
# Create table for Product
### NO CHANGES NECESSARY
length(unique(bd$Product))
## [1] 5
table(bd$Product)
##
## Headphones Laptop Monitor Smartphone Tablet
## 88 98 88 69 73
# Create summary statistics for Total_Sale
summary(bd$Total_Sale)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 150 300 600 887 1000 3000
# Create a histogram of Total_Sale
hist(bd$Total_Sale)
# Create a boxplot for Total_Sale
boxplot(bd$Total_Sale)
#### MAKE CHANGES NECESSARY TO CREATE SUMMARY STAT, HISTOGRAM AND BOXPLOT FOR QUANTITY
# Create summary statistics for Total_Sale
summary(bd$Quantity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.935 3.000 4.000
# Create a histogram of Total_Sale
hist(bd$Quantity)
# Create a boxplot for Total_Sale
boxplot(bd$Quantity)
# Total sales by Product
tapply(bd$Total_Sale, bd$Product,sum)
## Headphones Laptop Monitor Smartphone Tablet
## 49500 94850 77100 75200 72350
# Total quantity by Product
#### MAKE CHANGES NECESSARY TO CREATE SUMMARY of QUANTITY by product
tapply(bd$Quantity, bd$Product,sum)
## Headphones Laptop Monitor Smartphone Tablet
## 161 209 168 124 143
#MAKE CHANGES NECESSARY TO SUMMARIZE TOTAL SALES BY Payment_Method
# Total sales by Product
tapply(bd$Total_Sale, bd$Payment_Method,sum)
## Cash Credit Debit
## 126400 123950 118650
# Total quantity by Product
tapply(bd$Quantity, bd$Payment_Method,sum)
## Cash Credit Debit
## 279 272 254
### NO CHANGES NECESSARY
table(bd$Payment_Method,bd$Product)
##
## Headphones Laptop Monitor Smartphone Tablet
## Cash 33 44 26 21 25
## Credit 15 30 36 34 24
## Debit 40 24 26 14 24
### NO CHANGES NECESSARY
# Get the length of each string in the 'Purchase_Date' column
string_lengths <- nchar(bd$Purchase_Date)
# Extract the last two characters from each string
# The `start` argument is calculated as the total length minus 1
# The `stop` argument is the total length
year <- substr(bd$Purchase_Date, start = string_lengths - 1, stop = string_lengths)
# Add the new column to the data frame
bd$Year <- year
# Summarize Total Sales by Year
tapply(bd$Total_Sale,bd$Year, sum)
## 22 23 24 74 88
## 69050 164050 134600 1000 300
# remove rows with year of 74 or 88
### REPLACE THE ** WITH 74 AND 88, remove the # in the row
# Find all rows where the "Year" is "74" or "88'
rows_to_delete <- which(bd$Year == "74" | bd$Year =="88" )
# Keep all rows *except* the ones in the 'rows_to_delete' list
bd <- bd[-rows_to_delete, ]
# Summarize Total Sales by Year
tapply(bd$Total_Sale,bd$Year, sum)
## 22 23 24
## 69050 164050 134600
#Create a Bar Chart for total Sales by Year
### NO CHANGES NECESSARY
# Create the new data frame for the Bar Chart
SY <- data.frame(
Yr = unique(bd$Year),
TotalSales = tapply(bd$Total_Sale, bd$Year, sum)
)
# Create the bar chart using base R
barplot(
SY$TotalSales,
names.arg = SY$Year,
main = "Total Sales by Year",
xlab = "Year",
ylab = "Total Sales",
col = "skyblue"
)
# Table made with ChatGPT
summary(bd[, c("Total_Sale", "Quantity")])
## Total_Sale Quantity
## Min. : 150.0 Min. :1.000
## 1st Qu.: 300.0 1st Qu.:1.000
## Median : 600.0 Median :2.000
## Mean : 888.2 Mean :1.937
## 3rd Qu.:1000.0 3rd Qu.:3.000
## Max. :3000.0 Max. :4.000
# Barplot of total sales by product
barplot(tapply(bd$Total_Sale, bd$Product, sum, na.rm = TRUE),
main = "Total Sales by Product",
xlab = "Product",
ylab = "Total Sales",
col = "skyblue")
# Barplot of quantity by product
barplot(tapply(bd$Quantity, bd$Product, sum, na.rm = TRUE),
main = "Total Quantity Sold by Product",
xlab = "Product",
ylab = "Quantity",
col = "lightgreen")
# break
#generate_message <- function() {
# Create a string with a generalized message.
# Use paste() or paste0() to concatenate strings.
message <- paste0(
"---------------------------------------------------\n",
"🎉 Congratulations!\n\n",
"You have successfully completed the assignment. \n",
"Great work! \n\n",
"Keep up the fantastic effort.\n",
"---------------------------------------------------\n"
)
# Print the message
cat(message)
## ---------------------------------------------------
## 🎉 Congratulations!
##
## You have successfully completed the assignment.
## Great work!
##
## Keep up the fantastic effort.
## ---------------------------------------------------