## Package Loading:
require(lubridate)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
require(gapminder)
## Loading required package: gapminder
# Locating my working directory
setwd("~/Desktop/NTHU/Spring 2024/courses/PBA")
# Reading the data
library(readxl)
online_retail <- read_excel("Online Retail.xlsx")
# Verifying the dataset's dimensions by using head() and str() functions
head(online_retail)
## # A tibble: 6 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEAR… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEARTS… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLAG… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE … 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NES… 2 2010-12-01 08:26:00 7.65
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
str(online_retail)
## tibble [541,909 × 8] (S3: tbl_df/tbl/data.frame)
## $ InvoiceNo : chr [1:541909] "536365" "536365" "536365" "536365" ...
## $ StockCode : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: POSIXct[1:541909], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ UnitPrice : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
## $ Country : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
# Converting "InvoiceDate" to date class and check it turned to "Date" already
online_retail$InvoiceDate <- as.Date(online_retail$InvoiceDate)
# Filtering the transactions data from July to August 2011 by assigning "invoicedate_no" variable
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
invoicedate_no <- online_retail %>%
filter(online_retail$InvoiceDate >= as.Date('2011-07-01') &
online_retail$InvoiceDate <= as.Date('2011-08-31'))
# Verifying the number of unique in InvoiceNo
n_distinct(invoicedate_no$InvoiceNo, na.rm = TRUE)
## [1] 3664
3.1) Compute the mean of Quantity and UnitPrice
# Computing mean of "Quantity"
mean(invoicedate_no$Quantity)
## [1] 10.65901
# Next, computing mean of "UnitPrice"
mean(invoicedate_no$UnitPrice)
## [1] 4.308608
3.2) Determine the data types of each column
sapply(invoicedate_no,class)
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## "character" "character" "character" "numeric" "Date" "numeric"
## CustomerID Country
## "numeric" "character"
3.3) Compute the number of unique values in each column
sapply(invoicedate_no, n_distinct)
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 3664 2982 2946 287 52 447
## CustomerID Country
## 1541 28
4.1) Subset the data for transactions in the U.K., Netherlands, and Australia then perform the following analyses separately for each country
# Assigning a "subset_uk" variable for transactions in the U.K.
subset_uk <- filter(invoicedate_no, Country == "United Kingdom")
# Assigning a "subset_neth" variable for transactions in Netherlands
subset_neth <- filter(invoicedate_no, Country == "Netherlands")
# Assigning a "subset_aus" variable for transactions in Australia
subset_aus<- filter(invoicedate_no, Country == "Australia")
4.2) Report the average and standard deviation of UnitPrice for each country
# Calculating the average and SD of UnitPrice of the U.K.
cat("The average and SD of UnitPrice in the UK are", mean(subset_uk$UnitPrice),
"and",sd(subset_uk$UnitPrice))
## The average and SD of UnitPrice in the UK are 4.359879 and 99.41229
# Calculating the average and SD of UnitPrice of Netherlands
cat("The average and SD of UnitPrice in Netherlands are", mean(subset_neth$UnitPrice),
"and",sd(subset_neth$UnitPrice))
## The average and SD of UnitPrice in Netherlands are 2.862109 and 12.20678
# Calculating the average and SD of UnitPrice of Australia
cat("The average and SD of UnitPrice in Australia are", mean(subset_aus$UnitPrice),
"and",sd(subset_aus$UnitPrice))
## The average and SD of UnitPrice in Australia are 2.37929 and 2.032917
4.3) Report the number of unique transactions and customers in these countries
# Calculating the number of unique transactions in the UK
cat("The number of unique transactions and customers in the UK are",
n_distinct(subset_uk$InvoiceNo), "and", n_distinct(subset_uk$CustomerID))
## The number of unique transactions and customers in the UK are 3310 and 1376
# The number of unique transactions in Netherlands
cat("The number of unique transactions and customers in Netherlands are",
n_distinct(subset_neth$InvoiceNo), "and", n_distinct(subset_neth$CustomerID))
## The number of unique transactions and customers in Netherlands are 11 and 1
# The number of unique transactions in Australia
cat("The number of unique transactions and customers in Australia are",
n_distinct(subset_aus$InvoiceNo), "and", n_distinct(subset_aus$CustomerID))
## The number of unique transactions and customers in Australia are 11 and 3
# Identifying customers who made refund by filtering "C"
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
refund <- invoicedate_no %>%
filter(str_detect(InvoiceNo, "C"))
# Counting the number of customers who made refunds and storing IDs in "cust_refund"
cat("There are", n_distinct(refund), "customers who made refunds")
## There are 1351 customers who made refunds
cust_refund <- refund["CustomerID"]
cust_refund
## # A tibble: 1,353 × 1
## CustomerID
## <dbl>
## 1 16746
## 2 16746
## 3 16746
## 4 17338
## 5 17338
## 6 17338
## 7 17888
## 8 17888
## 9 17888
## 10 17888
## # ℹ 1,343 more rows
# Assigning customer with no ID
total_sales <- invoicedate_no|>
filter(is.na(CustomerID))
# Create a variable called Sales by multiplying Quantity and UnitPrice.
total_NA_sales <- total_sales |>
mutate("Sales" = Quantity*UnitPrice)
dplyr::glimpse(total_NA_sales)
## Rows: 19,638
## Columns: 9
## $ InvoiceNo <chr> "558663", "558680", "558680", "558680", "558680", "558680"…
## $ StockCode <chr> "23574", "20711", "21116", "21166", "21175", "21206", "213…
## $ Description <chr> NA, "JUMBO BAG TOYS", "OWL DOORSTOP", "COOK WITH WINE META…
## $ Quantity <dbl> 100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1,…
## $ InvoiceDate <date> 2011-07-01, 2011-07-01, 2011-07-01, 2011-07-01, 2011-07-0…
## $ UnitPrice <dbl> 0.00, 4.13, 8.29, 4.13, 4.96, 3.29, 2.46, 8.29, 1.63, 4.96…
## $ CustomerID <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ Country <chr> "United Kingdom", "United Kingdom", "United Kingdom", "Uni…
## $ Sales <dbl> 0.00, 4.13, 8.29, 4.13, 4.96, 3.29, 2.46, 8.29, 1.63, 4.96…
#Calculate the total sales amount for transactions where CustomerID is missing.
sum(total_NA_sales$Sales)
## [1] 173374.1
# Showing the gapminder dataset
require(gapminder)
dplyr::glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
require(ggplot2)
ggplot(data = gapminder,
mapping = aes(x = continent,
y = lifeExp)) +
labs(y = "Life expectancy at birth by years",
title = "Life Expectancy on Five Continents",
caption = "Source: Gapminder") +
geom_boxplot()
Answer: “Oceania” continent has the highest median of life expectancy.
Answer: We could find the median from the horizontal line in the boxplot