## Package Loading: 
require(lubridate)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
require(gapminder)
## Loading required package: gapminder

PART I

Question 1:

Locate the directory path where the dataset is stored, load it into R, and take a look at the data

# Locating my working directory
setwd("~/Desktop/NTHU/Spring 2024/courses/PBA")

# Reading the data
library(readxl)
online_retail <- read_excel("Online Retail.xlsx")

# Verifying the dataset's dimensions by using head() and str() functions
head(online_retail)
## # A tibble: 6 × 8
##   InvoiceNo StockCode Description         Quantity InvoiceDate         UnitPrice
##   <chr>     <chr>     <chr>                  <dbl> <dttm>                  <dbl>
## 1 536365    85123A    WHITE HANGING HEAR…        6 2010-12-01 08:26:00      2.55
## 2 536365    71053     WHITE METAL LANTERN        6 2010-12-01 08:26:00      3.39
## 3 536365    84406B    CREAM CUPID HEARTS…        8 2010-12-01 08:26:00      2.75
## 4 536365    84029G    KNITTED UNION FLAG…        6 2010-12-01 08:26:00      3.39
## 5 536365    84029E    RED WOOLLY HOTTIE …        6 2010-12-01 08:26:00      3.39
## 6 536365    22752     SET 7 BABUSHKA NES…        2 2010-12-01 08:26:00      7.65
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
str(online_retail)
## tibble [541,909 × 8] (S3: tbl_df/tbl/data.frame)
##  $ InvoiceNo  : chr [1:541909] "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: POSIXct[1:541909], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
##  $ UnitPrice  : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
##  $ Country    : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...

Question 2:

Convert the InvoiceDate to date class and filter the data to include only transactions from July to August 2011. Use this filtered dataset for all subsequent questions in Part I.

# Converting "InvoiceDate" to date class and check it turned to "Date" already
online_retail$InvoiceDate <- as.Date(online_retail$InvoiceDate)

# Filtering the transactions data from July to August 2011 by assigning "invoicedate_no" variable
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
invoicedate_no <- online_retail %>% 
  filter(online_retail$InvoiceDate >= as.Date('2011-07-01') &
           online_retail$InvoiceDate <= as.Date('2011-08-31'))

# Verifying the number of unique in InvoiceNo
n_distinct(invoicedate_no$InvoiceNo, na.rm = TRUE)
## [1] 3664

Question 3:

Perform basic data analysis on the dataset by completing the following tasks:

3.1) Compute the mean of Quantity and UnitPrice

# Computing mean of "Quantity"
mean(invoicedate_no$Quantity)
## [1] 10.65901
# Next, computing mean of "UnitPrice"
mean(invoicedate_no$UnitPrice)
## [1] 4.308608

3.2) Determine the data types of each column

sapply(invoicedate_no,class)
##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
## "character" "character" "character"   "numeric"      "Date"   "numeric" 
##  CustomerID     Country 
##   "numeric" "character"

3.3) Compute the number of unique values in each column

sapply(invoicedate_no, n_distinct)
##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
##        3664        2982        2946         287          52         447 
##  CustomerID     Country 
##        1541          28

Question 4:

Conduct a country-specific analysis on the dataset.

4.1) Subset the data for transactions in the U.K., Netherlands, and Australia then perform the following analyses separately for each country

# Assigning a "subset_uk" variable for transactions in the U.K.
subset_uk <- filter(invoicedate_no, Country == "United Kingdom")

# Assigning a "subset_neth" variable for transactions in Netherlands
subset_neth <- filter(invoicedate_no, Country == "Netherlands")

# Assigning a "subset_aus" variable for transactions in Australia
subset_aus<- filter(invoicedate_no, Country == "Australia")

4.2) Report the average and standard deviation of UnitPrice for each country

# Calculating the average and SD of UnitPrice of the U.K.
cat("The average and SD of UnitPrice in the UK are", mean(subset_uk$UnitPrice),
    "and",sd(subset_uk$UnitPrice))
## The average and SD of UnitPrice in the UK are 4.359879 and 99.41229
# Calculating the average and SD of UnitPrice of Netherlands
cat("The average and SD of UnitPrice in Netherlands are", mean(subset_neth$UnitPrice),
    "and",sd(subset_neth$UnitPrice))
## The average and SD of UnitPrice in Netherlands are 2.862109 and 12.20678
# Calculating the average and SD of UnitPrice of Australia
cat("The average and SD of UnitPrice in Australia are", mean(subset_aus$UnitPrice),
    "and",sd(subset_aus$UnitPrice))
## The average and SD of UnitPrice in Australia are 2.37929 and 2.032917

4.3) Report the number of unique transactions and customers in these countries

# Calculating the number of unique transactions in the UK
cat("The number of unique transactions and customers in the UK are",
    n_distinct(subset_uk$InvoiceNo), "and", n_distinct(subset_uk$CustomerID))
## The number of unique transactions and customers in the UK are 3310 and 1376
# The number of unique transactions in Netherlands
cat("The number of unique transactions and customers in Netherlands are",
    n_distinct(subset_neth$InvoiceNo), "and", n_distinct(subset_neth$CustomerID))
## The number of unique transactions and customers in Netherlands are 11 and 1
# The number of unique transactions in Australia
cat("The number of unique transactions and customers in Australia are",
    n_distinct(subset_aus$InvoiceNo), "and", n_distinct(subset_aus$CustomerID))
## The number of unique transactions and customers in Australia are 11 and 3

Question 5:

Identify and count customers who made a refund and store their IDs in a vector called cust_refund

# Identifying customers who made refund by filtering "C"
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.1
## ✔ readr   2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
refund <- invoicedate_no %>%
  filter(str_detect(InvoiceNo, "C"))

# Counting the number of customers who made refunds and storing IDs in "cust_refund"
cat("There are", n_distinct(refund), "customers who made refunds")
## There are 1351 customers who made refunds
cust_refund <- refund["CustomerID"]
cust_refund
## # A tibble: 1,353 × 1
##    CustomerID
##         <dbl>
##  1      16746
##  2      16746
##  3      16746
##  4      17338
##  5      17338
##  6      17338
##  7      17888
##  8      17888
##  9      17888
## 10      17888
## # ℹ 1,343 more rows

Question 6:

Create a variable called Sales by multiplying Quantity and UnitPrice. And calculate the total sales amount for transactions where CustomerID is missing.

# Assigning customer with no ID
total_sales <- invoicedate_no|> 
  filter(is.na(CustomerID))

# Create a variable called Sales by multiplying Quantity and UnitPrice.
total_NA_sales <- total_sales |>
  mutate("Sales" = Quantity*UnitPrice)

dplyr::glimpse(total_NA_sales)
## Rows: 19,638
## Columns: 9
## $ InvoiceNo   <chr> "558663", "558680", "558680", "558680", "558680", "558680"…
## $ StockCode   <chr> "23574", "20711", "21116", "21166", "21175", "21206", "213…
## $ Description <chr> NA, "JUMBO BAG TOYS", "OWL DOORSTOP", "COOK WITH WINE META…
## $ Quantity    <dbl> 100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1,…
## $ InvoiceDate <date> 2011-07-01, 2011-07-01, 2011-07-01, 2011-07-01, 2011-07-0…
## $ UnitPrice   <dbl> 0.00, 4.13, 8.29, 4.13, 4.96, 3.29, 2.46, 8.29, 1.63, 4.96…
## $ CustomerID  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ Country     <chr> "United Kingdom", "United Kingdom", "United Kingdom", "Uni…
## $ Sales       <dbl> 0.00, 4.13, 8.29, 4.13, 4.96, 3.29, 2.46, 8.29, 1.63, 4.96…
#Calculate the total sales amount for transactions where CustomerID is missing.
sum(total_NA_sales$Sales)
## [1] 173374.1

PART II

Question 7:

Ensure that the gapminder and tidyverse packages are loaded properly. Use the glimpse() function to display basic details about the gapminder dataset. In the main text, tell us how many rows and columns there are in the data set and which of the variables are factors.

# Showing the gapminder dataset
require(gapminder)
dplyr::glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

Question 8:

Let’s investigate how life expectancy varies across the continents. Using ggplot

require(ggplot2)
ggplot(data = gapminder,
       mapping = aes(x = continent,
                     y = lifeExp)) +
  labs(y = "Life expectancy at birth by years",
       title = "Life Expectancy on Five Continents",
       caption = "Source: Gapminder") +
  geom_boxplot()

Question 9:

Which continent has the highest median life expectancy?

Answer: “Oceania” continent has the highest median of life expectancy.

Which part of the boxplot can we determine this from?

Answer: We could find the median from the horizontal line in the boxplot