Setup

# Core packages
library(tidyverse)
library(lubridate)
library(stringr)  # for str_starts

# Part II data
library(gapminder)

# Helpful options
options(dplyr.summarise.inform = FALSE)

PART I — Online Retail (UCI)

Dataset note. From the assignment handout: InvoiceNo that starts with ‘C’ indicates a cancellation/refund. InvoiceDate is stored in mm/dd/yy format. We will use these facts below.

Question 1 (2 pts) — Load and inspect

# Adjust the file name/path if needed
stopifnot(file.exists("online_retail.csv"))
retail <- read_csv("online_retail.csv", show_col_types = FALSE)  # <-- uses your uploaded file

# Quick look
# Clean/normalize some fields
retail <- retail %>%
  mutate(
    InvoiceNo   = as.character(InvoiceNo),
    Country     = trimws(as.character(Country)),
    Description = as.character(Description)
  )

head(retail)
## # A tibble: 6 × 8
##   InvoiceNo StockCode Description      Quantity InvoiceDate UnitPrice CustomerID
##   <chr>     <chr>     <chr>               <dbl> <chr>           <dbl>      <dbl>
## 1 536365    85123A    WHITE HANGING H…        6 12/1/10 8:…      2.55      17850
## 2 536365    71053     WHITE METAL LAN…        6 12/1/10 8:…      3.39      17850
## 3 536365    84406B    CREAM CUPID HEA…        8 12/1/10 8:…      2.75      17850
## 4 536365    84029G    KNITTED UNION F…        6 12/1/10 8:…      3.39      17850
## 5 536365    84029E    RED WOOLLY HOTT…        6 12/1/10 8:…      3.39      17850
## 6 536365    22752     SET 7 BABUSHKA …        2 12/1/10 8:…      7.65      17850
## # ℹ 1 more variable: Country <chr>
str(retail)
## tibble [541,909 × 8] (S3: tbl_df/tbl/data.frame)
##  $ InvoiceNo  : chr [1:541909] "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: chr [1:541909] "12/1/10 8:26" "12/1/10 8:26" "12/1/10 8:26" "12/1/10 8:26" ...
##  $ UnitPrice  : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
##  $ Country    : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...

Write-up. Loaded the CSV and displayed the first rows and structure to verify dimensions and column types.

Question 2 (3 pts) — Convert date & filter July–Aug 2011

# Convert InvoiceDate to Date class (mm/dd/yy)
retail <- retail %>%
  mutate(InvoiceDate = mdy_hm(InvoiceDate))

# Filter to 2011-07-01 through 2011-08-31
retail_f <- retail %>%
  filter(InvoiceDate >= as.Date("2011-07-01"),
         InvoiceDate <= as.Date("2011-08-31"))

# Verify number of unique invoices
n_unique_invoices <- n_distinct(retail_f$InvoiceNo)
n_unique_invoices
## [1] 3547

Write-up. Converted InvoiceDate using lubridate::mdy(). Filtered between 2011-07-01 and 2011-08-31 (inclusive). The filtered data has 3547 unique InvoiceNo values.

Note: Use this retail_f dataset for all remaining Part I questions.

Question 3 (6 pts) — Basic analysis

# 3.1 Means of Quantity and UnitPrice
means_tbl <- tibble(
  mean_Quantity = mean(retail_f$Quantity, na.rm = TRUE),
  mean_UnitPrice = mean(retail_f$UnitPrice, na.rm = TRUE)
)

# 3.2 Data types of each column
types_tbl <- sapply(retail_f, class)

# 3.3 Number of unique values in each column
unique_counts_tbl <- sapply(retail_f, n_distinct)

list(
  means = means_tbl,
  types = types_tbl,
  unique_counts = unique_counts_tbl
)
## $means
## # A tibble: 1 × 2
##   mean_Quantity mean_UnitPrice
##           <dbl>          <dbl>
## 1          10.8           4.30
## 
## $types
## $types$InvoiceNo
## [1] "character"
## 
## $types$StockCode
## [1] "character"
## 
## $types$Description
## [1] "character"
## 
## $types$Quantity
## [1] "numeric"
## 
## $types$InvoiceDate
## [1] "POSIXct" "POSIXt" 
## 
## $types$UnitPrice
## [1] "numeric"
## 
## $types$CustomerID
## [1] "numeric"
## 
## $types$Country
## [1] "character"
## 
## 
## $unique_counts
##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
##        3547        2977        2942         269        3245         435 
##  CustomerID     Country 
##        1525          28

Write-up. Reported: (1) means of Quantity and UnitPrice; (2) classes for all columns; (3) number of unique values per column.

Question 4 (6 pts) — Country-specific analysis

Tasks: Subset to United Kingdom, Netherlands, and Australia; for each, compute average & SD of UnitPrice, number of unique transactions and customers.

countries <- c("United Kingdom", "Netherlands", "Australia")

country_summary <- retail_f %>%
  filter(Country %in% countries) %>%
  group_by(Country) %>%
  summarise(
    mean_UnitPrice = mean(UnitPrice, na.rm = TRUE),
    sd_UnitPrice   = sd(UnitPrice, na.rm = TRUE),
    unique_txn     = n_distinct(InvoiceNo),
    unique_cust    = n_distinct(CustomerID),
    .groups = "drop"
  )

country_summary
## # A tibble: 3 × 5
##   Country        mean_UnitPrice sd_UnitPrice unique_txn unique_cust
##   <chr>                   <dbl>        <dbl>      <int>       <int>
## 1 Australia                2.38         2.03         11           3
## 2 Netherlands              2.88        12.3          10           1
## 3 United Kingdom           4.35       100.         3205        1361

Write-up. The table above reports the required statistics for each specified country.

Question 5 (5 pts) — Customers who made a refund

Logic: A refund is indicated by InvoiceNo starting with the letter 'C'.

refund_rows <- retail_f %>%
  filter(str_starts(InvoiceNo, "C"))

cust_refund <- refund_rows %>%
  filter(!is.na(CustomerID)) %>%
  distinct(CustomerID) %>%
  pull(CustomerID)

num_cust_refund <- length(cust_refund)

num_cust_refund
## [1] 361
cust_refund
##   [1] 16746 17338 17888 12901 16422 12709 16498 16700 18075 16570 14415 15033
##  [13] 15023 17841 13767 17063 13763 12619 14952 14338 18171 13115 13826 16729
##  [25] 13089 13534 15518 17017 14034 13021 16571 15985 17389 12494 13338 16393
##  [37] 17511 13420 17469 12507 12626 12379 12980 12472 17861 13510 17719 16581
##  [49] 17837 13984 18072 16550 13408 14680 13078 14905 15615 13769 13047 14267
##  [61] 15078 14016 14808 17126 16283 18024 13627 14690 13184 14796 17706 18142
##  [73] 14389 13267 16033 13745 12717 17337 15719 18193 17381 13013 13599 15416
##  [85] 17069 16701 13743 14483 16998 15493 16156 14088 14426 14562 17696 13815
##  [97] 14527 16656 14258 16779 17049 16782 17440 15769 17652 17068 12451 14572
## [109] 14659 16398 14159 15527 14390 17448 12930 15125 18087 13564 16717 17809
## [121] 15369 13154 17603 12705 16401 16488 12415 12971 12921 16145 17738 17651
## [133] 14911 12468 17065 14437 13941 15249 16271 17866 15802 18102 17581 16255
## [145] 14606 13217 15298 16013 17811 13488 14410 13668 12830 17677 16859 15152
## [157] 12722 15260 17593 18198 14875 13468 16875 13590 14733 14400 15005 14056
## [169] 17612 13098 16705 13050 14916 15187 15708 13802 13097 13230 12576 13520
## [181] 16161 15544 17175 16029 12431 17096 14329 16133 17422 13327 17675 15106
## [193] 12471 17693 15289 14534 14243 17649 15781 18268 17671 16558 13109 18223
## [205] 15827 17054 14194 15845 16572 15746 15370 13631 13136 14414 14145 15532
## [217] 14738 16774 15120 15602 15311 16316 15810 16628 12457 13505 16523 14857
## [229] 13081 15764 15189 12757 16180 17053 13106 12748 15625 16525 15804 12476
## [241] 12584 14621 13813 16743 15640 15228 13186 15299 15502 16951 13588 15157
## [253] 12406 15581 14112 13709 13694 16672 12929 16609 15194 15201 15628 18282
## [265] 14903 15110 17994 18172 17602 15671 13384 13388 14156 12693 12749 17429
## [277] 13319 17531 14305 17579 12610 12885 13722 16928 12656 17613 17644 16150
## [289] 17611 15660 16292 12409 13116 17988 15048 15342 15205 17048 14175 16755
## [301] 17949 12697 18248 14646 14291 17302 15251 13925 18178 14937 17340 17442
## [313] 17243 15706 12528 12362 13048 18241 13090 16330 17097 16184 12530 12428
## [325] 14407 12456 12753 12477 13576 14301 15270 15469 14189 13209 15301 16996
## [337] 12994 16302 13517 16607 16841 12853 16536 12569 13148 16493 12955 17676
## [349] 14732 15838 13266 18204 16722 15315 14188 13159 16768 17828 14096 17173
## [361] 17416

Write-up. Identified all refund transactions and returned the count of unique customers who made a refund. Also stored their IDs in vector cust_refund as required.

Question 6 (5 pts) — Missing CustomerID transactions

Create Sales = Quantity * UnitPrice and compute total sales where CustomerID is NA.

retail_f <- retail_f %>%
  mutate(Sales = Quantity * UnitPrice)

total_sales_missing_id <- retail_f %>%
  filter(is.na(CustomerID)) %>%
  summarise(total_sales = sum(Sales, na.rm = TRUE)) %>%
  pull(total_sales)

total_sales_missing_id
## [1] 168534.8

Write-up. Calculated the total sales volume attributable to transactions with missing CustomerID values.

PART II — Gapminder & ggplot

Question 7 (5 pts) — Glimpse & factors

glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

Write-up. gapminder has 1704 rows and 6 columns. The variables stored as factors are country and continent.

Question 8 (10 pts) — Boxplots of life expectancy by continent

ggplot(gapminder, aes(x = continent, y = lifeExp)) +
  geom_boxplot() +
  labs(
    title = "Life Expectancy by Continent",
    x = "Continent",
    y = "Life expectancy at birth (years)",
    caption = "Source: Gapminder"
  )

Write-up. Recreated the requested boxplot showing the distribution of life expectancy for each continent with clear labels.

Question 9 (5 pts) — Highest median life expectancy

Answer. From the boxplot above, Europe has the highest median life expectancy.
We determine this by comparing the thick horizontal line inside each box (the median). The continent whose median line is highest on the y-axis has the highest median life expectancy.