# Core packages
library(tidyverse)
library(lubridate)
library(stringr) # for str_starts
# Part II data
library(gapminder)
# Helpful options
options(dplyr.summarise.inform = FALSE)
Dataset note. From the assignment handout:
InvoiceNo that starts with ‘C’ indicates a
cancellation/refund. InvoiceDate is stored
in mm/dd/yy format. We will use these facts below.
# Adjust the file name/path if needed
stopifnot(file.exists("online_retail.csv"))
retail <- read_csv("online_retail.csv", show_col_types = FALSE) # <-- uses your uploaded file
# Quick look
# Clean/normalize some fields
retail <- retail %>%
mutate(
InvoiceNo = as.character(InvoiceNo),
Country = trimws(as.character(Country)),
Description = as.character(Description)
)
head(retail)
## # A tibble: 6 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID
## <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 536365 85123A WHITE HANGING H… 6 12/1/10 8:… 2.55 17850
## 2 536365 71053 WHITE METAL LAN… 6 12/1/10 8:… 3.39 17850
## 3 536365 84406B CREAM CUPID HEA… 8 12/1/10 8:… 2.75 17850
## 4 536365 84029G KNITTED UNION F… 6 12/1/10 8:… 3.39 17850
## 5 536365 84029E RED WOOLLY HOTT… 6 12/1/10 8:… 3.39 17850
## 6 536365 22752 SET 7 BABUSHKA … 2 12/1/10 8:… 7.65 17850
## # ℹ 1 more variable: Country <chr>
str(retail)
## tibble [541,909 × 8] (S3: tbl_df/tbl/data.frame)
## $ InvoiceNo : chr [1:541909] "536365" "536365" "536365" "536365" ...
## $ StockCode : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: chr [1:541909] "12/1/10 8:26" "12/1/10 8:26" "12/1/10 8:26" "12/1/10 8:26" ...
## $ UnitPrice : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
## $ Country : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
Write-up. Loaded the CSV and displayed the first rows and structure to verify dimensions and column types.
# Convert InvoiceDate to Date class (mm/dd/yy)
retail <- retail %>%
mutate(InvoiceDate = mdy_hm(InvoiceDate))
# Filter to 2011-07-01 through 2011-08-31
retail_f <- retail %>%
filter(InvoiceDate >= as.Date("2011-07-01"),
InvoiceDate <= as.Date("2011-08-31"))
# Verify number of unique invoices
n_unique_invoices <- n_distinct(retail_f$InvoiceNo)
n_unique_invoices
## [1] 3547
Write-up. Converted InvoiceDate using
lubridate::mdy(). Filtered between
2011-07-01 and 2011-08-31 (inclusive).
The filtered data has 3547 unique
InvoiceNo values.
Note: Use this
retail_fdataset for all remaining Part I questions.
# 3.1 Means of Quantity and UnitPrice
means_tbl <- tibble(
mean_Quantity = mean(retail_f$Quantity, na.rm = TRUE),
mean_UnitPrice = mean(retail_f$UnitPrice, na.rm = TRUE)
)
# 3.2 Data types of each column
types_tbl <- sapply(retail_f, class)
# 3.3 Number of unique values in each column
unique_counts_tbl <- sapply(retail_f, n_distinct)
list(
means = means_tbl,
types = types_tbl,
unique_counts = unique_counts_tbl
)
## $means
## # A tibble: 1 × 2
## mean_Quantity mean_UnitPrice
## <dbl> <dbl>
## 1 10.8 4.30
##
## $types
## $types$InvoiceNo
## [1] "character"
##
## $types$StockCode
## [1] "character"
##
## $types$Description
## [1] "character"
##
## $types$Quantity
## [1] "numeric"
##
## $types$InvoiceDate
## [1] "POSIXct" "POSIXt"
##
## $types$UnitPrice
## [1] "numeric"
##
## $types$CustomerID
## [1] "numeric"
##
## $types$Country
## [1] "character"
##
##
## $unique_counts
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 3547 2977 2942 269 3245 435
## CustomerID Country
## 1525 28
Write-up. Reported: (1) means of
Quantity and UnitPrice; (2) classes for all
columns; (3) number of unique values per column.
Tasks: Subset to United Kingdom,
Netherlands, and Australia; for each,
compute average & SD of UnitPrice, number of unique
transactions and customers.
countries <- c("United Kingdom", "Netherlands", "Australia")
country_summary <- retail_f %>%
filter(Country %in% countries) %>%
group_by(Country) %>%
summarise(
mean_UnitPrice = mean(UnitPrice, na.rm = TRUE),
sd_UnitPrice = sd(UnitPrice, na.rm = TRUE),
unique_txn = n_distinct(InvoiceNo),
unique_cust = n_distinct(CustomerID),
.groups = "drop"
)
country_summary
## # A tibble: 3 × 5
## Country mean_UnitPrice sd_UnitPrice unique_txn unique_cust
## <chr> <dbl> <dbl> <int> <int>
## 1 Australia 2.38 2.03 11 3
## 2 Netherlands 2.88 12.3 10 1
## 3 United Kingdom 4.35 100. 3205 1361
Write-up. The table above reports the required statistics for each specified country.
Logic: A refund is indicated by
InvoiceNo starting with the letter 'C'.
refund_rows <- retail_f %>%
filter(str_starts(InvoiceNo, "C"))
cust_refund <- refund_rows %>%
filter(!is.na(CustomerID)) %>%
distinct(CustomerID) %>%
pull(CustomerID)
num_cust_refund <- length(cust_refund)
num_cust_refund
## [1] 361
cust_refund
## [1] 16746 17338 17888 12901 16422 12709 16498 16700 18075 16570 14415 15033
## [13] 15023 17841 13767 17063 13763 12619 14952 14338 18171 13115 13826 16729
## [25] 13089 13534 15518 17017 14034 13021 16571 15985 17389 12494 13338 16393
## [37] 17511 13420 17469 12507 12626 12379 12980 12472 17861 13510 17719 16581
## [49] 17837 13984 18072 16550 13408 14680 13078 14905 15615 13769 13047 14267
## [61] 15078 14016 14808 17126 16283 18024 13627 14690 13184 14796 17706 18142
## [73] 14389 13267 16033 13745 12717 17337 15719 18193 17381 13013 13599 15416
## [85] 17069 16701 13743 14483 16998 15493 16156 14088 14426 14562 17696 13815
## [97] 14527 16656 14258 16779 17049 16782 17440 15769 17652 17068 12451 14572
## [109] 14659 16398 14159 15527 14390 17448 12930 15125 18087 13564 16717 17809
## [121] 15369 13154 17603 12705 16401 16488 12415 12971 12921 16145 17738 17651
## [133] 14911 12468 17065 14437 13941 15249 16271 17866 15802 18102 17581 16255
## [145] 14606 13217 15298 16013 17811 13488 14410 13668 12830 17677 16859 15152
## [157] 12722 15260 17593 18198 14875 13468 16875 13590 14733 14400 15005 14056
## [169] 17612 13098 16705 13050 14916 15187 15708 13802 13097 13230 12576 13520
## [181] 16161 15544 17175 16029 12431 17096 14329 16133 17422 13327 17675 15106
## [193] 12471 17693 15289 14534 14243 17649 15781 18268 17671 16558 13109 18223
## [205] 15827 17054 14194 15845 16572 15746 15370 13631 13136 14414 14145 15532
## [217] 14738 16774 15120 15602 15311 16316 15810 16628 12457 13505 16523 14857
## [229] 13081 15764 15189 12757 16180 17053 13106 12748 15625 16525 15804 12476
## [241] 12584 14621 13813 16743 15640 15228 13186 15299 15502 16951 13588 15157
## [253] 12406 15581 14112 13709 13694 16672 12929 16609 15194 15201 15628 18282
## [265] 14903 15110 17994 18172 17602 15671 13384 13388 14156 12693 12749 17429
## [277] 13319 17531 14305 17579 12610 12885 13722 16928 12656 17613 17644 16150
## [289] 17611 15660 16292 12409 13116 17988 15048 15342 15205 17048 14175 16755
## [301] 17949 12697 18248 14646 14291 17302 15251 13925 18178 14937 17340 17442
## [313] 17243 15706 12528 12362 13048 18241 13090 16330 17097 16184 12530 12428
## [325] 14407 12456 12753 12477 13576 14301 15270 15469 14189 13209 15301 16996
## [337] 12994 16302 13517 16607 16841 12853 16536 12569 13148 16493 12955 17676
## [349] 14732 15838 13266 18204 16722 15315 14188 13159 16768 17828 14096 17173
## [361] 17416
Write-up. Identified all refund transactions and
returned the count of unique customers who made a
refund. Also stored their IDs in vector cust_refund as
required.
Create Sales = Quantity * UnitPrice and compute
total sales where CustomerID is
NA.
retail_f <- retail_f %>%
mutate(Sales = Quantity * UnitPrice)
total_sales_missing_id <- retail_f %>%
filter(is.na(CustomerID)) %>%
summarise(total_sales = sum(Sales, na.rm = TRUE)) %>%
pull(total_sales)
total_sales_missing_id
## [1] 168534.8
Write-up. Calculated the total sales volume
attributable to transactions with missing CustomerID
values.
glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
Write-up. gapminder has 1704
rows and 6 columns. The variables stored as
factors are country and
continent.
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
geom_boxplot() +
labs(
title = "Life Expectancy by Continent",
x = "Continent",
y = "Life expectancy at birth (years)",
caption = "Source: Gapminder"
)
Write-up. Recreated the requested boxplot showing the distribution of life expectancy for each continent with clear labels.
Answer. From the boxplot above,
Europe has the highest median life
expectancy.
We determine this by comparing the thick horizontal line inside
each box (the median). The continent whose median line is
highest on the y-axis has the highest median life expectancy.