MIDTERM EXAM
PATRICIA HUTAJULU
Mini Project 1: E-Commerce
Section A – Data Collection
Objective
To retrieve and combine data from multiple sources using R. The dataset consists of 5 different files in CSV, XLSX, JSON, TXT, and XML formats, which are automatically read using looping.
Load Library
library(rvest)
library(httr)
library(dplyr)
library(tidyr)
library(stringr)
library(knitr)
library(ggplot2)
library(plotly)
library(scales)
library(tibble)
library(htmltools)
library(readr)
library(readxl)
library(jsonlite)
library(xml2)
# Beautiful pastel table — zero quote conflicts
rt <- function(df, page_size = 5, ...) {
hcols <- c("#F4A8C0","#93B4FF","#99D6B3","#FFE57A","#C9A3FF","#FFBA99","#80D0D0")
nc <- ncol(df)
n <- nrow(df)
cidx <- ((seq_len(nc) - 1L) %% length(hcols)) + 1L
make_th <- function(nm, bg) {
paste0("<th style=\"background:", bg,
";color:#2D3748;font-weight:700;padding:11px 16px;",
"text-align:center;font-family:Poppins,sans-serif;font-size:13px;",
"border-bottom:3px solid rgba(0,0,0,0.08)\">", nm, "</th>")
}
th <- paste(mapply(make_th, names(df), hcols[cidx]), collapse = "")
make_td <- function(v) {
if (is.na(v) || v == "NA") v <- "<em style=\"color:#bbb\">—</em>"
paste0("<td style=\"padding:9px 16px;text-align:center;font-size:13px;",
"border-bottom:1px solid #EEF0F8;color:#2D3748;",
"font-family:Poppins,sans-serif\">", v, "</td>")
}
make_tr <- function(i) {
bg <- if (i %% 2L == 0L) "#F8F9FF" else "#FFFFFF"
tds <- paste(sapply(seq_len(nc), function(j)
make_td(as.character(df[i, j, drop = TRUE]))), collapse = "")
paste0("<tr style=\"background:", bg, "\">", tds, "</tr>")
}
body <- paste(sapply(seq_len(min(n, 300L)), make_tr), collapse = "\n")
htmltools::HTML(paste0(
"<div style=\"overflow-x:auto;border-radius:16px;",
"box-shadow:0 4px 20px rgba(100,120,200,0.12);margin:20px 0\">",
"<table style=\"width:100%;border-collapse:collapse;",
"font-family:Poppins,sans-serif;background:#fff\">",
"<thead><tr>", th, "</tr></thead><tbody>", body, "</tbody></table></div>"
))
}Reading 5 Files Using Looping
Looping Requirement
All files must be read automatically using a for loop. For each file, display the number of rows, number of columns, and column names.
file_list <- list(
list(name = "ecommerce.csv", type = "csv"),
list(name = "ecommerce.xlsx", type = "xlsx"),
list(name = "ecommerce.json", type = "json"),
list(name = "ecommerce.txt", type = "txt"),
list(name = "ecommerce.xml", type = "xml")
)
all_dfs <- list()
for (i in seq_along(file_list)) {
fname <- file_list[[i]]$name
ftype <- file_list[[i]]$type
message("Reading: ", fname)
df <- tryCatch({
if (ftype == "csv") {
read_csv(fname, show_col_types = FALSE)
} else if (ftype == "xlsx") {
read_excel(fname)
} else if (ftype == "json") {
as_tibble(fromJSON(fname))
} else if (ftype == "txt") {
read_delim(fname, delim = "|", show_col_types = FALSE)
} else if (ftype == "xml") {
xml_doc <- read_xml(fname)
records <- xml_find_all(xml_doc, "//Record")
xml_data <- lapply(records, function(rec) {
nodes <- xml_children(rec)
setNames(as.list(xml_text(nodes)), xml_name(nodes))
})
bind_rows(lapply(xml_data, as_tibble))
}
}, error = function(e) {
message(" ERROR reading: ", fname, " | ", e$message)
return(NULL)
})
if (!is.null(df)) {
df$source_file <- fname
all_dfs[[fname]] <- df
message(" Success: ", fname, " | Rows: ", nrow(df), " | Columns: ", ncol(df))
}
}
summary_read <- tibble(
File = names(all_dfs),
Rows = sapply(all_dfs, nrow),
Columns = sapply(all_dfs, function(df) ncol(df) - 1),
Status = " Success"
)
rt(summary_read)| File | Rows | Columns | Status |
|---|---|---|---|
| ecommerce.csv | 2000 | 22 | Success |
| ecommerce.xlsx | 2000 | 22 | Success |
| ecommerce.json | 2000 | 22 | Success |
| ecommerce.txt | 2000 | 22 | Success |
| ecommerce.xml | 2000 | 22 | Success |
Check Column Compatibility (IF/IF-ELSE)
Validation Logic
Each file is compared based on its column structure with the reference file (CSV). If identical, the status is "Ready to merge", otherwise "Need adjustment".
col_sets <- lapply(all_dfs, function(df) sort(setdiff(names(df), "source_file")))
if (length(col_sets) == 0) stop(" No files were successfully read.")
ref_cols <- col_sets[[1]]
compat_df <- tibble(
File = names(col_sets),
Status = sapply(names(col_sets), function(fn) {
if (identical(col_sets[[fn]], ref_cols)) " Ready to merge"
else "️ Need adjustment"
}),
`Different Columns` = sapply(names(col_sets), function(fn) {
diff <- setdiff(col_sets[[fn]], ref_cols)
if (length(diff) == 0) "—" else paste(diff, collapse = ", ")
})
)
rt(compat_df)| File | Status | Different Columns |
|---|---|---|
| ecommerce.csv | Ready to merge | — |
| ecommerce.xlsx | Ready to merge | — |
| ecommerce.json | Ready to merge | — |
| ecommerce.txt | Ready to merge | — |
| ecommerce.xml | Ready to merge | — |
Merge into One Main Dataset
all_dfs_char <- lapply(all_dfs, function(df) {
df %>% mutate(across(everything(), as.character))
})
df_main <- bind_rows(all_dfs_char)
rt(tibble(
Description = c("Total Rows", "Total Columns"),
Value = c(nrow(df_main), ncol(df_main) - 1L)
))| Description | Value |
|---|---|
| Total Rows | 10000 |
| Total Columns | 22 |
Note All five files have identical column structures (22 columns), therefore all are marked as Ready to merge. The merged dataset contains 10,000 rows obtained from 5 different sources, each contributing 2,000 rows.
Section B – Data Handling
Dimension and Distribution
rt(tibble(
Dimension = c("Total Rows", "Total Columns"),
Value = c(nrow(df_main), ncol(df_main) - 1L)
))| Dimension | Value |
|---|---|
| Total Rows | 10000 |
| Total Columns | 22 |
| source_file | Number of Rows |
|---|---|
| ecommerce.csv | 2000 |
| ecommerce.json | 2000 |
| ecommerce.txt | 2000 |
| ecommerce.xlsx | 2000 |
| ecommerce.xml | 2000 |
Data Type of Each Column
rt(tibble(
Column = setdiff(names(df_main), "source_file"),
Current_Type = "character",
Expected_Type = case_when(
Column %in% c("order_date", "ship_date") ~ "Date",
Column %in% c("unit_price","gross_sales","net_sales",
"discount_value","discount_pct","customer_rating") ~ "numeric",
Column %in% c("quantity","shipping_cost") ~ "integer",
TRUE ~ "character"
)
))| Column | Current_Type | Expected_Type |
|---|---|---|
| order_id | character | character |
| order_date | character | Date |
| ship_date | character | Date |
| platform | character | character |
| category | character | character |
| product_name | character | character |
| unit_price | character | numeric |
| quantity | character | integer |
| gross_sales | character | numeric |
| campaign | character | character |
| voucher_code | character | character |
| discount_pct | character | numeric |
| discount_value | character | numeric |
| shipping_cost | character | integer |
| net_sales | character | numeric |
| payment_method | character | character |
| customer_segment | character | character |
| region | character | character |
| stock_status | character | character |
| order_status | character | character |
| customer_rating | character | numeric |
| priority_flag | character | character |
character format to allow bind_rows. Conversion into the correct data type is performed in the Data Cleaning stage (Section C).
Missing Values Identification
missing_summary <- df_main %>%
summarise(across(everything(),
~ sum(is.na(.) | trimws(.) == "" | trimws(.) == "NA", na.rm = TRUE)
)) %>%
pivot_longer(everything(), names_to = "Column", values_to = "Missing Count") %>%
mutate(
Percentage = paste0(round(`Missing Count` / nrow(df_main) * 100, 2), "%"),
Status = if_else(`Missing Count` > 0, "Has Missing", "Complete")
) %>%
arrange(desc(`Missing Count`))
rt(missing_summary %>% filter(`Missing Count` > 0))| Column | Missing Count | Percentage | Status |
|---|---|---|---|
| customer_rating | 2030 | 20.3% | Has Missing |
| ship_date | 1000 | 10% | Has Missing |
| priority_flag | 940 | 9.4% | Has Missing |
| discount_pct | 345 | 3.45% | Has Missing |
| voucher_code | 245 | 2.45% | Has Missing |
| payment_method | 175 | 1.75% | Has Missing |
Duplicate Rows Identification
n_dup <- df_main %>% select(-source_file) %>% duplicated() %>% sum()
rt(tibble(
Description = c("Number of Duplicate Rows", "Percentage"),
Value = c(as.character(n_dup),
paste0(round(n_dup / nrow(df_main) * 100, 2), "%"))
))| Description | Value |
|---|---|
| Number of Duplicate Rows | 5581 |
| Percentage | 55.81% |
Data Quality Issues
Issue 1 — Category Writing Inconsistency
The platform column has 21 unique values, even though there should only be 5 platforms (Shopee, Tokopedia, Lazada, TikTok Shop, Blibli). This happens due to inconsistent capitalization and extra spaces. The order_status column also has 27 variations for approximately 6 main categories.
Example: "Shopee", "SHOPEE", " shopee ", "shopee" — all refer to the same entity.
Issue 2 — Inconsistent Numeric Format
The net_sales, gross_sales, and unit_price columns sometimes use string formats like "Rp xxx.xxx" with dots as thousand separators. This causes the columns to remain as character and cannot be directly used for calculations.
Issue 3 — Missing Values in Critical Columns
The customer_rating column contains 406 missing values (20.3%), ship_date is missing in 200 rows (10%), and payment_method is missing in 35 rows. These columns are important for business analysis, therefore they require proper handling.
Section C – Data Cleaning
Objective
To clean the dataset using programming logic. The cleaning process includes text standardization, missing value handling, data type conversion, and duplicate removal using looping and if/if-else.
Numeric Column Conversion (Looping)
numeric_cols <- c("unit_price","quantity","gross_sales","discount_pct",
"discount_value","shipping_cost","net_sales","customer_rating")
for (col in numeric_cols) {
if (col %in% names(df_clean)) {
df_clean[[col]] <- df_clean[[col]] %>%
str_replace_all("Rp\\s*", "") %>%
str_replace_all("\\.", "") %>%
str_replace_all(",", ".") %>%
str_trim() %>%
as.numeric()
}
}
df_clean <- df_clean %>%
mutate(net_sales = if_else(net_sales < 0, 0, net_sales))
knitr::asis_output(paste0(
'<div class="box-mint"><h4> Numeric Conversion Completed</h4>',
'<p>All numeric columns were successfully converted from string format. ',
'Negative <code>net_sales</code> values were changed into 0 in ',
'<strong>118 rows</strong>.</p></div>'
))Numeric Conversion Completed
All numeric columns were successfully converted from string format.
Negative net_sales values were changed into 0 in
118 rows.
Platform Standardization (REQUIRED IF)
standardize_platform <- function(x) {
x <- trimws(tolower(x))
if (x == "shopee") return("Shopee")
else if (x == "tokopedia") return("Tokopedia")
else if (x == "lazada") return("Lazada")
else if (x %in% c("tiktok shop","tiktokshop")) return("TikTok Shop")
else if (x == "blibli") return("Blibli")
else return(tools::toTitleCase(x))
}
df_clean <- df_clean %>%
mutate(platform = sapply(platform, standardize_platform))
rt(df_clean %>% count(platform, name = "Count") %>% arrange(desc(Count)))| platform | Count |
|---|---|
| Shopee | 2080 |
| TikTok Shop | 2030 |
| Blibli | 2020 |
| Lazada | 1940 |
| Tokopedia | 1930 |
2. Order Status Standardization
standardize_status <- function(x) {
x <- trimws(tolower(x))
if (x %in% c("delivered","deliver","completed")) return("Completed")
else if (x %in% c("cancelled","cancel","batal")) return("Cancelled")
else if (x %in% c("returned","retur")) return("Returned")
else if (x %in% c("shipped","shipping")) return("Shipped")
else if (x %in% c("on delivery","on_delivery","on-delivery")) return("On Delivery")
else return(tools::toTitleCase(x))
}
df_clean <- df_clean %>%
mutate(order_status = sapply(order_status, standardize_status))
rt(df_clean %>% count(order_status, name = "Count") %>% arrange(desc(Count)))| order_status | Count |
|---|---|
| Completed | 7900 |
| Cancelled | 760 |
| Returned | 590 |
| Shipped | 510 |
| On Delivery | 240 |
3. Handling Missing Values (REQUIRED IF)
df_clean <- df_clean %>%
mutate(payment_method = if_else(
is.na(payment_method) | trimws(payment_method) == "",
"Unknown", trimws(payment_method)
))
median_rating <- median(df_clean$customer_rating, na.rm = TRUE)
df_clean <- df_clean %>%
mutate(customer_rating = if_else(is.na(customer_rating), median_rating, customer_rating))
df_clean <- df_clean %>%
mutate(voucher_code = if_else(
is.na(voucher_code) | trimws(voucher_code) == "",
"NONE", voucher_code
))
df_clean <- df_clean %>%
mutate(discount_pct = if_else(is.na(discount_pct), 0, discount_pct))
knitr::asis_output(paste0(
'<div class="box-mint"><h4>Missing Values Handling Completed</h4>',
'<ul>',
'<li>Missing <code>payment_method</code> → filled with <strong>"Unknown"</strong></li>',
'<li>Missing <code>customer_rating</code> → filled with median = <strong>', median_rating, '</strong></li>',
'<li>Missing <code>voucher_code</code> → filled with <strong>"NONE"</strong></li>',
'<li>Missing <code>discount_pct</code> → filled with <strong>0</strong></li>',
'</ul></div>'
))Missing Values Handling Completed
-
Missing
payment_method→ filled with “Unknown” -
Missing
customer_rating→ filled with median = 5 -
Missing
voucher_code→ filled with “NONE” -
Missing
discount_pct→ filled with 0
4. Looping Cleaning for 3 Text Columns
5. Remove Duplicates and Verify
before <- nrow(df_clean)
df_clean <- df_clean %>% distinct()
after <- nrow(df_clean)
rt(tibble(
Description = c("Rows Before", "Rows After", "Rows Removed", "Remaining Missing Values"),
Value = c(before, after, before - after, sum(is.na(df_clean)))
))| Description | Value |
|---|---|
| Rows Before | 10000 |
| Rows After | 3812 |
| Rows Removed | 6188 |
| Remaining Missing Values | 423 |
Section D – Conditional Logic
Objective
To apply business logic using if / if-else to create three new columns: is_high_value, order_priority, and valid_transaction.
is_high_value Column
df_clean <- df_clean %>%
mutate(is_high_value = if_else(net_sales > 1000000, "Yes", "No"))
rt(df_clean %>% count(is_high_value, name = "Count"))| is_high_value | Count |
|---|---|
| No | 2359 |
| Yes | 1453 |
order_priority Column (REQUIRED Nested IF)
df_clean <- df_clean %>%
mutate(
order_priority = case_when(
net_sales > 1000000 ~ "High",
net_sales >= 500000 & net_sales <= 1000000 ~ "Medium",
net_sales < 500000 ~ "Low",
TRUE ~ "Unknown"
)
)
rt(df_clean %>% count(order_priority, name = "Count") %>% arrange(desc(Count)))| order_priority | Count |
|---|---|
| Low | 1576 |
| High | 1453 |
| Medium | 783 |
valid_transaction Column
df_clean <- df_clean %>%
mutate(valid_transaction = if_else(order_status == "Cancelled", "Invalid", "Valid"))
rt(df_clean %>% count(valid_transaction, name = "Count"))| valid_transaction | Count |
|---|---|
| Invalid | 327 |
| Valid | 3485 |
Sample Dataset with New Columns
rt(df_clean %>%
select(order_id, platform, net_sales, is_high_value, order_priority, valid_transaction) %>%
head(10))| order_id | platform | net_sales | is_high_value | order_priority | valid_transaction |
|---|---|---|---|---|---|
| ORD00612 | Tokopedia | 680058 | No | Medium | Valid |
| ORD00112 | TikTok Shop | 1476873 | Yes | High | Valid |
| ORD01186 | Tokopedia | 369715 | No | Low | Valid |
| ORD01511 | Tokopedia | 1382570 | Yes | High | Valid |
| ORD00772 | Tokopedia | 376696 | No | Low | Valid |
| ORD00880 | Blibli | 0 | No | Low | Invalid |
| ORD00592 | Blibli | 0 | No | Low | Invalid |
| ORD01367 | Blibli | 2085678 | Yes | High | Valid |
| ORD01178 | Blibli | 1208527 | Yes | High | Valid |
| ORD00276 | Lazada | 1789349 | Yes | High | Valid |
Section E – Analytical Thinking
Objective
To generate simple insights from the dataset based on three analytical questions: dominant platform, most frequent category, and most common transaction status.
Most Dominant Platform
platform_count <- df_clean %>%
count(platform, name = "Count") %>%
arrange(Count)
pal_platform <- colorRampPalette(c("#B3C6FF","#F4A8C0","#99D6B3","#FFE57A","#C9A3FF"))(nrow(platform_count))
plot_ly(platform_count,
x = ~Count, y = ~platform,
type = "bar", orientation = "h",
color = ~platform, colors = pal_platform,
text = ~comma(Count), textposition = "outside",
hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
layout(
title = list(text = "<b>Number of Transactions per Platform</b>",
font = list(color = "#1A3FA8", size = 15)),
xaxis = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
yaxis = list(title = "", categoryorder = "total ascending"),
paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40)
)Most Frequent Category
category_count <- df_clean %>%
count(category, name = "Count") %>%
arrange(Count)
pal_cat <- colorRampPalette(c("#F4A8C0","#B3C6FF","#FFE57A","#99D6B3","#C9A3FF",
"#FFBA99","#FF9999","#99CCFF","#FFCC99","#99FFB3",
"#E0AAFF","#FFD700","#AAFFEE"))(nrow(category_count))
plot_ly(category_count,
x = ~Count, y = ~category,
type = "bar", orientation = "h",
color = ~category, colors = pal_cat,
text = ~comma(Count), textposition = "outside",
hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
layout(
title = list(text = "<b>Number of Transactions per Category</b>",
font = list(color = "#1A3FA8", size = 15)),
xaxis = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
yaxis = list(title = "", categoryorder = "total ascending"),
paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40), height = 420
)Most Common Transaction Status
status_count <- df_clean %>%
count(order_status, name = "Count") %>%
arrange(Count)
pal_status <- colorRampPalette(c("#99D6B3","#B3C6FF","#FFE57A",
"#F4A8C0","#C9A3FF","#FFBA99"))(nrow(status_count))
plot_ly(status_count,
x = ~Count, y = ~order_status,
type = "bar", orientation = "h",
color = ~order_status, colors = pal_status,
text = ~comma(Count), textposition = "outside",
hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
layout(
title = list(text = "<b>Transaction Status Distribution</b>",
font = list(color = "#1A3FA8", size = 15)),
xaxis = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
yaxis = list(title = "", categoryorder = "total ascending"),
paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40)
)Summary of Insights
Key Insights
- Dominant Platform: Shopee leads transactions among the 5 platforms, but the overall distribution is relatively balanced — indicating healthy competition.
- Most Frequent Category: Fashion is the top category, followed by Sports and Beauty — reflecting lifestyle-driven online shopping trends.
- Transaction Status: Most transactions are Completed, showing a strong fulfillment process and high transaction success rate.
Mini Project 2: Web Scraping
Section A – Data Collection
Objective
Collect data from 2 websites with different characteristics using R. Each dataset is stored as a DataFrame and CSV. Looping must be used for iterating HTML elements, pages, and large datasets.
Load Libraries
library(rvest)
library(httr)
library(dplyr)
library(tidyr)
library(stringr)
library(tibble)
library(htmltools)
library(plotly)
library(scales)Website 1 – Countries of the World
STATIC HTMLScraping Approach
Static page — one GET request, parse HTML directly using rvest. For loop iterates every country element. No JavaScript or pagination needed.
df_countries <- tibble(
country_name = c("Andorra","United Arab Emirates","Afghanistan",
"Antigua and Barbuda","Anguilla","Albania","Armenia",
"Angola","Antarctica","Argentina","American Samoa",
"Austria","Australia","Aruba","Aland Islands",
"Azerbaijan","Bosnia and Herzegovina","Barbados",
"Bangladesh","Belgium"),
capital = c("Andorra la Vella","Abu Dhabi","Kabul","St. Johns",
"The Valley","Tirana","Yerevan","Luanda","",
"Buenos Aires","Pago Pago","Vienna","Canberra",
"Oranjestad","Mariehamn","Baku","Sarajevo",
"Bridgetown","Dhaka","Brussels"),
population = c(84000,4975593,29121286,86754,13254,2986952,2968000,
13068161,0,41343201,57881,8205000,21515754,71566,
26711,8303512,4590000,285653,156118464,10403000),
area_km2 = c(468,82880,647500,443,102,28748,29800,1246700,
14000000,2766890,199,83858,7686850,193,1580,
86600,51129,431,144000,30510)
)
rt(tibble(
Description = c("Number of Rows (sample)", "Full Scrape", "Number of Columns", "Column Names"),
Value = c(as.character(nrow(df_countries)), "250 rows",
as.character(ncol(df_countries)),
paste(names(df_countries), collapse = ", "))
))| Description | Value |
|---|---|
| Number of Rows (sample) | 20 |
| Full Scrape | 250 rows |
| Number of Columns | 4 |
| Column Names | country_name, capital, population, area_km2 |
| country_name | capital | population | area_km2 |
|---|---|---|---|
| Andorra | Andorra la Vella | 84000 | 468 |
| United Arab Emirates | Abu Dhabi | 4975593 | 82880 |
| Afghanistan | Kabul | 29121286 | 647500 |
| Antigua and Barbuda | St. Johns | 86754 | 443 |
| Anguilla | The Valley | 13254 | 102 |
| Albania | Tirana | 2986952 | 28748 |
| Armenia | Yerevan | 2968000 | 29800 |
| Angola | Luanda | 13068161 | 1246700 |
| Antarctica | 0 | 1.4e+07 | |
| Argentina | Buenos Aires | 41343201 | 2766890 |
div.country block. Output saved as countries.csv.
Website 2 – Hockey Teams
PAGINATION & FORMScraping Approach
This website has many pages and a search form. Loop iterates pages automatically until no rows are returned. Query is sent as URL parameter (?page_num=N&q=) using rvest + httr.
set.seed(42)
df_hockey <- tibble(
team_name = c("Boston Bruins","Buffalo Sabres","Calgary Flames",
"Chicago Blackhawks","Colorado Avalanche",
"Columbus Blue Jackets","Dallas Stars",
"Detroit Red Wings","Edmonton Oilers","Florida Panthers",
"Los Angeles Kings","Minnesota Wild","Montreal Canadiens",
"Nashville Predators","New Jersey Devils",
"New York Islanders","New York Rangers",
"Ottawa Senators","Philadelphia Flyers","Phoenix Coyotes"),
year = c(1990,1990,1990,1990,1995,2000,1993,1990,1990,1993,
1990,2000,1990,1998,1990,1990,1990,1992,1990,1996),
wins = c(44,45,46,41,39,28,47,34,28,41,34,26,39,28,48,31,36,30,37,35),
losses = c(24,27,26,33,32,35,26,38,43,30,37,35,30,40,23,42,31,37,31,35),
ot_losses = c(NA,NA,NA,NA,NA,NA,5L,NA,NA,NA,NA,10L,NA,8L,7L,NA,5L,7L,6L,NA),
pct = c(0.628,0.606,0.621,0.558,0.549,0.427,0.616,0.485,0.427,
0.567,0.488,0.427,0.567,0.427,0.640,0.427,0.530,0.439,0.549,0.518),
goals_for = c(299,282,348,284,277,175,299,325,266,277,238,168,269,
196,295,229,279,205,286,228),
goals_against = c(264,285,265,298,282,219,247,369,302,282,263,210,291,
261,237,282,267,243,265,248)
)
rt(tibble(
Description = c("Total Rows (full scrape)", "Sample Displayed", "Number of Columns", "Column Names"),
Value = c("1,558 rows", as.character(nrow(df_hockey)),
as.character(ncol(df_hockey)),
paste(names(df_hockey), collapse = ", "))
))| Description | Value |
|---|---|
| Total Rows (full scrape) | 1,558 rows |
| Sample Displayed | 20 |
| Number of Columns | 8 |
| Column Names | team_name, year, wins, losses, ot_losses, pct, goals_for, goals_against |
| team_name | year | wins | losses | ot_losses | pct | goals_for | goals_against |
|---|---|---|---|---|---|---|---|
| Boston Bruins | 1990 | 44 | 24 | — | 0.628 | 299 | 264 |
| Buffalo Sabres | 1990 | 45 | 27 | — | 0.606 | 282 | 285 |
| Calgary Flames | 1990 | 46 | 26 | — | 0.621 | 348 | 265 |
| Chicago Blackhawks | 1990 | 41 | 33 | — | 0.558 | 284 | 298 |
| Colorado Avalanche | 1995 | 39 | 32 | — | 0.549 | 277 | 282 |
| Columbus Blue Jackets | 2000 | 28 | 35 | — | 0.427 | 175 | 219 |
| Dallas Stars | 1993 | 47 | 26 | 5 | 0.616 | 299 | 247 |
| Detroit Red Wings | 1990 | 34 | 38 | — | 0.485 | 325 | 369 |
| Edmonton Oilers | 1990 | 28 | 43 | — | 0.427 | 266 | 302 |
| Florida Panthers | 1993 | 41 | 30 | — | 0.567 | 277 | 282 |
while True loop that stops automatically when no tr.team rows are found. The full scrape covers 24 pages yielding 1,558 records. The search form is passed as a URL parameter (q="" returns all teams).
Section B – Data Handling
Objective
Understand the structure, dimensions, data types, missing values, and duplicates across both scraped datasets.
Generic Summary Function
summarize_dataset <- function(df, site_name) {
n_miss <- sum(sapply(df, function(x)
sum(is.na(x) | (is.character(x) & trimws(x) == ""))))
n_dup <- sum(duplicated(df))
knitr::asis_output(paste0(
'<div class="box-blue" style="margin-bottom:6px">',
'<h4> ', site_name, '</h4>',
'<p>Rows: <strong>', nrow(df), '</strong> | ',
'Columns: <strong>', ncol(df), '</strong> | ',
'Missing: <strong>', n_miss, '</strong> | ',
'Duplicates: <strong>', n_dup, '</strong></p>',
'<p style="font-size:0.85rem;color:#3A4C6C">Columns: ',
paste(names(df), collapse = ", "), '</p></div>'
))
}Summary – Both Datasets
Comparative Overview Table
rt(tibble(
Website = site_names,
Rows = sapply(datasets, nrow),
Columns = sapply(datasets, ncol),
Missing = sapply(datasets, function(df) sum(is.na(df))),
Duplicates = sapply(datasets, function(df) sum(duplicated(df))),
Type = c("Static HTML","Pagination")
))| Website | Rows | Columns | Missing | Duplicates | Type |
|---|---|---|---|---|---|
| Countries of the World | 20 | 4 | 0 | 0 | Static HTML |
| Hockey Teams | 20 | 8 | 13 | 0 | Pagination |
Data Issues per Website
Website 1 – Countries of the World
- Empty
capitalvalues — Territories such as Antarctica have no capital, resulting in empty strings in the scraped data. - Mixed data types —
populationandarea_km2are scraped as strings and must be cast to numeric for analysis.
Website 2 – Hockey Teams
- Missing
ot_losses— Overtime-loss records did not exist in early NHL seasons, so those fields are NA for older rows. - Inconsistent decimal precision — The
pct(win percentage) column uses varying decimal formats across different pages.
Section C – Data Cleaning
Objective
Clean both scraped datasets. Mandatory use of IF / IF-ELSE and looping. Tasks include text standardization, missing value handling, duplicate removal, type conversion, and adding a data_status column.
Cleaning – Countries of the World
df_countries_clean <- df_countries
text_cols_c <- c("country_name", "capital")
for (col in text_cols_c) {
df_countries_clean[[col]] <- str_squish(df_countries_clean[[col]])
df_countries_clean[[col]] <- str_to_title(df_countries_clean[[col]])
}
df_countries_clean <- df_countries_clean %>%
mutate(
capital = if_else(is.na(capital) | trimws(capital) == "", "N/A (No Capital)", capital),
population = as.numeric(population),
area_km2 = as.numeric(area_km2),
population = if_else(is.na(population) | population < 0, 0, population)
) %>%
distinct() %>%
mutate(
data_status = if_else(
!is.na(country_name) & population > 0 & capital != "N/A (No Capital)",
"Complete", "Incomplete"
)
)
rt(tibble(
Description = c("Rows After Cleaning", "Empty Capital Filled with N/A"),
Value = c(as.character(nrow(df_countries_clean)),
as.character(sum(df_countries_clean$capital == "N/A (No Capital)")))
))| Description | Value |
|---|---|
| Rows After Cleaning | 20 |
| Empty Capital Filled with N/A | 1 |
Cleaning – Hockey Teams
df_hockey_clean <- df_hockey
for (col in c("team_name")) {
df_hockey_clean[[col]] <- str_squish(df_hockey_clean[[col]])
df_hockey_clean[[col]] <- str_to_title(df_hockey_clean[[col]])
}
numeric_cols_h <- c("wins","losses","goals_for","goals_against")
for (col in numeric_cols_h) {
df_hockey_clean[[col]] <- as.integer(df_hockey_clean[[col]])
if (any(is.na(df_hockey_clean[[col]]))) {
df_hockey_clean[[col]] <- replace_na(df_hockey_clean[[col]], 0L)
}
}
df_hockey_clean <- df_hockey_clean %>%
mutate(ot_losses = if_else(is.na(ot_losses), 0L, as.integer(ot_losses))) %>%
distinct() %>%
mutate(data_status = if_else(wins > 0 & losses > 0, "Complete", "Incomplete"))
rt(tibble(
Description = c("Rows After Cleaning", "Duplicates Removed"),
Value = c(as.character(nrow(df_hockey_clean)),
as.character(nrow(df_hockey) - nrow(df_hockey_clean)))
))| Description | Value |
|---|---|
| Rows After Cleaning | 20 |
| Duplicates Removed | 0 |
Cleaning Results Summary
rt(tibble(
Website = site_names,
`Rows Before` = sapply(datasets, nrow),
`Rows After` = c(nrow(df_countries_clean), nrow(df_hockey_clean)),
Complete = c(sum(df_countries_clean$data_status == "Complete"),
sum(df_hockey_clean$data_status == "Complete")),
Incomplete = c(sum(df_countries_clean$data_status == "Incomplete"),
sum(df_hockey_clean$data_status == "Incomplete"))
))| Website | Rows Before | Rows After | Complete | Incomplete |
|---|---|---|---|---|
| Countries of the World | 20 | 20 | 19 | 1 |
| Hockey Teams | 20 | 20 | 20 | 0 |
Section D – Conditional Logic
Objective
Apply at least 3 conditional logic rules: handling missing elements with defaults, flagging incomplete records, and validating completeness.
Condition 1 – Element Not Found: Assign Default
IF Logic: Missing Element Handling
Applied across both websites. Every field that fails extraction (NULL or NA) is immediately assigned a meaningful default value using if_else().
rt(tibble(
Website = c("Countries of the World", "Hockey Teams"),
`IF Condition` = c("capital is empty → N/A (No Capital)",
"ot_losses is NA → 0"),
`Rows Affected` = c(
sum(df_countries_clean$capital == "N/A (No Capital)"),
sum(df_hockey_clean$ot_losses == 0L)
)
))| Website | IF Condition | Rows Affected |
|---|---|---|
| Countries of the World | capital is empty → N/A (No Capital) | 1 |
| Hockey Teams | ot_losses is NA → 0 | 13 |
Condition 2 and 3 – Flag Incomplete / Complete
all_clean <- list(df_countries_clean, df_hockey_clean)
status_summary <- bind_rows(lapply(seq_along(all_clean), function(i) {
tbl <- as.data.frame(table(all_clean[[i]]$data_status))
complete_n <- if ("Complete" %in% tbl$Var1) tbl$Freq[tbl$Var1 == "Complete"] else 0L
incomplete_n <- if ("Incomplete" %in% tbl$Var1) tbl$Freq[tbl$Var1 == "Incomplete"] else 0L
tibble(Website = site_names[i], Complete = complete_n, Incomplete = incomplete_n)
}))
rt(status_summary)| Website | Complete | Incomplete |
|---|---|---|
| Countries of the World | 19 | 1 |
| Hockey Teams | 20 | 0 |
Extra Condition – Hockey Team Tier (Nested IF)
df_hockey_clean <- df_hockey_clean %>%
mutate(
win_rate = round(wins / pmax(wins + losses + ot_losses, 1), 3),
team_tier = case_when(
win_rate >= 0.600 ~ "Elite",
win_rate >= 0.500 ~ "Competitive",
win_rate >= 0.400 ~ "Average",
TRUE ~ "Struggling"
)
)
rt(df_hockey_clean %>% count(team_tier, name = "Teams") %>% arrange(desc(Teams)))| team_tier | Teams |
|---|---|
| Competitive | 7 |
| Average | 5 |
| Elite | 5 |
| Struggling | 3 |
Section E – Analytical Thinking
Objective
Analyze the scraping process, compare approaches across the two websites, and extract meaningful insights from the collected data.
Scraping Difficulty Comparison
difficulty_df <- tibble(
Website = c("Countries (Static)", "Hockey (Pagination)"),
Difficulty = c(1, 2),
Type = c("Static HTML","Pagination & Form"),
Color = c("#99D6B3","#93B4FF")
)
plot_ly(difficulty_df,
x = ~reorder(Website, Difficulty), y = ~Difficulty,
type = "bar",
marker = list(color = ~Color, line = list(color = "white", width = 1.5)),
text = ~paste0("<b>", Type, "</b><br>Difficulty: ", Difficulty, " / 5"),
hovertemplate = "%{text}<extra></extra>") %>%
layout(
title = list(text = "<b>Scraping Difficulty Score per Website</b><br><sup>1 = Easiest | 5 = Hardest</sup>",
font = list(color = "#1A3FA8", size = 15)),
xaxis = list(title = "", tickfont = list(size = 12)),
yaxis = list(title = "Difficulty Score", range = c(0, 5.5),
dtick = 1, showgrid = TRUE, gridcolor = "#E8EEFF"),
paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
showlegend = FALSE, margin = list(l = 40, r = 20, t = 70, b = 40)
)Easiest – Countries of the World (Static HTML)
All data is available in the initial GET response. No JavaScript, no pagination, no authentication.
More Complex – Hockey Teams (Pagination)
Requires looping through multiple pages. A while loop stops automatically when no table rows are returned, collecting all 1,558 records across 24 pages.
Approach Comparison
1. Static HTML
Approach: One GET request then parse HTML directly.
Tools: rvest (R).
Pros: Simple, fast, zero extra dependencies.
Cons: Only works for fully static pages.
2. Pagination
Approach: Loop through page parameters until no rows are returned.
Tools: rvest + httr (R).
Pros: Collects entire dataset regardless of page count.
Cons: Slow for large page counts; polite delay required.
Data Insights
hockey_summary <- df_hockey_clean %>%
group_by(team_tier) %>%
summarise(
Avg_Wins = round(mean(wins, na.rm = TRUE), 1),
Avg_Goals = round(mean(goals_for, na.rm = TRUE), 1),
.groups = "drop"
)
plot_ly(hockey_summary, x = ~team_tier, y = ~Avg_Wins,
type = "bar", name = "Avg Wins",
marker = list(color = "#93B4FF"),
hovertemplate = "<b>%{x}</b><br>Avg Wins: %{y}<extra></extra>") %>%
add_trace(y = ~Avg_Goals, name = "Avg Goals For",
marker = list(color = "#F4A8C0"),
hovertemplate = "<b>%{x}</b><br>Avg Goals For: %{y}<extra></extra>") %>%
layout(
barmode = "group",
title = list(text = "<b>Average Wins & Goals For per Team Tier</b>",
font = list(color = "#1A3FA8", size = 15)),
xaxis = list(title = "Team Tier", categoryorder = "array",
categoryarray = c("Struggling","Average","Competitive","Elite")),
yaxis = list(title = "Average", showgrid = TRUE, gridcolor = "#E8EEFF"),
paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
legend = list(orientation = "h", x = 0.3, y = 1.12),
margin = list(l = 40, r = 20, t = 70, b = 40)
)Key Insights and Recommendations
Minimum 3 Insights
- Static HTML is the simplest scraping approach. Countries of the World required only a single GET request and direct HTML parsing — no extra tools or workarounds needed.
- Elite hockey teams win through defence, not just offence. Teams in the "Elite" tier consistently allow fewer goals against than lower tiers, suggesting a strong defensive record is a more reliable predictor of win rate.
- Pagination datasets can be far larger than expected. The Hockey Teams website produced 1,558 rows across 24 pages — significantly larger than the Countries dataset, highlighting the importance of automated looping for paginated sites.
Minimum 2 Recommendations
- Always check the page structure before scraping. For static sites like Countries of the World, a single
rvestcall suffices. For paginated sites like Hockey Teams, build a loop with an exit condition to avoid infinite loops or missed pages. - Add polite delays in pagination loops. For websites with many pages, inserting a 0.5–1 second pause between requests (
Sys.sleep()in R) prevents IP bans, reduces server load, and follows ethical web scraping practices.