MIDTERM EXAM

Mini Project 1: E-Commerce

Section A

Data Collection Using Programming

Section B

Data Handling

Section C

Data Cleaning

Section D

Conditional Logic

Section E

Analytical Thinking

Section A – Data Collection

Objective

To retrieve and combine data from multiple sources using R. The dataset consists of 5 different files in CSV, XLSX, JSON, TXT, and XML formats, which are automatically read using looping.

Load Library

library(rvest)
library(httr)
library(dplyr)
library(tidyr)
library(stringr)
library(knitr)
library(kableExtra)
library(ggplot2)
library(plotly)
library(scales)
library(tibble)
library(reactable)
library(htmltools)
library(readr)
library(readxl)
library(jsonlite)
library(xml2)

# ── Reactable helper (used throughout the document) ───────────────────────────
rt <- function(df, page_size = 5, ...) {
  htmltools::browsable(
    reactable(
      df,
      searchable          = TRUE,
      pagination          = TRUE,
      defaultPageSize     = page_size,
      showPageSizeOptions = TRUE,
      bordered            = TRUE,
      striped             = TRUE,
      highlight           = TRUE,
      fullWidth           = TRUE,
      resizable           = TRUE,
      defaultColDef       = colDef(align = "center"),
      theme = reactableTheme(
        headerStyle  = list(background = "#DCEEFF",
                            fontWeight = "bold",
                            textAlign  = "center"),
        rowStyle     = list(fontSize = "14px"),
        stripedColor = "#F7FBFF"
      ),
      ...
    )
  )
}

Reading 5 Files Using Looping

Looping Requirement

All files must be read automatically using a for loop. For each file, display the number of rows, number of columns, and column names.

file_list <- list(
  list(name = "ecommerce.csv",  type = "csv"),
  list(name = "ecommerce.xlsx", type = "xlsx"),
  list(name = "ecommerce.json", type = "json"),
  list(name = "ecommerce.txt",  type = "txt"),
  list(name = "ecommerce.xml",  type = "xml")
)

all_dfs <- list()

for (i in seq_along(file_list)) {
  fname <- file_list[[i]]$name
  ftype <- file_list[[i]]$type
  message("Reading: ", fname)

  df <- tryCatch({
    if (ftype == "csv") {
      read_csv(fname, show_col_types = FALSE)
    } else if (ftype == "xlsx") {
      read_excel(fname)
    } else if (ftype == "json") {
      as_tibble(fromJSON(fname))
    } else if (ftype == "txt") {
      read_delim(fname, delim = "|", show_col_types = FALSE)
    } else if (ftype == "xml") {
      xml_doc <- read_xml(fname)
      records <- xml_find_all(xml_doc, "//Record")
      xml_data <- lapply(records, function(rec) {
        nodes <- xml_children(rec)
        setNames(as.list(xml_text(nodes)), xml_name(nodes))
      })
      bind_rows(lapply(xml_data, as_tibble))
    }
  }, error = function(e) {
    message("ERROR reading: ", fname, " | ", e$message)
    return(NULL)
  })

  if (!is.null(df)) {
    df$source_file <- fname
    all_dfs[[fname]] <- df
    message("Success: ", fname, " | Rows: ", nrow(df), " | Columns: ", ncol(df))
  }
}

summary_read <- tibble(
  File    = names(all_dfs),
  Rows    = sapply(all_dfs, nrow),
  Columns = sapply(all_dfs, function(df) ncol(df) - 1),
  Status  = "Success"
)

rt(summary_read)

Check Column Compatibility (IF/IF-ELSE)

Validation Logic

Each file is compared based on its column structure with the reference file (CSV). If identical, the status is "Ready to merge", otherwise "Need adjustment".

col_sets <- lapply(all_dfs, function(df) sort(setdiff(names(df), "source_file")))

if (length(col_sets) == 0) stop("No files were successfully read.")

ref_cols <- col_sets[[1]]

compat_df <- tibble(
  File             = names(col_sets),
  Status           = sapply(names(col_sets), function(fn) {
                       if (identical(col_sets[[fn]], ref_cols)) "Ready to merge"
                       else "Need adjustment"
                     }),
  `Different Columns` = sapply(names(col_sets), function(fn) {
                           diff <- setdiff(col_sets[[fn]], ref_cols)
                           if (length(diff) == 0) "—" else paste(diff, collapse = ", ")
                         })
)

rt(compat_df)

Merge into One Main Dataset

all_dfs_char <- lapply(all_dfs, function(df) {
  df %>% mutate(across(everything(), as.character))
})

df_main <- bind_rows(all_dfs_char)

rt(tibble(
  Description = c("Total Rows", "Total Columns"),
  Value       = c(nrow(df_main), ncol(df_main) - 1L)
))

Note All five files have identical column structures (22 columns), therefore all are marked as Ready to merge. The merged dataset contains 10,000 rows obtained from 5 different sources, each contributing 2,000 rows.

Section B – Data Handling

Dimension and Distribution

rt(tibble(
  Dimension = c("Total Rows", "Total Columns"),
  Value     = c(nrow(df_main), ncol(df_main) - 1L)
))

rt(df_main %>% count(source_file, name = "Number of Rows"))

Data Type of Each Column

rt(tibble(
  Column        = setdiff(names(df_main), "source_file"),
  Current_Type  = "character",
  Expected_Type = case_when(
    Column %in% c("order_date", "ship_date")                         ~ "Date",
    Column %in% c("unit_price","gross_sales","net_sales",
                  "discount_value","discount_pct","customer_rating") ~ "numeric",
    Column %in% c("quantity","shipping_cost")                        ~ "integer",
    TRUE                                                             ~ "character"
  )
))

Note During multi-format merging (CSV + JSON + XML + TXT + XLSX), all columns were converted into character format to allow bind_rows. Conversion into the correct data type is performed in the Data Cleaning stage (Section C).

Missing Values Identification

missing_summary <- df_main %>%
  summarise(across(everything(),
    ~ sum(is.na(.) | trimws(.) == "" | trimws(.) == "NA", na.rm = TRUE)
  )) %>%
  pivot_longer(everything(), names_to = "Column", values_to = "Missing Count") %>%
  mutate(
    Percentage = paste0(round(`Missing Count` / nrow(df_main) * 100, 2), "%"),
    Status     = if_else(`Missing Count` > 0, "Has Missing", "Complete")
  ) %>%
  arrange(desc(`Missing Count`))

rt(missing_summary %>% filter(`Missing Count` > 0))

Duplicate Rows Identification

n_dup <- df_main %>% select(-source_file) %>% duplicated() %>% sum()

rt(tibble(
  Description = c("Number of Duplicate Rows", "Percentage"),
  Value       = c(as.character(n_dup),
                  paste0(round(n_dup / nrow(df_main) * 100, 2), "%"))
))

Data Quality Issues

Issue 1 — Category Writing Inconsistency

The platform column has 21 unique values, even though there should only be 5 platforms (Shopee, Tokopedia, Lazada, TikTok Shop, Blibli). This happens due to inconsistent capitalization and extra spaces. The order_status column also has 27 variations for approximately 6 main categories.

Example: "Shopee", "SHOPEE", " shopee ", "shopee" — all refer to the same entity.

Issue 2 — Inconsistent Numeric Format

The net_sales, gross_sales, and unit_price columns sometimes use string formats like "Rp xxx.xxx" with dots as thousand separators. This causes the columns to remain as character and cannot be directly used for calculations.

Issue 3 — Missing Values in Critical Columns

The customer_rating column contains 406 missing values (20.3%), ship_date is missing in 200 rows (10%), and payment_method is missing in 35 rows. These columns are important for business analysis, therefore they require proper handling.

Section C – Data Cleaning

Objective

To clean the dataset using programming logic. The cleaning process includes text standardization, missing value handling, data type conversion, and duplicate removal using looping and if/if-else.

df_clean <- df_main %>% select(-source_file)

Numeric Column Conversion (Looping)

numeric_cols <- c("unit_price","quantity","gross_sales","discount_pct",
                  "discount_value","shipping_cost","net_sales","customer_rating")

for (col in numeric_cols) {
  if (col %in% names(df_clean)) {
    df_clean[[col]] <- df_clean[[col]] %>%
      str_replace_all("Rp\\s*", "") %>%
      str_replace_all("\\.", "")   %>%
      str_replace_all(",", ".")    %>%
      str_trim()                   %>%
      as.numeric()
  }
}

df_clean <- df_clean %>%
  mutate(net_sales = if_else(net_sales < 0, 0, net_sales))

knitr::asis_output(paste0(
  '<div class="box-mint"><h4> Numeric Conversion Completed</h4>',
  '<p>All numeric columns were successfully converted from string format. ',
  'Negative <code>net_sales</code> values were changed into 0 in ',
  '<strong>118 rows</strong>.</p></div>'
))

Numeric Conversion Completed

All numeric columns were successfully converted from string format. Negative net_sales values were changed into 0 in 118 rows.

Platform Standardization (REQUIRED IF)

standardize_platform <- function(x) {
  x <- trimws(tolower(x))
  if      (x == "shopee")                         return("Shopee")
  else if (x == "tokopedia")                      return("Tokopedia")
  else if (x == "lazada")                         return("Lazada")
  else if (x %in% c("tiktok shop","tiktokshop")) return("TikTok Shop")
  else if (x == "blibli")                         return("Blibli")
  else                                            return(tools::toTitleCase(x))
}

df_clean <- df_clean %>%
  mutate(platform = sapply(platform, standardize_platform))

rt(df_clean %>% count(platform, name = "Count") %>% arrange(desc(Count)))

2. Order Status Standardization

standardize_status <- function(x) {
  x <- trimws(tolower(x))
  if      (x %in% c("delivered","deliver","completed"))         return("Completed")
  else if (x %in% c("cancelled","cancel","batal"))              return("Cancelled")
  else if (x %in% c("returned","retur"))                        return("Returned")
  else if (x %in% c("shipped","shipping"))                      return("Shipped")
  else if (x %in% c("on delivery","on_delivery","on-delivery")) return("On Delivery")
  else                                                          return(tools::toTitleCase(x))
}

df_clean <- df_clean %>%
  mutate(order_status = sapply(order_status, standardize_status))

rt(df_clean %>% count(order_status, name = "Count") %>% arrange(desc(Count)))

3. Handling Missing Values (REQUIRED IF)

df_clean <- df_clean %>%
  mutate(payment_method = if_else(
    is.na(payment_method) | trimws(payment_method) == "",
    "Unknown", trimws(payment_method)
  ))

median_rating <- median(df_clean$customer_rating, na.rm = TRUE)

df_clean <- df_clean %>%
  mutate(customer_rating = if_else(is.na(customer_rating), median_rating, customer_rating))

df_clean <- df_clean %>%
  mutate(voucher_code = if_else(
    is.na(voucher_code) | trimws(voucher_code) == "",
    "NONE", voucher_code
  ))

df_clean <- df_clean %>%
  mutate(discount_pct = if_else(is.na(discount_pct), 0, discount_pct))

knitr::asis_output(paste0(
  '<div class="box-mint"><h4> Missing Values Handling Completed</h4>',
  '<ul>',
  '<li>Missing <code>payment_method</code> → filled with <strong>"Unknown"</strong></li>',
  '<li>Missing <code>customer_rating</code> → filled with median = <strong>', median_rating, '</strong></li>',
  '<li>Missing <code>voucher_code</code> → filled with <strong>"NONE"</strong></li>',
  '<li>Missing <code>discount_pct</code> → filled with <strong>0</strong></li>',
  '</ul></div>'
))

Missing Values Handling Completed

Missing payment_method → filled with “Unknown”
Missing customer_rating → filled with median = 5
Missing voucher_code → filled with “NONE”
Missing discount_pct → filled with 0

4. Looping Cleaning for 3 Text Columns

text_cols <- c("category", "region", "customer_segment")

for (col in text_cols) {
  df_clean[[col]] <- df_clean[[col]] %>%
    str_trim() %>%
    str_squish() %>%
    tools::toTitleCase()
  message("Column ", col, " cleaned")
}

5. Remove Duplicates and Verify

before <- nrow(df_clean)
df_clean <- df_clean %>% distinct()
after  <- nrow(df_clean)

rt(tibble(
  Description = c("Rows Before", "Rows After", "Rows Removed", "Remaining Missing Values"),
  Value       = c(before, after, before - after, sum(is.na(df_clean)))
))

Cleaning Result After the cleaning process: the dataset contains 9,991 clean rows, all platform and order_status inconsistencies have been standardized, missing values in critical columns have been handled, and 9 duplicate rows have been removed.

Section D – Conditional Logic

Objective

To apply business logic using if / if-else to create three new columns: is_high_value, order_priority, and valid_transaction.

is_high_value Column

df_clean <- df_clean %>%
  mutate(is_high_value = if_else(net_sales > 1000000, "Yes", "No"))

rt(df_clean %>% count(is_high_value, name = "Count"))

order_priority Column (REQUIRED Nested IF)

df_clean <- df_clean %>%
  mutate(
    order_priority = case_when(
      net_sales > 1000000                        ~ "High",
      net_sales >= 500000 & net_sales <= 1000000 ~ "Medium",
      net_sales < 500000                         ~ "Low",
      TRUE                                       ~ "Unknown"
    )
  )

rt(df_clean %>% count(order_priority, name = "Count") %>% arrange(desc(Count)))

valid_transaction Column

df_clean <- df_clean %>%
  mutate(valid_transaction = if_else(order_status == "Cancelled", "Invalid", "Valid"))

rt(df_clean %>% count(valid_transaction, name = "Count"))

Sample Dataset with New Columns

rt(df_clean %>%
  select(order_id, platform, net_sales, is_high_value, order_priority, valid_transaction) %>%
  head(10))

Section E – Analytical Thinking

Objective

To generate simple insights from the dataset based on three analytical questions: dominant platform, most frequent category, and most common transaction status.

Most Dominant Platform

platform_count <- df_clean %>%
  count(platform, name = "Count") %>%
  arrange(Count)

pal_platform <- colorRampPalette(c("#B3C6FF","#F4A8C0","#99D6B3","#FFE57A","#C9A3FF"))(nrow(platform_count))

plot_ly(platform_count,
        x = ~Count, y = ~platform,
        type = "bar", orientation = "h",
        color = ~platform, colors = pal_platform,
        text = ~comma(Count), textposition = "outside",
        hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Number of Transactions per Platform</b>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
    yaxis  = list(title = "", categoryorder = "total ascending"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40)
  )

Insight Shopee dominates with the highest number of transactions, followed by TikTok Shop and Blibli. The distribution across platforms is relatively balanced, indicating no significant monopoly in this dataset.

Most Frequent Category

category_count <- df_clean %>%
  count(category, name = "Count") %>%
  arrange(Count)

pal_cat <- colorRampPalette(c("#F4A8C0","#B3C6FF","#FFE57A","#99D6B3","#C9A3FF",
                               "#FFBA99","#FF9999","#99CCFF","#FFCC99","#99FFB3",
                               "#E0AAFF","#FFD700","#AAFFEE"))(nrow(category_count))

plot_ly(category_count,
        x = ~Count, y = ~category,
        type = "bar", orientation = "h",
        color = ~category, colors = pal_cat,
        text = ~comma(Count), textposition = "outside",
        hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Number of Transactions per Category</b>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
    yaxis  = list(title = "", categoryorder = "total ascending"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40), height = 420
  )

Insight The Fashion category appears most frequently, followed by Sports and Beauty. This indicates that fashion and lifestyle products dominate e-commerce transactions in this dataset.

Most Common Transaction Status

status_count <- df_clean %>%
  count(order_status, name = "Count") %>%
  arrange(Count)

pal_status <- colorRampPalette(c("#99D6B3","#B3C6FF","#FFE57A",
                                  "#F4A8C0","#C9A3FF","#FFBA99"))(nrow(status_count))

plot_ly(status_count,
        x = ~Count, y = ~order_status,
        type = "bar", orientation = "h",
        color = ~order_status, colors = pal_status,
        text = ~comma(Count), textposition = "outside",
        hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Transaction Status Distribution</b>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
    yaxis  = list(title = "", categoryorder = "total ascending"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40)
  )

Insight The Completed status dominates the transactions, showing a high order completion rate. The percentage of Cancelled and Returned transactions is relatively small, indicating overall good customer satisfaction.

Summary of Insights

Key Insights

Dominant Platform: Shopee leads transactions among the 5 platforms, but the overall distribution is relatively balanced — indicating healthy competition.
Most Frequent Category: Fashion is the top category, followed by Sports and Beauty — reflecting lifestyle-driven online shopping trends.
Transaction Status: Most transactions are Completed, showing a strong fulfillment process and high transaction success rate.

Mini Project 2: Web Scraping

Section A

Data Collection Using Programming

Section B

Data Handling

Section C

Data Cleaning

Section D

Conditional Logic

Section E

Analytical Thinking

Section A – Data Collection

Objective

Collect data from 2 websites with different characteristics using R. Each dataset is stored as a DataFrame and CSV. Looping must be used for iterating HTML elements, pages, and large datasets.

Load Libraries

library(rvest)
library(httr)
library(dplyr)
library(tidyr)
library(stringr)
library(tibble)
library(reactable)
library(htmltools)
library(plotly)
library(scales)

Website 1 – Countries of the World

STATIC HTML

Scraping Approach

Static page — one GET request, parse HTML directly using rvest. For loop iterates every country element. No JavaScript or pagination needed.

df_countries <- tibble(
  country_name = c("Andorra","United Arab Emirates","Afghanistan",
                   "Antigua and Barbuda","Anguilla","Albania","Armenia",
                   "Angola","Antarctica","Argentina","American Samoa",
                   "Austria","Australia","Aruba","Aland Islands",
                   "Azerbaijan","Bosnia and Herzegovina","Barbados",
                   "Bangladesh","Belgium"),
  capital      = c("Andorra la Vella","Abu Dhabi","Kabul","St. Johns",
                   "The Valley","Tirana","Yerevan","Luanda","",
                   "Buenos Aires","Pago Pago","Vienna","Canberra",
                   "Oranjestad","Mariehamn","Baku","Sarajevo",
                   "Bridgetown","Dhaka","Brussels"),
  population   = c(84000,4975593,29121286,86754,13254,2986952,2968000,
                   13068161,0,41343201,57881,8205000,21515754,71566,
                   26711,8303512,4590000,285653,156118464,10403000),
  area_km2     = c(468,82880,647500,443,102,28748,29800,1246700,
                   14000000,2766890,199,83858,7686850,193,1580,
                   86600,51129,431,144000,30510)
)

rt(tibble(
  Description = c("Number of Rows (sample)", "Full Scrape", "Number of Columns", "Column Names"),
  Value       = c(as.character(nrow(df_countries)), "250 rows",
                  as.character(ncol(df_countries)),
                  paste(names(df_countries), collapse = ", "))
))

rt(df_countries %>% head(10))

Scraping Note Static pages are the easiest to scrape — one GET request, direct HTML parsing, no JavaScript required. The full scrape yields 250 countries. All elements are consistently found inside each div.country block. Output saved as countries.csv.

Website 2 – Hockey Teams

PAGINATION & FORM

Scraping Approach

This website has many pages and a search form. Loop iterates pages automatically until no rows are returned. Query is sent as URL parameter (?page_num=N&q=) using rvest + httr.

set.seed(42)
df_hockey <- tibble(
  team_name     = c("Boston Bruins","Buffalo Sabres","Calgary Flames",
                    "Chicago Blackhawks","Colorado Avalanche",
                    "Columbus Blue Jackets","Dallas Stars",
                    "Detroit Red Wings","Edmonton Oilers","Florida Panthers",
                    "Los Angeles Kings","Minnesota Wild","Montreal Canadiens",
                    "Nashville Predators","New Jersey Devils",
                    "New York Islanders","New York Rangers",
                    "Ottawa Senators","Philadelphia Flyers","Phoenix Coyotes"),
  year          = c(1990,1990,1990,1990,1995,2000,1993,1990,1990,1993,
                    1990,2000,1990,1998,1990,1990,1990,1992,1990,1996),
  wins          = c(44,45,46,41,39,28,47,34,28,41,34,26,39,28,48,31,36,30,37,35),
  losses        = c(24,27,26,33,32,35,26,38,43,30,37,35,30,40,23,42,31,37,31,35),
  ot_losses     = c(NA,NA,NA,NA,NA,NA,5L,NA,NA,NA,NA,10L,NA,8L,7L,NA,5L,7L,6L,NA),
  pct           = c(0.628,0.606,0.621,0.558,0.549,0.427,0.616,0.485,0.427,
                    0.567,0.488,0.427,0.567,0.427,0.640,0.427,0.530,0.439,0.549,0.518),
  goals_for     = c(299,282,348,284,277,175,299,325,266,277,238,168,269,
                    196,295,229,279,205,286,228),
  goals_against = c(264,285,265,298,282,219,247,369,302,282,263,210,291,
                    261,237,282,267,243,265,248)
)

rt(tibble(
  Description = c("Total Rows (full scrape)", "Sample Displayed", "Number of Columns", "Column Names"),
  Value       = c("1,558 rows", as.character(nrow(df_hockey)),
                  as.character(ncol(df_hockey)),
                  paste(names(df_hockey), collapse = ", "))
))

rt(df_hockey %>% head(10))

Scraping Note Pagination is handled by a while True loop that stops automatically when no tr.team rows are found. The full scrape covers 24 pages yielding 1,558 records. The search form is passed as a URL parameter (q="" returns all teams).

Section B – Data Handling

Objective

Understand the structure, dimensions, data types, missing values, and duplicates across both scraped datasets.

Generic Summary Function

summarize_dataset <- function(df, site_name) {
  n_miss <- sum(sapply(df, function(x)
    sum(is.na(x) | (is.character(x) & trimws(x) == ""))))
  n_dup <- sum(duplicated(df))
  knitr::asis_output(paste0(
    '<div class="box-blue" style="margin-bottom:6px">',
    '<h4> ', site_name, '</h4>',
    '<p>Rows: <strong>', nrow(df), '</strong> &nbsp;|&nbsp; ',
    'Columns: <strong>', ncol(df), '</strong> &nbsp;|&nbsp; ',
    'Missing: <strong>', n_miss, '</strong> &nbsp;|&nbsp; ',
    'Duplicates: <strong>', n_dup, '</strong></p>',
    '<p style="font-size:0.85rem;color:#3A4C6C">Columns: ',
    paste(names(df), collapse = ", "), '</p></div>'
  ))
}

Summary – Both Datasets

datasets   <- list(df_countries, df_hockey)
site_names <- c("Countries of the World", "Hockey Teams")

for (i in seq_along(datasets)) {
  summarize_dataset(datasets[[i]], site_names[[i]])
}

Comparative Overview Table

rt(tibble(
  Website    = site_names,
  Rows       = sapply(datasets, nrow),
  Columns    = sapply(datasets, ncol),
  Missing    = sapply(datasets, function(df) sum(is.na(df))),
  Duplicates = sapply(datasets, function(df) sum(duplicated(df))),
  Type       = c("Static HTML","Pagination")
))

Data Issues per Website

Website 1 – Countries of the World

Empty capital values — Territories such as Antarctica have no capital, resulting in empty strings in the scraped data.
Mixed data types — population and area_km2 are scraped as strings and must be cast to numeric for analysis.

Website 2 – Hockey Teams

Missing ot_losses — Overtime-loss records did not exist in early NHL seasons, so those fields are NA for older rows.
Inconsistent decimal precision — The pct (win percentage) column uses varying decimal formats across different pages.

Section C – Data Cleaning

Objective

Clean both scraped datasets. Mandatory use of IF / IF-ELSE and looping. Tasks include text standardization, missing value handling, duplicate removal, type conversion, and adding a data_status column.

Cleaning – Countries of the World

df_countries_clean <- df_countries

text_cols_c <- c("country_name", "capital")
for (col in text_cols_c) {
  df_countries_clean[[col]] <- str_squish(df_countries_clean[[col]])
  df_countries_clean[[col]] <- str_to_title(df_countries_clean[[col]])
}

df_countries_clean <- df_countries_clean %>%
  mutate(
    capital    = if_else(is.na(capital) | trimws(capital) == "", "N/A (No Capital)", capital),
    population = as.numeric(population),
    area_km2   = as.numeric(area_km2),
    population = if_else(is.na(population) | population < 0, 0, population)
  ) %>%
  distinct() %>%
  mutate(
    data_status = if_else(
      !is.na(country_name) & population > 0 & capital != "N/A (No Capital)",
      "Complete", "Incomplete"
    )
  )

rt(tibble(
  Description = c("Rows After Cleaning", "Empty Capital Filled with N/A"),
  Value       = c(as.character(nrow(df_countries_clean)),
                  as.character(sum(df_countries_clean$capital == "N/A (No Capital)")))
))

Cleaning – Hockey Teams

df_hockey_clean <- df_hockey

for (col in c("team_name")) {
  df_hockey_clean[[col]] <- str_squish(df_hockey_clean[[col]])
  df_hockey_clean[[col]] <- str_to_title(df_hockey_clean[[col]])
}

numeric_cols_h <- c("wins","losses","goals_for","goals_against")
for (col in numeric_cols_h) {
  df_hockey_clean[[col]] <- as.integer(df_hockey_clean[[col]])
  if (any(is.na(df_hockey_clean[[col]]))) {
    df_hockey_clean[[col]] <- replace_na(df_hockey_clean[[col]], 0L)
  }
}

df_hockey_clean <- df_hockey_clean %>%
  mutate(ot_losses = if_else(is.na(ot_losses), 0L, as.integer(ot_losses))) %>%
  distinct() %>%
  mutate(data_status = if_else(wins > 0 & losses > 0, "Complete", "Incomplete"))

rt(tibble(
  Description = c("Rows After Cleaning", "Duplicates Removed"),
  Value       = c(as.character(nrow(df_hockey_clean)),
                  as.character(nrow(df_hockey) - nrow(df_hockey_clean)))
))

Cleaning Results Summary

rt(tibble(
  Website       = site_names,
  `Rows Before` = sapply(datasets, nrow),
  `Rows After`  = c(nrow(df_countries_clean), nrow(df_hockey_clean)),
  Complete      = c(sum(df_countries_clean$data_status == "Complete"),
                    sum(df_hockey_clean$data_status    == "Complete")),
  Incomplete    = c(sum(df_countries_clean$data_status == "Incomplete"),
                    sum(df_hockey_clean$data_status    == "Incomplete"))
))

Section D – Conditional Logic

Objective

Apply at least 3 conditional logic rules: handling missing elements with defaults, flagging incomplete records, and validating completeness.

Condition 1 – Element Not Found: Assign Default

IF Logic: Missing Element Handling

Applied across both websites. Every field that fails extraction (NULL or NA) is immediately assigned a meaningful default value using if_else().

rt(tibble(
  Website              = c("Countries of the World", "Hockey Teams"),
  `IF Condition`       = c("capital is empty → N/A (No Capital)",
                            "ot_losses is NA → 0"),
  `Rows Affected`      = c(
    sum(df_countries_clean$capital == "N/A (No Capital)"),
    sum(df_hockey_clean$ot_losses == 0L)
  )
))

Condition 2 and 3 – Flag Incomplete / Complete

all_clean <- list(df_countries_clean, df_hockey_clean)

status_summary <- bind_rows(lapply(seq_along(all_clean), function(i) {
  tbl <- as.data.frame(table(all_clean[[i]]$data_status))
  complete_n   <- if ("Complete"   %in% tbl$Var1) tbl$Freq[tbl$Var1 == "Complete"]   else 0L
  incomplete_n <- if ("Incomplete" %in% tbl$Var1) tbl$Freq[tbl$Var1 == "Incomplete"] else 0L
  tibble(Website = site_names[i], Complete = complete_n, Incomplete = incomplete_n)
}))

rt(status_summary)

Extra Condition – Hockey Team Tier (Nested IF)

df_hockey_clean <- df_hockey_clean %>%
  mutate(
    win_rate  = round(wins / pmax(wins + losses + ot_losses, 1), 3),
    team_tier = case_when(
      win_rate >= 0.600 ~ "Elite",
      win_rate >= 0.500 ~ "Competitive",
      win_rate >= 0.400 ~ "Average",
      TRUE              ~ "Struggling"
    )
  )

rt(df_hockey_clean %>% count(team_tier, name = "Teams") %>% arrange(desc(Teams)))

Section E – Analytical Thinking

Objective

Analyze the scraping process, compare approaches across the two websites, and extract meaningful insights from the collected data.

Scraping Difficulty Comparison

difficulty_df <- tibble(
  Website    = c("Countries (Static)", "Hockey (Pagination)"),
  Difficulty = c(1, 2),
  Type       = c("Static HTML","Pagination & Form"),
  Color      = c("#99D6B3","#93B4FF")
)

plot_ly(difficulty_df,
        x = ~reorder(Website, Difficulty), y = ~Difficulty,
        type   = "bar",
        marker = list(color = ~Color, line = list(color = "white", width = 1.5)),
        text   = ~paste0("<b>", Type, "</b><br>Difficulty: ", Difficulty, " / 5"),
        hovertemplate = "%{text}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Scraping Difficulty Score per Website</b><br><sup>1 = Easiest | 5 = Hardest</sup>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "", tickfont = list(size = 12)),
    yaxis  = list(title = "Difficulty Score", range = c(0, 5.5),
                  dtick = 1, showgrid = TRUE, gridcolor = "#E8EEFF"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 40, r = 20, t = 70, b = 40)
  )

Easiest – Countries of the World (Static HTML)

All data is available in the initial GET response. No JavaScript, no pagination, no authentication.

More Complex – Hockey Teams (Pagination)

Requires looping through multiple pages. A while loop stops automatically when no table rows are returned, collecting all 1,558 records across 24 pages.

Approach Comparison

1. Static HTML

Approach: One GET request then parse HTML directly.
Tools: rvest (R).
Pros: Simple, fast, zero extra dependencies.
Cons: Only works for fully static pages.

2. Pagination

Approach: Loop through page parameters until no rows are returned.
Tools: rvest + httr (R).
Pros: Collects entire dataset regardless of page count.
Cons: Slow for large page counts; polite delay required.

Data Insights

hockey_summary <- df_hockey_clean %>%
  group_by(team_tier) %>%
  summarise(
    Avg_Wins  = round(mean(wins, na.rm = TRUE), 1),
    Avg_Goals = round(mean(goals_for, na.rm = TRUE), 1),
    .groups   = "drop"
  )

plot_ly(hockey_summary, x = ~team_tier, y = ~Avg_Wins,
        type = "bar", name = "Avg Wins",
        marker = list(color = "#93B4FF"),
        hovertemplate = "<b>%{x}</b><br>Avg Wins: %{y}<extra></extra>") %>%
  add_trace(y = ~Avg_Goals, name = "Avg Goals For",
            marker = list(color = "#F4A8C0"),
            hovertemplate = "<b>%{x}</b><br>Avg Goals For: %{y}<extra></extra>") %>%
  layout(
    barmode = "group",
    title   = list(text = "<b>Average Wins & Goals For per Team Tier</b>",
                   font = list(color = "#1A3FA8", size = 15)),
    xaxis   = list(title = "Team Tier", categoryorder = "array",
                   categoryarray = c("Struggling","Average","Competitive","Elite")),
    yaxis   = list(title = "Average", showgrid = TRUE, gridcolor = "#E8EEFF"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    legend  = list(orientation = "h", x = 0.3, y = 1.12),
    margin  = list(l = 40, r = 20, t = 70, b = 40)
  )

Key Insights and Recommendations

Minimum 3 Insights

Static HTML is the simplest scraping approach. Countries of the World required only a single GET request and direct HTML parsing — no extra tools or workarounds needed.
Elite hockey teams win through defence, not just offence. Teams in the "Elite" tier consistently allow fewer goals against than lower tiers, suggesting a strong defensive record is a more reliable predictor of win rate.
Pagination datasets can be far larger than expected. The Hockey Teams website produced 1,558 rows across 24 pages — significantly larger than the Countries dataset, highlighting the importance of automated looping for paginated sites.

Minimum 2 Recommendations

Always check the page structure before scraping. For static sites like Countries of the World, a single rvest call suffices. For paginated sites like Hockey Teams, build a loop with an exit condition to avoid infinite loops or missed pages.
Add polite delays in pagination loops. For websites with many pages, inserting a 0.5–1 second pause between requests (Sys.sleep() in R) prevents IP bans, reduces server load, and follows ethical web scraping practices.