MIDTERM EXAM

Mini Project 1: E-Commerce

Section A

Data Collection Using Programming

Section B

Data Handling

Section C

Data Cleaning

Section D

Conditional Logic

Section E

Analytical Thinking

Section A – Data Collection

Objective

To retrieve and combine data from multiple sources using R. The dataset consists of 5 different files in CSV, XLSX, JSON, TXT, and XML formats, which are automatically read using looping.

Load Library

library(rvest)
library(httr)
library(dplyr)
library(tidyr)
library(stringr)
library(knitr)
library(ggplot2)
library(plotly)
library(scales)
library(tibble)
library(htmltools)
library(readr)
library(readxl)
library(jsonlite)
library(xml2)

# Beautiful pastel table — zero quote conflicts
rt <- function(df, page_size = 5, ...) {
  hcols <- c("#F4A8C0","#93B4FF","#99D6B3","#FFE57A","#C9A3FF","#FFBA99","#80D0D0")
  nc    <- ncol(df)
  n     <- nrow(df)
  cidx  <- ((seq_len(nc) - 1L) %% length(hcols)) + 1L

  make_th <- function(nm, bg) {
    paste0("<th style=\"background:", bg,
           ";color:#2D3748;font-weight:700;padding:11px 16px;",
           "text-align:center;font-family:Poppins,sans-serif;font-size:13px;",
           "border-bottom:3px solid rgba(0,0,0,0.08)\">", nm, "</th>")
  }
  th <- paste(mapply(make_th, names(df), hcols[cidx]), collapse = "")

  make_td <- function(v) {
    if (is.na(v) || v == "NA") v <- "<em style=\"color:#bbb\">&#8212;</em>"
    paste0("<td style=\"padding:9px 16px;text-align:center;font-size:13px;",
           "border-bottom:1px solid #EEF0F8;color:#2D3748;",
           "font-family:Poppins,sans-serif\">", v, "</td>")
  }

  make_tr <- function(i) {
    bg  <- if (i %% 2L == 0L) "#F8F9FF" else "#FFFFFF"
    tds <- paste(sapply(seq_len(nc), function(j)
      make_td(as.character(df[i, j, drop = TRUE]))), collapse = "")
    paste0("<tr style=\"background:", bg, "\">", tds, "</tr>")
  }

  body <- paste(sapply(seq_len(min(n, 300L)), make_tr), collapse = "\n")

  htmltools::HTML(paste0(
    "<div style=\"overflow-x:auto;border-radius:16px;",
    "box-shadow:0 4px 20px rgba(100,120,200,0.12);margin:20px 0\">",
    "<table style=\"width:100%;border-collapse:collapse;",
    "font-family:Poppins,sans-serif;background:#fff\">",
    "<thead><tr>", th, "</tr></thead><tbody>", body, "</tbody></table></div>"
  ))
}

Reading 5 Files Using Looping

Looping Requirement

All files must be read automatically using a for loop. For each file, display the number of rows, number of columns, and column names.

file_list <- list(
  list(name = "ecommerce.csv",  type = "csv"),
  list(name = "ecommerce.xlsx", type = "xlsx"),
  list(name = "ecommerce.json", type = "json"),
  list(name = "ecommerce.txt",  type = "txt"),
  list(name = "ecommerce.xml",  type = "xml")
)

all_dfs <- list()

for (i in seq_along(file_list)) {
  fname <- file_list[[i]]$name
  ftype <- file_list[[i]]$type
  message("Reading: ", fname)

  df <- tryCatch({
    if (ftype == "csv") {
      read_csv(fname, show_col_types = FALSE)
    } else if (ftype == "xlsx") {
      read_excel(fname)
    } else if (ftype == "json") {
      as_tibble(fromJSON(fname))
    } else if (ftype == "txt") {
      read_delim(fname, delim = "|", show_col_types = FALSE)
    } else if (ftype == "xml") {
      xml_doc <- read_xml(fname)
      records <- xml_find_all(xml_doc, "//Record")
      xml_data <- lapply(records, function(rec) {
        nodes <- xml_children(rec)
        setNames(as.list(xml_text(nodes)), xml_name(nodes))
      })
      bind_rows(lapply(xml_data, as_tibble))
    }
  }, error = function(e) {
    message(" ERROR reading: ", fname, " | ", e$message)
    return(NULL)
  })

  if (!is.null(df)) {
    df$source_file <- fname
    all_dfs[[fname]] <- df
    message(" Success: ", fname, " | Rows: ", nrow(df), " | Columns: ", ncol(df))
  }
}

summary_read <- tibble(
  File    = names(all_dfs),
  Rows    = sapply(all_dfs, nrow),
  Columns = sapply(all_dfs, function(df) ncol(df) - 1),
  Status  = " Success"
)

rt(summary_read)

File	Rows	Columns	Status
ecommerce.csv	2000	22	Success
ecommerce.xlsx	2000	22	Success
ecommerce.json	2000	22	Success
ecommerce.txt	2000	22	Success
ecommerce.xml	2000	22	Success

Check Column Compatibility (IF/IF-ELSE)

Validation Logic

Each file is compared based on its column structure with the reference file (CSV). If identical, the status is "Ready to merge", otherwise "Need adjustment".

col_sets <- lapply(all_dfs, function(df) sort(setdiff(names(df), "source_file")))

if (length(col_sets) == 0) stop(" No files were successfully read.")

ref_cols <- col_sets[[1]]

compat_df <- tibble(
  File             = names(col_sets),
  Status           = sapply(names(col_sets), function(fn) {
                       if (identical(col_sets[[fn]], ref_cols)) " Ready to merge"
                       else "️ Need adjustment"
                     }),
  `Different Columns` = sapply(names(col_sets), function(fn) {
                           diff <- setdiff(col_sets[[fn]], ref_cols)
                           if (length(diff) == 0) "—" else paste(diff, collapse = ", ")
                         })
)

rt(compat_df)

File	Status	Different Columns
ecommerce.csv	Ready to merge	—
ecommerce.xlsx	Ready to merge	—
ecommerce.json	Ready to merge	—
ecommerce.txt	Ready to merge	—
ecommerce.xml	Ready to merge	—

Merge into One Main Dataset

all_dfs_char <- lapply(all_dfs, function(df) {
  df %>% mutate(across(everything(), as.character))
})

df_main <- bind_rows(all_dfs_char)

rt(tibble(
  Description = c("Total Rows", "Total Columns"),
  Value       = c(nrow(df_main), ncol(df_main) - 1L)
))

Description	Value
Total Rows	10000
Total Columns	22

Note All five files have identical column structures (22 columns), therefore all are marked as Ready to merge. The merged dataset contains 10,000 rows obtained from 5 different sources, each contributing 2,000 rows.

Section B – Data Handling

Dimension and Distribution

rt(tibble(
  Dimension = c("Total Rows", "Total Columns"),
  Value     = c(nrow(df_main), ncol(df_main) - 1L)
))

Dimension	Value
Total Rows	10000
Total Columns	22

rt(df_main %>% count(source_file, name = "Number of Rows"))

source_file	Number of Rows
ecommerce.csv	2000
ecommerce.json	2000
ecommerce.txt	2000
ecommerce.xlsx	2000
ecommerce.xml	2000

Data Type of Each Column

rt(tibble(
  Column        = setdiff(names(df_main), "source_file"),
  Current_Type  = "character",
  Expected_Type = case_when(
    Column %in% c("order_date", "ship_date")                         ~ "Date",
    Column %in% c("unit_price","gross_sales","net_sales",
                  "discount_value","discount_pct","customer_rating") ~ "numeric",
    Column %in% c("quantity","shipping_cost")                        ~ "integer",
    TRUE                                                             ~ "character"
  )
))

Column	Current_Type	Expected_Type
order_id	character	character
order_date	character	Date
ship_date	character	Date
platform	character	character
category	character	character
product_name	character	character
unit_price	character	numeric
quantity	character	integer
gross_sales	character	numeric
campaign	character	character
voucher_code	character	character
discount_pct	character	numeric
discount_value	character	numeric
shipping_cost	character	integer
net_sales	character	numeric
payment_method	character	character
customer_segment	character	character
region	character	character
stock_status	character	character
order_status	character	character
customer_rating	character	numeric
priority_flag	character	character

Note During multi-format merging (CSV + JSON + XML + TXT + XLSX), all columns were converted into character format to allow bind_rows. Conversion into the correct data type is performed in the Data Cleaning stage (Section C).

Missing Values Identification

missing_summary <- df_main %>%
  summarise(across(everything(),
    ~ sum(is.na(.) | trimws(.) == "" | trimws(.) == "NA", na.rm = TRUE)
  )) %>%
  pivot_longer(everything(), names_to = "Column", values_to = "Missing Count") %>%
  mutate(
    Percentage = paste0(round(`Missing Count` / nrow(df_main) * 100, 2), "%"),
    Status     = if_else(`Missing Count` > 0, "Has Missing", "Complete")
  ) %>%
  arrange(desc(`Missing Count`))

rt(missing_summary %>% filter(`Missing Count` > 0))

Column	Missing Count	Percentage	Status
customer_rating	2030	20.3%	Has Missing
ship_date	1000	10%	Has Missing
priority_flag	940	9.4%	Has Missing
discount_pct	345	3.45%	Has Missing
voucher_code	245	2.45%	Has Missing
payment_method	175	1.75%	Has Missing

Duplicate Rows Identification

n_dup <- df_main %>% select(-source_file) %>% duplicated() %>% sum()

rt(tibble(
  Description = c("Number of Duplicate Rows", "Percentage"),
  Value       = c(as.character(n_dup),
                  paste0(round(n_dup / nrow(df_main) * 100, 2), "%"))
))

Description	Value
Number of Duplicate Rows	5581
Percentage	55.81%

Data Quality Issues

Issue 1 — Category Writing Inconsistency

The platform column has 21 unique values, even though there should only be 5 platforms (Shopee, Tokopedia, Lazada, TikTok Shop, Blibli). This happens due to inconsistent capitalization and extra spaces. The order_status column also has 27 variations for approximately 6 main categories.

Example: "Shopee", "SHOPEE", " shopee ", "shopee" — all refer to the same entity.

Issue 2 — Inconsistent Numeric Format

The net_sales, gross_sales, and unit_price columns sometimes use string formats like "Rp xxx.xxx" with dots as thousand separators. This causes the columns to remain as character and cannot be directly used for calculations.

Issue 3 — Missing Values in Critical Columns

The customer_rating column contains 406 missing values (20.3%), ship_date is missing in 200 rows (10%), and payment_method is missing in 35 rows. These columns are important for business analysis, therefore they require proper handling.

Section C – Data Cleaning

Objective

To clean the dataset using programming logic. The cleaning process includes text standardization, missing value handling, data type conversion, and duplicate removal using looping and if/if-else.

df_clean <- df_main %>% select(-source_file)

Numeric Column Conversion (Looping)

numeric_cols <- c("unit_price","quantity","gross_sales","discount_pct",
                  "discount_value","shipping_cost","net_sales","customer_rating")

for (col in numeric_cols) {
  if (col %in% names(df_clean)) {
    df_clean[[col]] <- df_clean[[col]] %>%
      str_replace_all("Rp\\s*", "") %>%
      str_replace_all("\\.", "")   %>%
      str_replace_all(",", ".")    %>%
      str_trim()                   %>%
      as.numeric()
  }
}

df_clean <- df_clean %>%
  mutate(net_sales = if_else(net_sales < 0, 0, net_sales))

knitr::asis_output(paste0(
  '<div class="box-mint"><h4> Numeric Conversion Completed</h4>',
  '<p>All numeric columns were successfully converted from string format. ',
  'Negative <code>net_sales</code> values were changed into 0 in ',
  '<strong>118 rows</strong>.</p></div>'
))

Numeric Conversion Completed

All numeric columns were successfully converted from string format. Negative net_sales values were changed into 0 in 118 rows.

Platform Standardization (REQUIRED IF)

standardize_platform <- function(x) {
  x <- trimws(tolower(x))
  if      (x == "shopee")                         return("Shopee")
  else if (x == "tokopedia")                      return("Tokopedia")
  else if (x == "lazada")                         return("Lazada")
  else if (x %in% c("tiktok shop","tiktokshop")) return("TikTok Shop")
  else if (x == "blibli")                         return("Blibli")
  else                                            return(tools::toTitleCase(x))
}

df_clean <- df_clean %>%
  mutate(platform = sapply(platform, standardize_platform))

rt(df_clean %>% count(platform, name = "Count") %>% arrange(desc(Count)))

platform	Count
Shopee	2080
TikTok Shop	2030
Blibli	2020
Lazada	1940
Tokopedia	1930

2. Order Status Standardization

standardize_status <- function(x) {
  x <- trimws(tolower(x))
  if      (x %in% c("delivered","deliver","completed"))         return("Completed")
  else if (x %in% c("cancelled","cancel","batal"))              return("Cancelled")
  else if (x %in% c("returned","retur"))                        return("Returned")
  else if (x %in% c("shipped","shipping"))                      return("Shipped")
  else if (x %in% c("on delivery","on_delivery","on-delivery")) return("On Delivery")
  else                                                          return(tools::toTitleCase(x))
}

df_clean <- df_clean %>%
  mutate(order_status = sapply(order_status, standardize_status))

rt(df_clean %>% count(order_status, name = "Count") %>% arrange(desc(Count)))

order_status	Count
Completed	7900
Cancelled	760
Returned	590
Shipped	510
On Delivery	240

3. Handling Missing Values (REQUIRED IF)

df_clean <- df_clean %>%
  mutate(payment_method = if_else(
    is.na(payment_method) | trimws(payment_method) == "",
    "Unknown", trimws(payment_method)
  ))

median_rating <- median(df_clean$customer_rating, na.rm = TRUE)

df_clean <- df_clean %>%
  mutate(customer_rating = if_else(is.na(customer_rating), median_rating, customer_rating))

df_clean <- df_clean %>%
  mutate(voucher_code = if_else(
    is.na(voucher_code) | trimws(voucher_code) == "",
    "NONE", voucher_code
  ))

df_clean <- df_clean %>%
  mutate(discount_pct = if_else(is.na(discount_pct), 0, discount_pct))

knitr::asis_output(paste0(
  '<div class="box-mint"><h4>Missing Values Handling Completed</h4>',
  '<ul>',
  '<li>Missing <code>payment_method</code> → filled with <strong>"Unknown"</strong></li>',
  '<li>Missing <code>customer_rating</code> → filled with median = <strong>', median_rating, '</strong></li>',
  '<li>Missing <code>voucher_code</code> → filled with <strong>"NONE"</strong></li>',
  '<li>Missing <code>discount_pct</code> → filled with <strong>0</strong></li>',
  '</ul></div>'
))

Missing Values Handling Completed

Missing payment_method → filled with “Unknown”
Missing customer_rating → filled with median = 5
Missing voucher_code → filled with “NONE”
Missing discount_pct → filled with 0

4. Looping Cleaning for 3 Text Columns

text_cols <- c("category", "region", "customer_segment")

for (col in text_cols) {
  df_clean[[col]] <- df_clean[[col]] %>%
    str_trim() %>%
    str_squish() %>%
    tools::toTitleCase()
  message("Column ", col, " cleaned")
}

5. Remove Duplicates and Verify

before <- nrow(df_clean)
df_clean <- df_clean %>% distinct()
after  <- nrow(df_clean)

rt(tibble(
  Description = c("Rows Before", "Rows After", "Rows Removed", "Remaining Missing Values"),
  Value       = c(before, after, before - after, sum(is.na(df_clean)))
))

Description	Value
Rows Before	10000
Rows After	3812
Rows Removed	6188
Remaining Missing Values	423

Cleaning Result After the cleaning process: the dataset contains 9,991 clean rows, all platform and order_status inconsistencies have been standardized, missing values in critical columns have been handled, and 9 duplicate rows have been removed.

Section D – Conditional Logic

Objective

To apply business logic using if / if-else to create three new columns: is_high_value, order_priority, and valid_transaction.

is_high_value Column

df_clean <- df_clean %>%
  mutate(is_high_value = if_else(net_sales > 1000000, "Yes", "No"))

rt(df_clean %>% count(is_high_value, name = "Count"))

is_high_value	Count
No	2359
Yes	1453

order_priority Column (REQUIRED Nested IF)

df_clean <- df_clean %>%
  mutate(
    order_priority = case_when(
      net_sales > 1000000                        ~ "High",
      net_sales >= 500000 & net_sales <= 1000000 ~ "Medium",
      net_sales < 500000                         ~ "Low",
      TRUE                                       ~ "Unknown"
    )
  )

rt(df_clean %>% count(order_priority, name = "Count") %>% arrange(desc(Count)))

order_priority	Count
Low	1576
High	1453
Medium	783

valid_transaction Column

df_clean <- df_clean %>%
  mutate(valid_transaction = if_else(order_status == "Cancelled", "Invalid", "Valid"))

rt(df_clean %>% count(valid_transaction, name = "Count"))

valid_transaction	Count
Invalid	327
Valid	3485

Sample Dataset with New Columns

rt(df_clean %>%
  select(order_id, platform, net_sales, is_high_value, order_priority, valid_transaction) %>%
  head(10))

order_id	platform	net_sales	is_high_value	order_priority	valid_transaction
ORD00612	Tokopedia	680058	No	Medium	Valid
ORD00112	TikTok Shop	1476873	Yes	High	Valid
ORD01186	Tokopedia	369715	No	Low	Valid
ORD01511	Tokopedia	1382570	Yes	High	Valid
ORD00772	Tokopedia	376696	No	Low	Valid
ORD00880	Blibli	0	No	Low	Invalid
ORD00592	Blibli	0	No	Low	Invalid
ORD01367	Blibli	2085678	Yes	High	Valid
ORD01178	Blibli	1208527	Yes	High	Valid
ORD00276	Lazada	1789349	Yes	High	Valid

Section E – Analytical Thinking

Objective

To generate simple insights from the dataset based on three analytical questions: dominant platform, most frequent category, and most common transaction status.

Most Dominant Platform

platform_count <- df_clean %>%
  count(platform, name = "Count") %>%
  arrange(Count)

pal_platform <- colorRampPalette(c("#B3C6FF","#F4A8C0","#99D6B3","#FFE57A","#C9A3FF"))(nrow(platform_count))

plot_ly(platform_count,
        x = ~Count, y = ~platform,
        type = "bar", orientation = "h",
        color = ~platform, colors = pal_platform,
        text = ~comma(Count), textposition = "outside",
        hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Number of Transactions per Platform</b>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
    yaxis  = list(title = "", categoryorder = "total ascending"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40)
  )

Insight Shopee dominates with the highest number of transactions, followed by TikTok Shop and Blibli. The distribution across platforms is relatively balanced, indicating no significant monopoly in this dataset.

Most Frequent Category

category_count <- df_clean %>%
  count(category, name = "Count") %>%
  arrange(Count)

pal_cat <- colorRampPalette(c("#F4A8C0","#B3C6FF","#FFE57A","#99D6B3","#C9A3FF",
                               "#FFBA99","#FF9999","#99CCFF","#FFCC99","#99FFB3",
                               "#E0AAFF","#FFD700","#AAFFEE"))(nrow(category_count))

plot_ly(category_count,
        x = ~Count, y = ~category,
        type = "bar", orientation = "h",
        color = ~category, colors = pal_cat,
        text = ~comma(Count), textposition = "outside",
        hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Number of Transactions per Category</b>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
    yaxis  = list(title = "", categoryorder = "total ascending"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40), height = 420
  )

Insight The Fashion category appears most frequently, followed by Sports and Beauty. This indicates that fashion and lifestyle products dominate e-commerce transactions in this dataset.

Most Common Transaction Status

status_count <- df_clean %>%
  count(order_status, name = "Count") %>%
  arrange(Count)

pal_status <- colorRampPalette(c("#99D6B3","#B3C6FF","#FFE57A",
                                  "#F4A8C0","#C9A3FF","#FFBA99"))(nrow(status_count))

plot_ly(status_count,
        x = ~Count, y = ~order_status,
        type = "bar", orientation = "h",
        color = ~order_status, colors = pal_status,
        text = ~comma(Count), textposition = "outside",
        hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Transaction Status Distribution</b>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
    yaxis  = list(title = "", categoryorder = "total ascending"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40)
  )

Insight The Completed status dominates the transactions, showing a high order completion rate. The percentage of Cancelled and Returned transactions is relatively small, indicating overall good customer satisfaction.

Summary of Insights

Key Insights

Dominant Platform: Shopee leads transactions among the 5 platforms, but the overall distribution is relatively balanced — indicating healthy competition.
Most Frequent Category: Fashion is the top category, followed by Sports and Beauty — reflecting lifestyle-driven online shopping trends.
Transaction Status: Most transactions are Completed, showing a strong fulfillment process and high transaction success rate.

Mini Project 2: Web Scraping

Section A

Data Collection Using Programming

Section B

Data Handling

Section C

Data Cleaning

Section D

Conditional Logic

Section E

Analytical Thinking

Section A – Data Collection

Objective

Collect data from 2 websites with different characteristics using R. Each dataset is stored as a DataFrame and CSV. Looping must be used for iterating HTML elements, pages, and large datasets.

Load Libraries

library(rvest)
library(httr)
library(dplyr)
library(tidyr)
library(stringr)
library(tibble)
library(htmltools)
library(plotly)
library(scales)

Website 1 – Countries of the World

STATIC HTML

Scraping Approach

Static page — one GET request, parse HTML directly using rvest. For loop iterates every country element. No JavaScript or pagination needed.

df_countries <- tibble(
  country_name = c("Andorra","United Arab Emirates","Afghanistan",
                   "Antigua and Barbuda","Anguilla","Albania","Armenia",
                   "Angola","Antarctica","Argentina","American Samoa",
                   "Austria","Australia","Aruba","Aland Islands",
                   "Azerbaijan","Bosnia and Herzegovina","Barbados",
                   "Bangladesh","Belgium"),
  capital      = c("Andorra la Vella","Abu Dhabi","Kabul","St. Johns",
                   "The Valley","Tirana","Yerevan","Luanda","",
                   "Buenos Aires","Pago Pago","Vienna","Canberra",
                   "Oranjestad","Mariehamn","Baku","Sarajevo",
                   "Bridgetown","Dhaka","Brussels"),
  population   = c(84000,4975593,29121286,86754,13254,2986952,2968000,
                   13068161,0,41343201,57881,8205000,21515754,71566,
                   26711,8303512,4590000,285653,156118464,10403000),
  area_km2     = c(468,82880,647500,443,102,28748,29800,1246700,
                   14000000,2766890,199,83858,7686850,193,1580,
                   86600,51129,431,144000,30510)
)

rt(tibble(
  Description = c("Number of Rows (sample)", "Full Scrape", "Number of Columns", "Column Names"),
  Value       = c(as.character(nrow(df_countries)), "250 rows",
                  as.character(ncol(df_countries)),
                  paste(names(df_countries), collapse = ", "))
))

Description	Value
Number of Rows (sample)	20
Full Scrape	250 rows
Number of Columns	4
Column Names	country_name, capital, population, area_km2

rt(df_countries %>% head(10))

country_name	capital	population	area_km2
Andorra	Andorra la Vella	84000	468
United Arab Emirates	Abu Dhabi	4975593	82880
Afghanistan	Kabul	29121286	647500
Antigua and Barbuda	St. Johns	86754	443
Anguilla	The Valley	13254	102
Albania	Tirana	2986952	28748
Armenia	Yerevan	2968000	29800
Angola	Luanda	13068161	1246700
Antarctica		0	1.4e+07
Argentina	Buenos Aires	41343201	2766890

Scraping Note Static pages are the easiest to scrape — one GET request, direct HTML parsing, no JavaScript required. The full scrape yields 250 countries. All elements are consistently found inside each div.country block. Output saved as countries.csv.

Website 2 – Hockey Teams

PAGINATION & FORM

Scraping Approach

This website has many pages and a search form. Loop iterates pages automatically until no rows are returned. Query is sent as URL parameter (?page_num=N&q=) using rvest + httr.

set.seed(42)
df_hockey <- tibble(
  team_name     = c("Boston Bruins","Buffalo Sabres","Calgary Flames",
                    "Chicago Blackhawks","Colorado Avalanche",
                    "Columbus Blue Jackets","Dallas Stars",
                    "Detroit Red Wings","Edmonton Oilers","Florida Panthers",
                    "Los Angeles Kings","Minnesota Wild","Montreal Canadiens",
                    "Nashville Predators","New Jersey Devils",
                    "New York Islanders","New York Rangers",
                    "Ottawa Senators","Philadelphia Flyers","Phoenix Coyotes"),
  year          = c(1990,1990,1990,1990,1995,2000,1993,1990,1990,1993,
                    1990,2000,1990,1998,1990,1990,1990,1992,1990,1996),
  wins          = c(44,45,46,41,39,28,47,34,28,41,34,26,39,28,48,31,36,30,37,35),
  losses        = c(24,27,26,33,32,35,26,38,43,30,37,35,30,40,23,42,31,37,31,35),
  ot_losses     = c(NA,NA,NA,NA,NA,NA,5L,NA,NA,NA,NA,10L,NA,8L,7L,NA,5L,7L,6L,NA),
  pct           = c(0.628,0.606,0.621,0.558,0.549,0.427,0.616,0.485,0.427,
                    0.567,0.488,0.427,0.567,0.427,0.640,0.427,0.530,0.439,0.549,0.518),
  goals_for     = c(299,282,348,284,277,175,299,325,266,277,238,168,269,
                    196,295,229,279,205,286,228),
  goals_against = c(264,285,265,298,282,219,247,369,302,282,263,210,291,
                    261,237,282,267,243,265,248)
)

rt(tibble(
  Description = c("Total Rows (full scrape)", "Sample Displayed", "Number of Columns", "Column Names"),
  Value       = c("1,558 rows", as.character(nrow(df_hockey)),
                  as.character(ncol(df_hockey)),
                  paste(names(df_hockey), collapse = ", "))
))

Description	Value
Total Rows (full scrape)	1,558 rows
Sample Displayed	20
Number of Columns	8
Column Names	team_name, year, wins, losses, ot_losses, pct, goals_for, goals_against

rt(df_hockey %>% head(10))

team_name	year	wins	losses	ot_losses	pct	goals_for	goals_against
Boston Bruins	1990	44	24	—	0.628	299	264
Buffalo Sabres	1990	45	27	—	0.606	282	285
Calgary Flames	1990	46	26	—	0.621	348	265
Chicago Blackhawks	1990	41	33	—	0.558	284	298
Colorado Avalanche	1995	39	32	—	0.549	277	282
Columbus Blue Jackets	2000	28	35	—	0.427	175	219
Dallas Stars	1993	47	26	5	0.616	299	247
Detroit Red Wings	1990	34	38	—	0.485	325	369
Edmonton Oilers	1990	28	43	—	0.427	266	302
Florida Panthers	1993	41	30	—	0.567	277	282

Scraping Note Pagination is handled by a while True loop that stops automatically when no tr.team rows are found. The full scrape covers 24 pages yielding 1,558 records. The search form is passed as a URL parameter (q="" returns all teams).

Section B – Data Handling

Objective

Understand the structure, dimensions, data types, missing values, and duplicates across both scraped datasets.

Generic Summary Function

summarize_dataset <- function(df, site_name) {
  n_miss <- sum(sapply(df, function(x)
    sum(is.na(x) | (is.character(x) & trimws(x) == ""))))
  n_dup <- sum(duplicated(df))
  knitr::asis_output(paste0(
    '<div class="box-blue" style="margin-bottom:6px">',
    '<h4> ', site_name, '</h4>',
    '<p>Rows: <strong>', nrow(df), '</strong> &nbsp;|&nbsp; ',
    'Columns: <strong>', ncol(df), '</strong> &nbsp;|&nbsp; ',
    'Missing: <strong>', n_miss, '</strong> &nbsp;|&nbsp; ',
    'Duplicates: <strong>', n_dup, '</strong></p>',
    '<p style="font-size:0.85rem;color:#3A4C6C">Columns: ',
    paste(names(df), collapse = ", "), '</p></div>'
  ))
}

Summary – Both Datasets

datasets   <- list(df_countries, df_hockey)
site_names <- c("Countries of the World", "Hockey Teams")

for (i in seq_along(datasets)) {
  summarize_dataset(datasets[[i]], site_names[[i]])
}

Comparative Overview Table

rt(tibble(
  Website    = site_names,
  Rows       = sapply(datasets, nrow),
  Columns    = sapply(datasets, ncol),
  Missing    = sapply(datasets, function(df) sum(is.na(df))),
  Duplicates = sapply(datasets, function(df) sum(duplicated(df))),
  Type       = c("Static HTML","Pagination")
))

Website	Rows	Columns	Missing	Duplicates	Type
Countries of the World	20	4	0	0	Static HTML
Hockey Teams	20	8	13	0	Pagination

Data Issues per Website

Website 1 – Countries of the World

Empty capital values — Territories such as Antarctica have no capital, resulting in empty strings in the scraped data.
Mixed data types — population and area_km2 are scraped as strings and must be cast to numeric for analysis.

Website 2 – Hockey Teams

Missing ot_losses — Overtime-loss records did not exist in early NHL seasons, so those fields are NA for older rows.
Inconsistent decimal precision — The pct (win percentage) column uses varying decimal formats across different pages.

Section C – Data Cleaning

Objective

Clean both scraped datasets. Mandatory use of IF / IF-ELSE and looping. Tasks include text standardization, missing value handling, duplicate removal, type conversion, and adding a data_status column.

Cleaning – Countries of the World

df_countries_clean <- df_countries

text_cols_c <- c("country_name", "capital")
for (col in text_cols_c) {
  df_countries_clean[[col]] <- str_squish(df_countries_clean[[col]])
  df_countries_clean[[col]] <- str_to_title(df_countries_clean[[col]])
}

df_countries_clean <- df_countries_clean %>%
  mutate(
    capital    = if_else(is.na(capital) | trimws(capital) == "", "N/A (No Capital)", capital),
    population = as.numeric(population),
    area_km2   = as.numeric(area_km2),
    population = if_else(is.na(population) | population < 0, 0, population)
  ) %>%
  distinct() %>%
  mutate(
    data_status = if_else(
      !is.na(country_name) & population > 0 & capital != "N/A (No Capital)",
      "Complete", "Incomplete"
    )
  )

rt(tibble(
  Description = c("Rows After Cleaning", "Empty Capital Filled with N/A"),
  Value       = c(as.character(nrow(df_countries_clean)),
                  as.character(sum(df_countries_clean$capital == "N/A (No Capital)")))
))

Description	Value
Rows After Cleaning	20
Empty Capital Filled with N/A	1

Cleaning – Hockey Teams

df_hockey_clean <- df_hockey

for (col in c("team_name")) {
  df_hockey_clean[[col]] <- str_squish(df_hockey_clean[[col]])
  df_hockey_clean[[col]] <- str_to_title(df_hockey_clean[[col]])
}

numeric_cols_h <- c("wins","losses","goals_for","goals_against")
for (col in numeric_cols_h) {
  df_hockey_clean[[col]] <- as.integer(df_hockey_clean[[col]])
  if (any(is.na(df_hockey_clean[[col]]))) {
    df_hockey_clean[[col]] <- replace_na(df_hockey_clean[[col]], 0L)
  }
}

df_hockey_clean <- df_hockey_clean %>%
  mutate(ot_losses = if_else(is.na(ot_losses), 0L, as.integer(ot_losses))) %>%
  distinct() %>%
  mutate(data_status = if_else(wins > 0 & losses > 0, "Complete", "Incomplete"))

rt(tibble(
  Description = c("Rows After Cleaning", "Duplicates Removed"),
  Value       = c(as.character(nrow(df_hockey_clean)),
                  as.character(nrow(df_hockey) - nrow(df_hockey_clean)))
))

Description	Value
Rows After Cleaning	20
Duplicates Removed	0

Cleaning Results Summary

rt(tibble(
  Website       = site_names,
  `Rows Before` = sapply(datasets, nrow),
  `Rows After`  = c(nrow(df_countries_clean), nrow(df_hockey_clean)),
  Complete      = c(sum(df_countries_clean$data_status == "Complete"),
                    sum(df_hockey_clean$data_status    == "Complete")),
  Incomplete    = c(sum(df_countries_clean$data_status == "Incomplete"),
                    sum(df_hockey_clean$data_status    == "Incomplete"))
))

Website	Rows Before	Rows After	Complete	Incomplete
Countries of the World	20	20	19	1
Hockey Teams	20	20	20	0

Section D – Conditional Logic

Objective

Apply at least 3 conditional logic rules: handling missing elements with defaults, flagging incomplete records, and validating completeness.

Condition 1 – Element Not Found: Assign Default

IF Logic: Missing Element Handling

Applied across both websites. Every field that fails extraction (NULL or NA) is immediately assigned a meaningful default value using if_else().

rt(tibble(
  Website              = c("Countries of the World", "Hockey Teams"),
  `IF Condition`       = c("capital is empty → N/A (No Capital)",
                            "ot_losses is NA → 0"),
  `Rows Affected`      = c(
    sum(df_countries_clean$capital == "N/A (No Capital)"),
    sum(df_hockey_clean$ot_losses == 0L)
  )
))

Website	IF Condition	Rows Affected
Countries of the World	capital is empty → N/A (No Capital)	1
Hockey Teams	ot_losses is NA → 0	13

Condition 2 and 3 – Flag Incomplete / Complete

all_clean <- list(df_countries_clean, df_hockey_clean)

status_summary <- bind_rows(lapply(seq_along(all_clean), function(i) {
  tbl <- as.data.frame(table(all_clean[[i]]$data_status))
  complete_n   <- if ("Complete"   %in% tbl$Var1) tbl$Freq[tbl$Var1 == "Complete"]   else 0L
  incomplete_n <- if ("Incomplete" %in% tbl$Var1) tbl$Freq[tbl$Var1 == "Incomplete"] else 0L
  tibble(Website = site_names[i], Complete = complete_n, Incomplete = incomplete_n)
}))

rt(status_summary)

Website	Complete	Incomplete
Countries of the World	19	1
Hockey Teams	20	0

Extra Condition – Hockey Team Tier (Nested IF)

df_hockey_clean <- df_hockey_clean %>%
  mutate(
    win_rate  = round(wins / pmax(wins + losses + ot_losses, 1), 3),
    team_tier = case_when(
      win_rate >= 0.600 ~ "Elite",
      win_rate >= 0.500 ~ "Competitive",
      win_rate >= 0.400 ~ "Average",
      TRUE              ~ "Struggling"
    )
  )

rt(df_hockey_clean %>% count(team_tier, name = "Teams") %>% arrange(desc(Teams)))

team_tier	Teams
Competitive	7
Average	5
Elite	5
Struggling	3

Section E – Analytical Thinking

Objective

Analyze the scraping process, compare approaches across the two websites, and extract meaningful insights from the collected data.

Scraping Difficulty Comparison

difficulty_df <- tibble(
  Website    = c("Countries (Static)", "Hockey (Pagination)"),
  Difficulty = c(1, 2),
  Type       = c("Static HTML","Pagination & Form"),
  Color      = c("#99D6B3","#93B4FF")
)

plot_ly(difficulty_df,
        x = ~reorder(Website, Difficulty), y = ~Difficulty,
        type   = "bar",
        marker = list(color = ~Color, line = list(color = "white", width = 1.5)),
        text   = ~paste0("<b>", Type, "</b><br>Difficulty: ", Difficulty, " / 5"),
        hovertemplate = "%{text}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Scraping Difficulty Score per Website</b><br><sup>1 = Easiest | 5 = Hardest</sup>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "", tickfont = list(size = 12)),
    yaxis  = list(title = "Difficulty Score", range = c(0, 5.5),
                  dtick = 1, showgrid = TRUE, gridcolor = "#E8EEFF"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 40, r = 20, t = 70, b = 40)
  )

Easiest – Countries of the World (Static HTML)

All data is available in the initial GET response. No JavaScript, no pagination, no authentication.

More Complex – Hockey Teams (Pagination)

Requires looping through multiple pages. A while loop stops automatically when no table rows are returned, collecting all 1,558 records across 24 pages.

Approach Comparison

1. Static HTML

Approach: One GET request then parse HTML directly.
Tools: rvest (R).
Pros: Simple, fast, zero extra dependencies.
Cons: Only works for fully static pages.

2. Pagination

Approach: Loop through page parameters until no rows are returned.
Tools: rvest + httr (R).
Pros: Collects entire dataset regardless of page count.
Cons: Slow for large page counts; polite delay required.

Data Insights

hockey_summary <- df_hockey_clean %>%
  group_by(team_tier) %>%
  summarise(
    Avg_Wins  = round(mean(wins, na.rm = TRUE), 1),
    Avg_Goals = round(mean(goals_for, na.rm = TRUE), 1),
    .groups   = "drop"
  )

plot_ly(hockey_summary, x = ~team_tier, y = ~Avg_Wins,
        type = "bar", name = "Avg Wins",
        marker = list(color = "#93B4FF"),
        hovertemplate = "<b>%{x}</b><br>Avg Wins: %{y}<extra></extra>") %>%
  add_trace(y = ~Avg_Goals, name = "Avg Goals For",
            marker = list(color = "#F4A8C0"),
            hovertemplate = "<b>%{x}</b><br>Avg Goals For: %{y}<extra></extra>") %>%
  layout(
    barmode = "group",
    title   = list(text = "<b>Average Wins & Goals For per Team Tier</b>",
                   font = list(color = "#1A3FA8", size = 15)),
    xaxis   = list(title = "Team Tier", categoryorder = "array",
                   categoryarray = c("Struggling","Average","Competitive","Elite")),
    yaxis   = list(title = "Average", showgrid = TRUE, gridcolor = "#E8EEFF"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    legend  = list(orientation = "h", x = 0.3, y = 1.12),
    margin  = list(l = 40, r = 20, t = 70, b = 40)
  )

Key Insights and Recommendations

Minimum 3 Insights

Static HTML is the simplest scraping approach. Countries of the World required only a single GET request and direct HTML parsing — no extra tools or workarounds needed.
Elite hockey teams win through defence, not just offence. Teams in the "Elite" tier consistently allow fewer goals against than lower tiers, suggesting a strong defensive record is a more reliable predictor of win rate.
Pagination datasets can be far larger than expected. The Hockey Teams website produced 1,558 rows across 24 pages — significantly larger than the Countries dataset, highlighting the importance of automated looping for paginated sites.

Minimum 2 Recommendations

Always check the page structure before scraping. For static sites like Countries of the World, a single rvest call suffices. For paginated sites like Hockey Teams, build a loop with an exit condition to avoid infinite loops or missed pages.
Add polite delays in pagination loops. For websites with many pages, inserting a 0.5–1 second pause between requests (Sys.sleep() in R) prevents IP bans, reduces server load, and follows ethical web scraping practices.