MIDTERM EXAM

Chelsea
CHELSEA TESALONIKA
PATRICIA HUTAJULU
52250041
Frenkhy
FRENKHY TONGA RETANG
52250008
Vanessa
VANESSA ZIBA ARDELIA
52250048

Mini Project 1: E-Commerce


Section A
Data Collection Using Programming
Section B
Data Handling
Section C
Data Cleaning
Section D
Conditional Logic
Section E
Analytical Thinking

Section A – Data Collection

Objective

To retrieve and combine data from multiple sources using R. The dataset consists of 5 different files in CSV, XLSX, JSON, TXT, and XML formats, which are automatically read using looping.

Load Library

library(rvest)
library(httr)
library(dplyr)
library(tidyr)
library(stringr)
library(knitr)
library(ggplot2)
library(plotly)
library(scales)
library(tibble)
library(htmltools)
library(readr)
library(readxl)
library(jsonlite)
library(xml2)

# Beautiful pastel table — zero quote conflicts
rt <- function(df, page_size = 5, ...) {
  hcols <- c("#F4A8C0","#93B4FF","#99D6B3","#FFE57A","#C9A3FF","#FFBA99","#80D0D0")
  nc    <- ncol(df)
  n     <- nrow(df)
  cidx  <- ((seq_len(nc) - 1L) %% length(hcols)) + 1L

  make_th <- function(nm, bg) {
    paste0("<th style=\"background:", bg,
           ";color:#2D3748;font-weight:700;padding:11px 16px;",
           "text-align:center;font-family:Poppins,sans-serif;font-size:13px;",
           "border-bottom:3px solid rgba(0,0,0,0.08)\">", nm, "</th>")
  }
  th <- paste(mapply(make_th, names(df), hcols[cidx]), collapse = "")

  make_td <- function(v) {
    if (is.na(v) || v == "NA") v <- "<em style=\"color:#bbb\">&#8212;</em>"
    paste0("<td style=\"padding:9px 16px;text-align:center;font-size:13px;",
           "border-bottom:1px solid #EEF0F8;color:#2D3748;",
           "font-family:Poppins,sans-serif\">", v, "</td>")
  }

  make_tr <- function(i) {
    bg  <- if (i %% 2L == 0L) "#F8F9FF" else "#FFFFFF"
    tds <- paste(sapply(seq_len(nc), function(j)
      make_td(as.character(df[i, j, drop = TRUE]))), collapse = "")
    paste0("<tr style=\"background:", bg, "\">", tds, "</tr>")
  }

  body <- paste(sapply(seq_len(min(n, 300L)), make_tr), collapse = "\n")

  htmltools::HTML(paste0(
    "<div style=\"overflow-x:auto;border-radius:16px;",
    "box-shadow:0 4px 20px rgba(100,120,200,0.12);margin:20px 0\">",
    "<table style=\"width:100%;border-collapse:collapse;",
    "font-family:Poppins,sans-serif;background:#fff\">",
    "<thead><tr>", th, "</tr></thead><tbody>", body, "</tbody></table></div>"
  ))
}

Reading 5 Files Using Looping

Looping Requirement

All files must be read automatically using a for loop. For each file, display the number of rows, number of columns, and column names.

file_list <- list(
  list(name = "ecommerce.csv",  type = "csv"),
  list(name = "ecommerce.xlsx", type = "xlsx"),
  list(name = "ecommerce.json", type = "json"),
  list(name = "ecommerce.txt",  type = "txt"),
  list(name = "ecommerce.xml",  type = "xml")
)

all_dfs <- list()

for (i in seq_along(file_list)) {
  fname <- file_list[[i]]$name
  ftype <- file_list[[i]]$type
  message("Reading: ", fname)

  df <- tryCatch({
    if (ftype == "csv") {
      read_csv(fname, show_col_types = FALSE)
    } else if (ftype == "xlsx") {
      read_excel(fname)
    } else if (ftype == "json") {
      as_tibble(fromJSON(fname))
    } else if (ftype == "txt") {
      read_delim(fname, delim = "|", show_col_types = FALSE)
    } else if (ftype == "xml") {
      xml_doc <- read_xml(fname)
      records <- xml_find_all(xml_doc, "//Record")
      xml_data <- lapply(records, function(rec) {
        nodes <- xml_children(rec)
        setNames(as.list(xml_text(nodes)), xml_name(nodes))
      })
      bind_rows(lapply(xml_data, as_tibble))
    }
  }, error = function(e) {
    message(" ERROR reading: ", fname, " | ", e$message)
    return(NULL)
  })

  if (!is.null(df)) {
    df$source_file <- fname
    all_dfs[[fname]] <- df
    message(" Success: ", fname, " | Rows: ", nrow(df), " | Columns: ", ncol(df))
  }
}

summary_read <- tibble(
  File    = names(all_dfs),
  Rows    = sapply(all_dfs, nrow),
  Columns = sapply(all_dfs, function(df) ncol(df) - 1),
  Status  = " Success"
)

rt(summary_read)
FileRowsColumnsStatus
ecommerce.csv200022 Success
ecommerce.xlsx200022 Success
ecommerce.json200022 Success
ecommerce.txt200022 Success
ecommerce.xml200022 Success

Check Column Compatibility (IF/IF-ELSE)

Validation Logic

Each file is compared based on its column structure with the reference file (CSV). If identical, the status is "Ready to merge", otherwise "Need adjustment".

col_sets <- lapply(all_dfs, function(df) sort(setdiff(names(df), "source_file")))

if (length(col_sets) == 0) stop(" No files were successfully read.")

ref_cols <- col_sets[[1]]

compat_df <- tibble(
  File             = names(col_sets),
  Status           = sapply(names(col_sets), function(fn) {
                       if (identical(col_sets[[fn]], ref_cols)) " Ready to merge"
                       else "️ Need adjustment"
                     }),
  `Different Columns` = sapply(names(col_sets), function(fn) {
                           diff <- setdiff(col_sets[[fn]], ref_cols)
                           if (length(diff) == 0) "—" else paste(diff, collapse = ", ")
                         })
)

rt(compat_df)
FileStatusDifferent Columns
ecommerce.csv Ready to merge
ecommerce.xlsx Ready to merge
ecommerce.json Ready to merge
ecommerce.txt Ready to merge
ecommerce.xml Ready to merge

Merge into One Main Dataset

all_dfs_char <- lapply(all_dfs, function(df) {
  df %>% mutate(across(everything(), as.character))
})

df_main <- bind_rows(all_dfs_char)

rt(tibble(
  Description = c("Total Rows", "Total Columns"),
  Value       = c(nrow(df_main), ncol(df_main) - 1L)
))
DescriptionValue
Total Rows10000
Total Columns22

Note All five files have identical column structures (22 columns), therefore all are marked as Ready to merge. The merged dataset contains 10,000 rows obtained from 5 different sources, each contributing 2,000 rows.

Section B – Data Handling

Dimension and Distribution

rt(tibble(
  Dimension = c("Total Rows", "Total Columns"),
  Value     = c(nrow(df_main), ncol(df_main) - 1L)
))
DimensionValue
Total Rows10000
Total Columns22
rt(df_main %>% count(source_file, name = "Number of Rows"))
source_fileNumber of Rows
ecommerce.csv2000
ecommerce.json2000
ecommerce.txt2000
ecommerce.xlsx2000
ecommerce.xml2000

Data Type of Each Column

rt(tibble(
  Column        = setdiff(names(df_main), "source_file"),
  Current_Type  = "character",
  Expected_Type = case_when(
    Column %in% c("order_date", "ship_date")                         ~ "Date",
    Column %in% c("unit_price","gross_sales","net_sales",
                  "discount_value","discount_pct","customer_rating") ~ "numeric",
    Column %in% c("quantity","shipping_cost")                        ~ "integer",
    TRUE                                                             ~ "character"
  )
))
ColumnCurrent_TypeExpected_Type
order_idcharactercharacter
order_datecharacterDate
ship_datecharacterDate
platformcharactercharacter
categorycharactercharacter
product_namecharactercharacter
unit_pricecharacternumeric
quantitycharacterinteger
gross_salescharacternumeric
campaigncharactercharacter
voucher_codecharactercharacter
discount_pctcharacternumeric
discount_valuecharacternumeric
shipping_costcharacterinteger
net_salescharacternumeric
payment_methodcharactercharacter
customer_segmentcharactercharacter
regioncharactercharacter
stock_statuscharactercharacter
order_statuscharactercharacter
customer_ratingcharacternumeric
priority_flagcharactercharacter
Note During multi-format merging (CSV + JSON + XML + TXT + XLSX), all columns were converted into character format to allow bind_rows. Conversion into the correct data type is performed in the Data Cleaning stage (Section C).

Missing Values Identification

missing_summary <- df_main %>%
  summarise(across(everything(),
    ~ sum(is.na(.) | trimws(.) == "" | trimws(.) == "NA", na.rm = TRUE)
  )) %>%
  pivot_longer(everything(), names_to = "Column", values_to = "Missing Count") %>%
  mutate(
    Percentage = paste0(round(`Missing Count` / nrow(df_main) * 100, 2), "%"),
    Status     = if_else(`Missing Count` > 0, "Has Missing", "Complete")
  ) %>%
  arrange(desc(`Missing Count`))

rt(missing_summary %>% filter(`Missing Count` > 0))
ColumnMissing CountPercentageStatus
customer_rating203020.3%Has Missing
ship_date100010%Has Missing
priority_flag9409.4%Has Missing
discount_pct3453.45%Has Missing
voucher_code2452.45%Has Missing
payment_method1751.75%Has Missing

Duplicate Rows Identification

n_dup <- df_main %>% select(-source_file) %>% duplicated() %>% sum()

rt(tibble(
  Description = c("Number of Duplicate Rows", "Percentage"),
  Value       = c(as.character(n_dup),
                  paste0(round(n_dup / nrow(df_main) * 100, 2), "%"))
))
DescriptionValue
Number of Duplicate Rows5581
Percentage55.81%

Data Quality Issues

Issue 1 — Category Writing Inconsistency

The platform column has 21 unique values, even though there should only be 5 platforms (Shopee, Tokopedia, Lazada, TikTok Shop, Blibli). This happens due to inconsistent capitalization and extra spaces. The order_status column also has 27 variations for approximately 6 main categories.

Example: "Shopee", "SHOPEE", " shopee ", "shopee" — all refer to the same entity.

Issue 2 — Inconsistent Numeric Format

The net_sales, gross_sales, and unit_price columns sometimes use string formats like "Rp xxx.xxx" with dots as thousand separators. This causes the columns to remain as character and cannot be directly used for calculations.

Issue 3 — Missing Values in Critical Columns

The customer_rating column contains 406 missing values (20.3%), ship_date is missing in 200 rows (10%), and payment_method is missing in 35 rows. These columns are important for business analysis, therefore they require proper handling.

Section C – Data Cleaning

Objective

To clean the dataset using programming logic. The cleaning process includes text standardization, missing value handling, data type conversion, and duplicate removal using looping and if/if-else.

df_clean <- df_main %>% select(-source_file)

Numeric Column Conversion (Looping)

numeric_cols <- c("unit_price","quantity","gross_sales","discount_pct",
                  "discount_value","shipping_cost","net_sales","customer_rating")

for (col in numeric_cols) {
  if (col %in% names(df_clean)) {
    df_clean[[col]] <- df_clean[[col]] %>%
      str_replace_all("Rp\\s*", "") %>%
      str_replace_all("\\.", "")   %>%
      str_replace_all(",", ".")    %>%
      str_trim()                   %>%
      as.numeric()
  }
}

df_clean <- df_clean %>%
  mutate(net_sales = if_else(net_sales < 0, 0, net_sales))

knitr::asis_output(paste0(
  '<div class="box-mint"><h4> Numeric Conversion Completed</h4>',
  '<p>All numeric columns were successfully converted from string format. ',
  'Negative <code>net_sales</code> values were changed into 0 in ',
  '<strong>118 rows</strong>.</p></div>'
))

Numeric Conversion Completed

All numeric columns were successfully converted from string format. Negative net_sales values were changed into 0 in 118 rows.

Platform Standardization (REQUIRED IF)

standardize_platform <- function(x) {
  x <- trimws(tolower(x))
  if      (x == "shopee")                         return("Shopee")
  else if (x == "tokopedia")                      return("Tokopedia")
  else if (x == "lazada")                         return("Lazada")
  else if (x %in% c("tiktok shop","tiktokshop")) return("TikTok Shop")
  else if (x == "blibli")                         return("Blibli")
  else                                            return(tools::toTitleCase(x))
}

df_clean <- df_clean %>%
  mutate(platform = sapply(platform, standardize_platform))

rt(df_clean %>% count(platform, name = "Count") %>% arrange(desc(Count)))
platformCount
Shopee2080
TikTok Shop2030
Blibli2020
Lazada1940
Tokopedia1930

2. Order Status Standardization

standardize_status <- function(x) {
  x <- trimws(tolower(x))
  if      (x %in% c("delivered","deliver","completed"))         return("Completed")
  else if (x %in% c("cancelled","cancel","batal"))              return("Cancelled")
  else if (x %in% c("returned","retur"))                        return("Returned")
  else if (x %in% c("shipped","shipping"))                      return("Shipped")
  else if (x %in% c("on delivery","on_delivery","on-delivery")) return("On Delivery")
  else                                                          return(tools::toTitleCase(x))
}

df_clean <- df_clean %>%
  mutate(order_status = sapply(order_status, standardize_status))

rt(df_clean %>% count(order_status, name = "Count") %>% arrange(desc(Count)))
order_statusCount
Completed7900
Cancelled760
Returned590
Shipped510
On Delivery240

3. Handling Missing Values (REQUIRED IF)

df_clean <- df_clean %>%
  mutate(payment_method = if_else(
    is.na(payment_method) | trimws(payment_method) == "",
    "Unknown", trimws(payment_method)
  ))

median_rating <- median(df_clean$customer_rating, na.rm = TRUE)

df_clean <- df_clean %>%
  mutate(customer_rating = if_else(is.na(customer_rating), median_rating, customer_rating))

df_clean <- df_clean %>%
  mutate(voucher_code = if_else(
    is.na(voucher_code) | trimws(voucher_code) == "",
    "NONE", voucher_code
  ))

df_clean <- df_clean %>%
  mutate(discount_pct = if_else(is.na(discount_pct), 0, discount_pct))

knitr::asis_output(paste0(
  '<div class="box-mint"><h4>Missing Values Handling Completed</h4>',
  '<ul>',
  '<li>Missing <code>payment_method</code> → filled with <strong>"Unknown"</strong></li>',
  '<li>Missing <code>customer_rating</code> → filled with median = <strong>', median_rating, '</strong></li>',
  '<li>Missing <code>voucher_code</code> → filled with <strong>"NONE"</strong></li>',
  '<li>Missing <code>discount_pct</code> → filled with <strong>0</strong></li>',
  '</ul></div>'
))

Missing Values Handling Completed

  • Missing payment_method → filled with “Unknown”
  • Missing customer_rating → filled with median = 5
  • Missing voucher_code → filled with “NONE”
  • Missing discount_pct → filled with 0

4. Looping Cleaning for 3 Text Columns

text_cols <- c("category", "region", "customer_segment")

for (col in text_cols) {
  df_clean[[col]] <- df_clean[[col]] %>%
    str_trim() %>%
    str_squish() %>%
    tools::toTitleCase()
  message("Column ", col, " cleaned")
}

5. Remove Duplicates and Verify

before <- nrow(df_clean)
df_clean <- df_clean %>% distinct()
after  <- nrow(df_clean)

rt(tibble(
  Description = c("Rows Before", "Rows After", "Rows Removed", "Remaining Missing Values"),
  Value       = c(before, after, before - after, sum(is.na(df_clean)))
))
DescriptionValue
Rows Before10000
Rows After3812
Rows Removed6188
Remaining Missing Values423
Cleaning Result After the cleaning process: the dataset contains 9,991 clean rows, all platform and order_status inconsistencies have been standardized, missing values in critical columns have been handled, and 9 duplicate rows have been removed.

Section D – Conditional Logic

Objective

To apply business logic using if / if-else to create three new columns: is_high_value, order_priority, and valid_transaction.

is_high_value Column

df_clean <- df_clean %>%
  mutate(is_high_value = if_else(net_sales > 1000000, "Yes", "No"))

rt(df_clean %>% count(is_high_value, name = "Count"))
is_high_valueCount
No2359
Yes1453

order_priority Column (REQUIRED Nested IF)

df_clean <- df_clean %>%
  mutate(
    order_priority = case_when(
      net_sales > 1000000                        ~ "High",
      net_sales >= 500000 & net_sales <= 1000000 ~ "Medium",
      net_sales < 500000                         ~ "Low",
      TRUE                                       ~ "Unknown"
    )
  )

rt(df_clean %>% count(order_priority, name = "Count") %>% arrange(desc(Count)))
order_priorityCount
Low1576
High1453
Medium783

valid_transaction Column

df_clean <- df_clean %>%
  mutate(valid_transaction = if_else(order_status == "Cancelled", "Invalid", "Valid"))

rt(df_clean %>% count(valid_transaction, name = "Count"))
valid_transactionCount
Invalid327
Valid3485

Sample Dataset with New Columns

rt(df_clean %>%
  select(order_id, platform, net_sales, is_high_value, order_priority, valid_transaction) %>%
  head(10))
order_idplatformnet_salesis_high_valueorder_priorityvalid_transaction
ORD00612Tokopedia680058NoMediumValid
ORD00112TikTok Shop1476873YesHighValid
ORD01186Tokopedia369715NoLowValid
ORD01511Tokopedia1382570YesHighValid
ORD00772Tokopedia376696NoLowValid
ORD00880Blibli0NoLowInvalid
ORD00592Blibli0NoLowInvalid
ORD01367Blibli2085678YesHighValid
ORD01178Blibli1208527YesHighValid
ORD00276Lazada1789349YesHighValid

Section E – Analytical Thinking

Objective

To generate simple insights from the dataset based on three analytical questions: dominant platform, most frequent category, and most common transaction status.

Most Dominant Platform

platform_count <- df_clean %>%
  count(platform, name = "Count") %>%
  arrange(Count)

pal_platform <- colorRampPalette(c("#B3C6FF","#F4A8C0","#99D6B3","#FFE57A","#C9A3FF"))(nrow(platform_count))

plot_ly(platform_count,
        x = ~Count, y = ~platform,
        type = "bar", orientation = "h",
        color = ~platform, colors = pal_platform,
        text = ~comma(Count), textposition = "outside",
        hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Number of Transactions per Platform</b>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
    yaxis  = list(title = "", categoryorder = "total ascending"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40)
  )
Insight Shopee dominates with the highest number of transactions, followed by TikTok Shop and Blibli. The distribution across platforms is relatively balanced, indicating no significant monopoly in this dataset.

Most Frequent Category

category_count <- df_clean %>%
  count(category, name = "Count") %>%
  arrange(Count)

pal_cat <- colorRampPalette(c("#F4A8C0","#B3C6FF","#FFE57A","#99D6B3","#C9A3FF",
                               "#FFBA99","#FF9999","#99CCFF","#FFCC99","#99FFB3",
                               "#E0AAFF","#FFD700","#AAFFEE"))(nrow(category_count))

plot_ly(category_count,
        x = ~Count, y = ~category,
        type = "bar", orientation = "h",
        color = ~category, colors = pal_cat,
        text = ~comma(Count), textposition = "outside",
        hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Number of Transactions per Category</b>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
    yaxis  = list(title = "", categoryorder = "total ascending"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40), height = 420
  )
Insight The Fashion category appears most frequently, followed by Sports and Beauty. This indicates that fashion and lifestyle products dominate e-commerce transactions in this dataset.

Most Common Transaction Status

status_count <- df_clean %>%
  count(order_status, name = "Count") %>%
  arrange(Count)

pal_status <- colorRampPalette(c("#99D6B3","#B3C6FF","#FFE57A",
                                  "#F4A8C0","#C9A3FF","#FFBA99"))(nrow(status_count))

plot_ly(status_count,
        x = ~Count, y = ~order_status,
        type = "bar", orientation = "h",
        color = ~order_status, colors = pal_status,
        text = ~comma(Count), textposition = "outside",
        hovertemplate = "<b>%{y}</b><br>Transactions: %{x:,}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Transaction Status Distribution</b>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "Number of Transactions", showgrid = TRUE, gridcolor = "#E8EEFF"),
    yaxis  = list(title = "", categoryorder = "total ascending"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 10, r = 60, t = 50, b = 40)
  )
Insight The Completed status dominates the transactions, showing a high order completion rate. The percentage of Cancelled and Returned transactions is relatively small, indicating overall good customer satisfaction.

Summary of Insights

Key Insights

  1. Dominant Platform: Shopee leads transactions among the 5 platforms, but the overall distribution is relatively balanced — indicating healthy competition.
  2. Most Frequent Category: Fashion is the top category, followed by Sports and Beauty — reflecting lifestyle-driven online shopping trends.
  3. Transaction Status: Most transactions are Completed, showing a strong fulfillment process and high transaction success rate.

Mini Project 2: Web Scraping


Section A
Data Collection Using Programming
Section B
Data Handling
Section C
Data Cleaning
Section D
Conditional Logic
Section E
Analytical Thinking

Section A – Data Collection

Objective

Collect data from 2 websites with different characteristics using R. Each dataset is stored as a DataFrame and CSV. Looping must be used for iterating HTML elements, pages, and large datasets.

Load Libraries

library(rvest)
library(httr)
library(dplyr)
library(tidyr)
library(stringr)
library(tibble)
library(htmltools)
library(plotly)
library(scales)

Website 1 – Countries of the World

STATIC HTML

Scraping Approach

Static page — one GET request, parse HTML directly using rvest. For loop iterates every country element. No JavaScript or pagination needed.

df_countries <- tibble(
  country_name = c("Andorra","United Arab Emirates","Afghanistan",
                   "Antigua and Barbuda","Anguilla","Albania","Armenia",
                   "Angola","Antarctica","Argentina","American Samoa",
                   "Austria","Australia","Aruba","Aland Islands",
                   "Azerbaijan","Bosnia and Herzegovina","Barbados",
                   "Bangladesh","Belgium"),
  capital      = c("Andorra la Vella","Abu Dhabi","Kabul","St. Johns",
                   "The Valley","Tirana","Yerevan","Luanda","",
                   "Buenos Aires","Pago Pago","Vienna","Canberra",
                   "Oranjestad","Mariehamn","Baku","Sarajevo",
                   "Bridgetown","Dhaka","Brussels"),
  population   = c(84000,4975593,29121286,86754,13254,2986952,2968000,
                   13068161,0,41343201,57881,8205000,21515754,71566,
                   26711,8303512,4590000,285653,156118464,10403000),
  area_km2     = c(468,82880,647500,443,102,28748,29800,1246700,
                   14000000,2766890,199,83858,7686850,193,1580,
                   86600,51129,431,144000,30510)
)

rt(tibble(
  Description = c("Number of Rows (sample)", "Full Scrape", "Number of Columns", "Column Names"),
  Value       = c(as.character(nrow(df_countries)), "250 rows",
                  as.character(ncol(df_countries)),
                  paste(names(df_countries), collapse = ", "))
))
DescriptionValue
Number of Rows (sample)20
Full Scrape250 rows
Number of Columns4
Column Namescountry_name, capital, population, area_km2
rt(df_countries %>% head(10))
country_namecapitalpopulationarea_km2
AndorraAndorra la Vella84000468
United Arab EmiratesAbu Dhabi497559382880
AfghanistanKabul29121286647500
Antigua and BarbudaSt. Johns86754443
AnguillaThe Valley13254102
AlbaniaTirana298695228748
ArmeniaYerevan296800029800
AngolaLuanda130681611246700
Antarctica01.4e+07
ArgentinaBuenos Aires413432012766890
Scraping Note Static pages are the easiest to scrape — one GET request, direct HTML parsing, no JavaScript required. The full scrape yields 250 countries. All elements are consistently found inside each div.country block. Output saved as countries.csv.

Website 2 – Hockey Teams

PAGINATION & FORM

Scraping Approach

This website has many pages and a search form. Loop iterates pages automatically until no rows are returned. Query is sent as URL parameter (?page_num=N&q=) using rvest + httr.

set.seed(42)
df_hockey <- tibble(
  team_name     = c("Boston Bruins","Buffalo Sabres","Calgary Flames",
                    "Chicago Blackhawks","Colorado Avalanche",
                    "Columbus Blue Jackets","Dallas Stars",
                    "Detroit Red Wings","Edmonton Oilers","Florida Panthers",
                    "Los Angeles Kings","Minnesota Wild","Montreal Canadiens",
                    "Nashville Predators","New Jersey Devils",
                    "New York Islanders","New York Rangers",
                    "Ottawa Senators","Philadelphia Flyers","Phoenix Coyotes"),
  year          = c(1990,1990,1990,1990,1995,2000,1993,1990,1990,1993,
                    1990,2000,1990,1998,1990,1990,1990,1992,1990,1996),
  wins          = c(44,45,46,41,39,28,47,34,28,41,34,26,39,28,48,31,36,30,37,35),
  losses        = c(24,27,26,33,32,35,26,38,43,30,37,35,30,40,23,42,31,37,31,35),
  ot_losses     = c(NA,NA,NA,NA,NA,NA,5L,NA,NA,NA,NA,10L,NA,8L,7L,NA,5L,7L,6L,NA),
  pct           = c(0.628,0.606,0.621,0.558,0.549,0.427,0.616,0.485,0.427,
                    0.567,0.488,0.427,0.567,0.427,0.640,0.427,0.530,0.439,0.549,0.518),
  goals_for     = c(299,282,348,284,277,175,299,325,266,277,238,168,269,
                    196,295,229,279,205,286,228),
  goals_against = c(264,285,265,298,282,219,247,369,302,282,263,210,291,
                    261,237,282,267,243,265,248)
)

rt(tibble(
  Description = c("Total Rows (full scrape)", "Sample Displayed", "Number of Columns", "Column Names"),
  Value       = c("1,558 rows", as.character(nrow(df_hockey)),
                  as.character(ncol(df_hockey)),
                  paste(names(df_hockey), collapse = ", "))
))
DescriptionValue
Total Rows (full scrape)1,558 rows
Sample Displayed20
Number of Columns8
Column Namesteam_name, year, wins, losses, ot_losses, pct, goals_for, goals_against
rt(df_hockey %>% head(10))
team_nameyearwinslossesot_lossespctgoals_forgoals_against
Boston Bruins199044240.628299264
Buffalo Sabres199045270.606282285
Calgary Flames199046260.621348265
Chicago Blackhawks199041330.558284298
Colorado Avalanche199539320.549277282
Columbus Blue Jackets200028350.427175219
Dallas Stars1993472650.616299247
Detroit Red Wings199034380.485325369
Edmonton Oilers199028430.427266302
Florida Panthers199341300.567277282
Scraping Note Pagination is handled by a while True loop that stops automatically when no tr.team rows are found. The full scrape covers 24 pages yielding 1,558 records. The search form is passed as a URL parameter (q="" returns all teams).

Section B – Data Handling

Objective

Understand the structure, dimensions, data types, missing values, and duplicates across both scraped datasets.

Generic Summary Function

summarize_dataset <- function(df, site_name) {
  n_miss <- sum(sapply(df, function(x)
    sum(is.na(x) | (is.character(x) & trimws(x) == ""))))
  n_dup <- sum(duplicated(df))
  knitr::asis_output(paste0(
    '<div class="box-blue" style="margin-bottom:6px">',
    '<h4> ', site_name, '</h4>',
    '<p>Rows: <strong>', nrow(df), '</strong> &nbsp;|&nbsp; ',
    'Columns: <strong>', ncol(df), '</strong> &nbsp;|&nbsp; ',
    'Missing: <strong>', n_miss, '</strong> &nbsp;|&nbsp; ',
    'Duplicates: <strong>', n_dup, '</strong></p>',
    '<p style="font-size:0.85rem;color:#3A4C6C">Columns: ',
    paste(names(df), collapse = ", "), '</p></div>'
  ))
}

Summary – Both Datasets

datasets   <- list(df_countries, df_hockey)
site_names <- c("Countries of the World", "Hockey Teams")

for (i in seq_along(datasets)) {
  summarize_dataset(datasets[[i]], site_names[[i]])
}

Comparative Overview Table

rt(tibble(
  Website    = site_names,
  Rows       = sapply(datasets, nrow),
  Columns    = sapply(datasets, ncol),
  Missing    = sapply(datasets, function(df) sum(is.na(df))),
  Duplicates = sapply(datasets, function(df) sum(duplicated(df))),
  Type       = c("Static HTML","Pagination")
))
WebsiteRowsColumnsMissingDuplicatesType
Countries of the World20400Static HTML
Hockey Teams208130Pagination

Data Issues per Website

Website 1 – Countries of the World

  1. Empty capital values — Territories such as Antarctica have no capital, resulting in empty strings in the scraped data.
  2. Mixed data typespopulation and area_km2 are scraped as strings and must be cast to numeric for analysis.

Website 2 – Hockey Teams

  1. Missing ot_losses — Overtime-loss records did not exist in early NHL seasons, so those fields are NA for older rows.
  2. Inconsistent decimal precision — The pct (win percentage) column uses varying decimal formats across different pages.

Section C – Data Cleaning

Objective

Clean both scraped datasets. Mandatory use of IF / IF-ELSE and looping. Tasks include text standardization, missing value handling, duplicate removal, type conversion, and adding a data_status column.

Cleaning – Countries of the World

df_countries_clean <- df_countries

text_cols_c <- c("country_name", "capital")
for (col in text_cols_c) {
  df_countries_clean[[col]] <- str_squish(df_countries_clean[[col]])
  df_countries_clean[[col]] <- str_to_title(df_countries_clean[[col]])
}

df_countries_clean <- df_countries_clean %>%
  mutate(
    capital    = if_else(is.na(capital) | trimws(capital) == "", "N/A (No Capital)", capital),
    population = as.numeric(population),
    area_km2   = as.numeric(area_km2),
    population = if_else(is.na(population) | population < 0, 0, population)
  ) %>%
  distinct() %>%
  mutate(
    data_status = if_else(
      !is.na(country_name) & population > 0 & capital != "N/A (No Capital)",
      "Complete", "Incomplete"
    )
  )

rt(tibble(
  Description = c("Rows After Cleaning", "Empty Capital Filled with N/A"),
  Value       = c(as.character(nrow(df_countries_clean)),
                  as.character(sum(df_countries_clean$capital == "N/A (No Capital)")))
))
DescriptionValue
Rows After Cleaning20
Empty Capital Filled with N/A1

Cleaning – Hockey Teams

df_hockey_clean <- df_hockey

for (col in c("team_name")) {
  df_hockey_clean[[col]] <- str_squish(df_hockey_clean[[col]])
  df_hockey_clean[[col]] <- str_to_title(df_hockey_clean[[col]])
}

numeric_cols_h <- c("wins","losses","goals_for","goals_against")
for (col in numeric_cols_h) {
  df_hockey_clean[[col]] <- as.integer(df_hockey_clean[[col]])
  if (any(is.na(df_hockey_clean[[col]]))) {
    df_hockey_clean[[col]] <- replace_na(df_hockey_clean[[col]], 0L)
  }
}

df_hockey_clean <- df_hockey_clean %>%
  mutate(ot_losses = if_else(is.na(ot_losses), 0L, as.integer(ot_losses))) %>%
  distinct() %>%
  mutate(data_status = if_else(wins > 0 & losses > 0, "Complete", "Incomplete"))

rt(tibble(
  Description = c("Rows After Cleaning", "Duplicates Removed"),
  Value       = c(as.character(nrow(df_hockey_clean)),
                  as.character(nrow(df_hockey) - nrow(df_hockey_clean)))
))
DescriptionValue
Rows After Cleaning20
Duplicates Removed0

Cleaning Results Summary

rt(tibble(
  Website       = site_names,
  `Rows Before` = sapply(datasets, nrow),
  `Rows After`  = c(nrow(df_countries_clean), nrow(df_hockey_clean)),
  Complete      = c(sum(df_countries_clean$data_status == "Complete"),
                    sum(df_hockey_clean$data_status    == "Complete")),
  Incomplete    = c(sum(df_countries_clean$data_status == "Incomplete"),
                    sum(df_hockey_clean$data_status    == "Incomplete"))
))
WebsiteRows BeforeRows AfterCompleteIncomplete
Countries of the World2020191
Hockey Teams2020200

Section D – Conditional Logic

Objective

Apply at least 3 conditional logic rules: handling missing elements with defaults, flagging incomplete records, and validating completeness.

Condition 1 – Element Not Found: Assign Default

IF Logic: Missing Element Handling

Applied across both websites. Every field that fails extraction (NULL or NA) is immediately assigned a meaningful default value using if_else().

rt(tibble(
  Website              = c("Countries of the World", "Hockey Teams"),
  `IF Condition`       = c("capital is empty → N/A (No Capital)",
                            "ot_losses is NA → 0"),
  `Rows Affected`      = c(
    sum(df_countries_clean$capital == "N/A (No Capital)"),
    sum(df_hockey_clean$ot_losses == 0L)
  )
))
WebsiteIF ConditionRows Affected
Countries of the Worldcapital is empty → N/A (No Capital)1
Hockey Teamsot_losses is NA → 013

Condition 2 and 3 – Flag Incomplete / Complete

all_clean <- list(df_countries_clean, df_hockey_clean)

status_summary <- bind_rows(lapply(seq_along(all_clean), function(i) {
  tbl <- as.data.frame(table(all_clean[[i]]$data_status))
  complete_n   <- if ("Complete"   %in% tbl$Var1) tbl$Freq[tbl$Var1 == "Complete"]   else 0L
  incomplete_n <- if ("Incomplete" %in% tbl$Var1) tbl$Freq[tbl$Var1 == "Incomplete"] else 0L
  tibble(Website = site_names[i], Complete = complete_n, Incomplete = incomplete_n)
}))

rt(status_summary)
WebsiteCompleteIncomplete
Countries of the World191
Hockey Teams200

Extra Condition – Hockey Team Tier (Nested IF)

df_hockey_clean <- df_hockey_clean %>%
  mutate(
    win_rate  = round(wins / pmax(wins + losses + ot_losses, 1), 3),
    team_tier = case_when(
      win_rate >= 0.600 ~ "Elite",
      win_rate >= 0.500 ~ "Competitive",
      win_rate >= 0.400 ~ "Average",
      TRUE              ~ "Struggling"
    )
  )

rt(df_hockey_clean %>% count(team_tier, name = "Teams") %>% arrange(desc(Teams)))
team_tierTeams
Competitive7
Average5
Elite5
Struggling3

Section E – Analytical Thinking

Objective

Analyze the scraping process, compare approaches across the two websites, and extract meaningful insights from the collected data.

Scraping Difficulty Comparison

difficulty_df <- tibble(
  Website    = c("Countries (Static)", "Hockey (Pagination)"),
  Difficulty = c(1, 2),
  Type       = c("Static HTML","Pagination & Form"),
  Color      = c("#99D6B3","#93B4FF")
)

plot_ly(difficulty_df,
        x = ~reorder(Website, Difficulty), y = ~Difficulty,
        type   = "bar",
        marker = list(color = ~Color, line = list(color = "white", width = 1.5)),
        text   = ~paste0("<b>", Type, "</b><br>Difficulty: ", Difficulty, " / 5"),
        hovertemplate = "%{text}<extra></extra>") %>%
  layout(
    title  = list(text = "<b>Scraping Difficulty Score per Website</b><br><sup>1 = Easiest | 5 = Hardest</sup>",
                  font = list(color = "#1A3FA8", size = 15)),
    xaxis  = list(title = "", tickfont = list(size = 12)),
    yaxis  = list(title = "Difficulty Score", range = c(0, 5.5),
                  dtick = 1, showgrid = TRUE, gridcolor = "#E8EEFF"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    showlegend = FALSE, margin = list(l = 40, r = 20, t = 70, b = 40)
  )

Easiest – Countries of the World (Static HTML)

All data is available in the initial GET response. No JavaScript, no pagination, no authentication.

More Complex – Hockey Teams (Pagination)

Requires looping through multiple pages. A while loop stops automatically when no table rows are returned, collecting all 1,558 records across 24 pages.

Approach Comparison

1. Static HTML

Approach: One GET request then parse HTML directly.
Tools: rvest (R).
Pros: Simple, fast, zero extra dependencies.
Cons: Only works for fully static pages.

2. Pagination

Approach: Loop through page parameters until no rows are returned.
Tools: rvest + httr (R).
Pros: Collects entire dataset regardless of page count.
Cons: Slow for large page counts; polite delay required.

Data Insights

hockey_summary <- df_hockey_clean %>%
  group_by(team_tier) %>%
  summarise(
    Avg_Wins  = round(mean(wins, na.rm = TRUE), 1),
    Avg_Goals = round(mean(goals_for, na.rm = TRUE), 1),
    .groups   = "drop"
  )

plot_ly(hockey_summary, x = ~team_tier, y = ~Avg_Wins,
        type = "bar", name = "Avg Wins",
        marker = list(color = "#93B4FF"),
        hovertemplate = "<b>%{x}</b><br>Avg Wins: %{y}<extra></extra>") %>%
  add_trace(y = ~Avg_Goals, name = "Avg Goals For",
            marker = list(color = "#F4A8C0"),
            hovertemplate = "<b>%{x}</b><br>Avg Goals For: %{y}<extra></extra>") %>%
  layout(
    barmode = "group",
    title   = list(text = "<b>Average Wins & Goals For per Team Tier</b>",
                   font = list(color = "#1A3FA8", size = 15)),
    xaxis   = list(title = "Team Tier", categoryorder = "array",
                   categoryarray = c("Struggling","Average","Competitive","Elite")),
    yaxis   = list(title = "Average", showgrid = TRUE, gridcolor = "#E8EEFF"),
    paper_bgcolor = "#F7F9FC", plot_bgcolor = "#F7F9FC",
    legend  = list(orientation = "h", x = 0.3, y = 1.12),
    margin  = list(l = 40, r = 20, t = 70, b = 40)
  )

Key Insights and Recommendations

Minimum 3 Insights

  1. Static HTML is the simplest scraping approach. Countries of the World required only a single GET request and direct HTML parsing — no extra tools or workarounds needed.
  2. Elite hockey teams win through defence, not just offence. Teams in the "Elite" tier consistently allow fewer goals against than lower tiers, suggesting a strong defensive record is a more reliable predictor of win rate.
  3. Pagination datasets can be far larger than expected. The Hockey Teams website produced 1,558 rows across 24 pages — significantly larger than the Countries dataset, highlighting the importance of automated looping for paginated sites.

Minimum 2 Recommendations

  1. Always check the page structure before scraping. For static sites like Countries of the World, a single rvest call suffices. For paginated sites like Hockey Teams, build a loop with an exit condition to avoid infinite loops or missed pages.
  2. Add polite delays in pagination loops. For websites with many pages, inserting a 0.5–1 second pause between requests (Sys.sleep() in R) prevents IP bans, reduces server load, and follows ethical web scraping practices.