QUESTION 1: DATA INGESTION ARCHITECTURE

Demonstrating R’s capability to interface with multiple data storage formats

We just need to Load necessary libraries for the modern ecosystem and DB connectors

library(readxl)   # For Excel
library(haven)    # For SAS, SPSS, Stata
library(readr)    # For ASCII/Text
library(xml2)     # For XML
library(rvest)    # For Web scraping
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(DBI)      # For Database management systems
library(RSQLite)  # Example DB driver
# 1. Data from Excel
excel_data <- read_excel("Burera new.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...5`
## • `` -> `...6`
csv_data <- read.csv("Student_Marks.csv")
#displaying samples from the dataset
head(excel_data, 3)
## # A tibble: 3 × 6
##   `District coordinator` ...2     ...3     MAGEZI Celestin - 07883…¹ ...5  ...6 
##   <chr>                  <chr>    <chr>    <chr>                     <chr> <chr>
## 1 No                     UMURENGE AKAGALI  SITE Y'ITORA              AMAZ… TELE…
## 2 1                      Bungwe   Bungwe   G.S Regina Pacis Bungwe   UWIN… 2507…
## 3 2                      Bungwe   Bushenya G.S.Bushenya              KAMI… 2507…
## # ℹ abbreviated name: ¹​`MAGEZI Celestin - 0788356646`
tail(csv_data, 3)
##     number_courses time_study  Marks
## 98               4      7.163 41.444
## 99               7      0.309 12.027
## 100              3      6.335 32.357
# 2. Data from Statistical Packages (SAS, SPSS, Stata)
sas7bdat_data   <- read_sas("sas7bdatfile.sas7bdat")
stata_data <- read_dta("dtafile.dta")
head(sas7bdat_data,5)
## # A tibble: 5 × 78
##      id   age  race   sex   mar    ed prvarrst curcrim crime yrsent depart relyr
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>   <dbl> <dbl>  <dbl>  <dbl> <dbl>
## 1    10  34.3     1     1     2     5       NA      71    72    0.5      3    76
## 2    10  34.3     1     1     2     5       NA      71    72    0.5      3    76
## 3    11  44.2     1     1     2     6       NA      NA    52    0.5      3    76
## 4    19  22.1     2     1     1     8        4      NA  1302    3        1    76
## 5    19  22.1     2     1     1     8        4      NA  1302    3        1    76
## # ℹ 66 more variables: relmo <dbl>, expgrp <dbl>, sexint <dbl>, paro <dbl>,
## #   higrade <dbl>, age1st <dbl>, numarst <dbl>, numconv <dbl>, emp1 <dbl>,
## #   emp2 <dbl>, emp3 <dbl>, emp4 <dbl>, emp5 <dbl>, emp6 <dbl>, emp7 <dbl>,
## #   emp8 <dbl>, emp9 <dbl>, emp10 <dbl>, emp11 <dbl>, emp12 <dbl>, emp13 <dbl>,
## #   emp19 <dbl>, frstarrmo <dbl>, frstarrday <dbl>, frstarryr <dbl>,
## #   frstcharge <dbl>, arr2mo <dbl>, arr2dy <dbl>, arr2yr <dbl>, arr3mo <dbl>,
## #   arr3dy <dbl>, arr3yr <dbl>, arr4mo <dbl>, arr4dy <dbl>, arr4yr <dbl>, …
head(stata_data,5)
## # A tibble: 5 × 3
##   smoke  educ  male
##   <dbl> <dbl> <dbl>
## 1     1    10     1
## 2     1    11     1
## 3     0     9     1
## 4     0     9     1
## 5     0     8     1
# 3. Data from Text files (ASCII, XML, Webscraping)
text_data <- data <- read.table("textfile.txt", header = TRUE)

# XML 
xml_data <- read_xml("xmlfile.xml")

# Webscraping (HTML)
html_page <- read_html("https://testbook.com/question-answer/the-table-below-gives-the-number-of-students-passi--613f28e1fb752a828a256563")
page_table <- html_table(html_nodes(html_page, "table")[[1]])

head(text_data)
##   gradrat  lenroll rmbrd private stufac csat act
## 1      15 4.007333 4.120       1   11.9  972  20
## 2      NA 6.833032 3.590       0   10.0  961  22
## 3      39 4.488636 4.764       0    9.5   NA  NA
## 4      NA 7.057898 5.120       0   13.7  881  20
## 5      40 6.891626 2.550       0   14.3   NA  17
## 6      55 5.187386 3.250       1   32.8   NA  20
head(xml_data)
## $node
## <pointer: 0x0000020509182380>
## 
## $doc
## <pointer: 0x0000020509180470>
head(page_table)
## # A tibble: 4 × 3
##   X1    X2     X3    
##   <chr> <chr>  <chr> 
## 1 Year  Girls  Boys  
## 2 2016  128734 115526
## 3 2017  130567 124313
## 4 2018  129209 122131
# 4. Data from Database Management Systems (SQL, MySQL, Oracle, Access)
db_conn <- dbConnect(RSQLite::SQLite(), dbname = "institutional_db.sqlite")

QUESTION 2: MERGING DATASETS ON MULTIPLE VARIABLES (COMPOUND KEYS)

dplyr joins and Base R merge functions

# Load Dataset A
vulnerability_scans <- read.csv("vulnerability_scans.txt")
# Load Dataset B
patch_logs <- read.csv("patch_logs.txt")

#see the head of our datasets to be aware of nature of data
head(vulnerability_scans)
##        Host_IP Port  Scan_Date Severity
## 1 192.168.1.10  443 2026-05-20     High
## 2 192.168.1.10   22 2026-05-20      Low
## 3     10.0.0.5   80 2026-05-21 Critical
## 4   172.16.0.8   21 2026-05-21   Medium
## 5 192.168.1.15 3389 2026-05-22     High
## 6    10.0.0.12   25 2026-05-22      Low
head(patch_logs)
##        Host_IP Port  Scan_Date Patch_Status
## 1 192.168.1.10  443 2026-05-20      Applied
## 2 192.168.1.10   22 2026-05-20      Pending
## 3     10.0.0.5 8080 2026-05-21       Failed
## 4   172.16.0.8   21 2026-05-21      Applied
## 5 192.168.1.15 3389 2026-05-22      Applied
## 6    10.0.0.12   25 2026-05-22      Pending
# Convert dates properly to make them real date in nature
vulnerability_scans$Scan_Date <- as.Date(vulnerability_scans$Scan_Date)
patch_logs$Scan_Date <- as.Date(patch_logs$Scan_Date)

# Merge datasets using compound keys
merged_data <- merge(
  vulnerability_scans,
  patch_logs,
  by = c("Host_IP", "Port", "Scan_Date")
)

# then Display merged dataset
print(merged_data)
##         Host_IP Port  Scan_Date Severity Patch_Status
## 1     10.0.0.12   25 2026-05-22      Low      Pending
## 2     10.0.0.15 1433 2026-05-25     High       Failed
## 3     10.0.0.22 8443 2026-05-26     High      Applied
## 4     10.0.0.33   22 2026-05-27      Low      Pending
## 5     10.0.0.41   80 2026-05-28   Medium      Applied
## 6      10.0.0.7  443 2026-05-23     High      Pending
## 7      10.0.0.9   53 2026-05-24   Medium      Applied
## 8   172.16.0.18 5900 2026-05-25 Critical      Applied
## 9   172.16.0.20 3306 2026-05-22 Critical       Failed
## 10  172.16.0.25   22 2026-05-23      Low      Applied
## 11  172.16.0.40  110 2026-05-24      Low      Pending
## 12  172.16.0.60  161 2026-05-26   Medium       Failed
## 13  172.16.0.72  443 2026-05-27     High      Applied
## 14   172.16.0.8   21 2026-05-21   Medium      Applied
## 15  172.16.0.90  445 2026-05-29   Medium      Applied
## 16 192.168.1.10   22 2026-05-20      Low      Pending
## 17 192.168.1.10  443 2026-05-20     High      Applied
## 18 192.168.1.15 3389 2026-05-22     High      Applied
## 19 192.168.1.18 8080 2026-05-23   Medium      Applied
## 20 192.168.1.30  445 2026-05-24 Critical       Failed
## 21 192.168.1.44  995 2026-05-25   Medium      Applied
## 22 192.168.1.61 1521 2026-05-27 Critical      Applied
## 23 192.168.1.80 3306 2026-05-29     High      Applied
## 24 192.168.1.95   21 2026-05-30      Low      Pending

QUESTION 3: BASE R EXTRACTION ($) vs. PIPELINE ARCHITECTURE (%>%, group_by)

Demonstrating the transition from vector extraction to chained operations

# A. BASE R EXTRACTION USING $, 
severity_column <- vulnerability_scans$Severity  #Extract only the Severity column
print(severity_column)    # Display extracted column
##  [1] "High"     "Low"      "Critical" "Medium"   "High"     "Low"     
##  [7] "Critical" "Medium"   "High"     "Low"      "Critical" "Medium"  
## [13] "Low"      "Medium"   "High"     "Critical" "Low"      "High"    
## [19] "Medium"   "Critical" "Low"      "High"     "Critical" "Medium"  
## [25] "Low"      "High"     "Critical" "Medium"   "Low"      "High"
hosts <- vulnerability_scans$Host_IP # Extract Host_IP column
print(hosts)  # Display Host_IPs
##  [1] "192.168.1.10" "192.168.1.10" "10.0.0.5"     "172.16.0.8"   "192.168.1.15"
##  [6] "10.0.0.12"    "172.16.0.20"  "192.168.1.18" "10.0.0.7"     "172.16.0.25" 
## [11] "192.168.1.30" "10.0.0.9"     "172.16.0.40"  "192.168.1.44" "10.0.0.15"   
## [16] "172.16.0.18"  "192.168.1.55" "10.0.0.22"    "172.16.0.60"  "192.168.1.61"
## [21] "10.0.0.33"    "172.16.0.72"  "192.168.1.75" "10.0.0.41"    "172.16.0.85" 
## [26] "192.168.1.80" "10.0.0.50"    "172.16.0.90"  "192.168.1.95" "10.0.0.60"
critical_count <- sum(vulnerability_scans$Severity == "Critical") # Count how many "Critical"
print(critical_count) # Display result
## [1] 7
# B. PIPELINE ARCHITECTURE USING %>% group_by()
library(dplyr) # Load dplyr package
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group vulnerabilities by Severity level then count total
severity_summary <- vulnerability_scans %>%
    group_by(Severity) %>%      # Group rows by Severity
    summarise(
    Total = n()               # Count rows in each group
  )

# Display grouped summary
print(severity_summary)
## # A tibble: 4 × 2
##   Severity Total
##   <chr>    <int>
## 1 Critical     7
## 2 High         8
## 3 Low          8
## 4 Medium       7
# We can again use the pipe line in the way Show only High and Critical in our dataset

high_risk <- vulnerability_scans %>%
    filter(Severity %in% c("High", "Critical")) %>%
    select(Host_IP, Port, Severity)

print(high_risk) # Display filtered records
##         Host_IP Port Severity
## 1  192.168.1.10  443     High
## 2      10.0.0.5   80 Critical
## 3  192.168.1.15 3389     High
## 4   172.16.0.20 3306 Critical
## 5      10.0.0.7  443     High
## 6  192.168.1.30  445 Critical
## 7     10.0.0.15 1433     High
## 8   172.16.0.18 5900 Critical
## 9     10.0.0.22 8443     High
## 10 192.168.1.61 1521 Critical
## 11  172.16.0.72  443     High
## 12 192.168.1.75 3389 Critical
## 13 192.168.1.80 3306     High
## 14    10.0.0.50 8080 Critical
## 15    10.0.0.60 5900     High

QUESTION 4: ADVANCED DEBUGGING (trace and recover)

Demonstrating interactive debugging and preserving the proper execution

# 4.1. TRACE() DEBUGGING WITHOUT CHANGING THE FUNCTION

# Create a base function to test
calculate_risk <- function(severity_score, assets_exposed) {
  return(severity_score * assets_exposed)
}
# Inject debugging code into the function temporarily
trace(
  "calculate_risk",
  tracer = quote({
    cat("\n DEBUG MODE STARTED\n")
    cat("Function calculate_risk is running\n")
  })
)
## [1] "calculate_risk"
# Run function
test_risk <- calculate_risk(3, 5)
## Tracing calculate_risk(3, 5) on entry 
## 
##  DEBUG MODE STARTED
## Function calculate_risk is running
print(paste("Calculated Risk:", test_risk))
## [1] "Calculated Risk: 15"
untrace("calculate_risk")
# 4.2. RECOVER() INTERACTIVE DEBUGGING ON ERROR
options(error = recover)
calculate_patch_rate <- function(applied) {
    # This will cause a fatal error: non-numeric argument to binary operator
  rate <- applied + "total_servers" 
    return(rate)
}
options(error = NULL)

# 3. TRY CATCH ERROR HANDLING (PREVENT SCRIPT CRASH)
result <- tryCatch(
  {
    calculate_patch_rate(5) # We attempt to run the broken function from above
  },
  error = function(e) {
    cat("\nERROR HANDLED SUCCESSFULLY\n")
    cat("R intercepted this error message:", e$message, "\n")
    cat("Returning NA instead of crashing.\n")
    
    return(NA) # this Return a safe value to keep the script moving
  }
)
## 
## ERROR HANDLED SUCCESSFULLY
## R intercepted this error message: non-numeric argument to binary operator 
## Returning NA instead of crashing.
print(paste("Result after tryCatch:", result))
## [1] "Result after tryCatch: NA"
cat("\nScript finished executing without crashing!\n")
## 
## Script finished executing without crashing!

QUESTION 5: CUSTOM SUMMARY FUNCTION

Replicating base R’s summary() using custom type-checking and iteration

cyber_summary_engine <- function(sec_dataset) {
  # Validate input type
  if (!is.data.frame(sec_dataset)) {
    stop("Error: Input must be a valid data frame")
  }
  cat("CYBERSECURITY DATA PROFILER REPORT\n\n")
  
  # Loop through each feature/column
  for (feature_name in names(sec_dataset)) {
    feature_vector <- sec_dataset[[feature_name]]
    cat(">> Feature:", feature_name, "\n")
    
    # NUMERIC FEATURE ANALYSIS
    
    if (is.numeric(feature_vector)) {
      cat("Data Type: Numeric Metric\n")
      cat("Lowest Value :", min(feature_vector, na.rm = TRUE), "\n")
      cat("Highest Value:", max(feature_vector, na.rm = TRUE), "\n")
      cat("Average Value:", mean(feature_vector, na.rm = TRUE), "\n")
      cat("Mid Value    :", median(feature_vector, na.rm = TRUE), "\n\n")
     }
    
    # CATEGORICAL FEATURE ANALYSIS
    else if (is.factor(feature_vector) || is.character(feature_vector)) {
      cat("Data Type: Categorical Indicator\n")
      breach_distribution <- table(feature_vector)
      print(breach_distribution)
      cat("\n")
    }
    # UNSUPPORTED DATA TYPE
    else {
      cat("Data Type: Unclassified\n")
      cat("No statistical summary available\n\n")
    }
  }
  
  cat("END OF My REPORT SUMMARY \n")
}
# TEST THE FUNCTION
cyber_summary_engine(vulnerability_scans)
## CYBERSECURITY DATA PROFILER REPORT
## 
## >> Feature: Host_IP 
## Data Type: Categorical Indicator
## feature_vector
##    10.0.0.12    10.0.0.15    10.0.0.22    10.0.0.33    10.0.0.41     10.0.0.5 
##            1            1            1            1            1            1 
##    10.0.0.50    10.0.0.60     10.0.0.7     10.0.0.9  172.16.0.18  172.16.0.20 
##            1            1            1            1            1            1 
##  172.16.0.25  172.16.0.40  172.16.0.60  172.16.0.72   172.16.0.8  172.16.0.85 
##            1            1            1            1            1            1 
##  172.16.0.90 192.168.1.10 192.168.1.15 192.168.1.18 192.168.1.30 192.168.1.44 
##            1            2            1            1            1            1 
## 192.168.1.55 192.168.1.61 192.168.1.75 192.168.1.80 192.168.1.95 
##            1            1            1            1            1 
## 
## >> Feature: Port 
## Data Type: Numeric Metric
## Lowest Value : 21 
## Highest Value: 8443 
## Average Value: 2156.1 
## Mid Value    : 444 
## 
## >> Feature: Scan_Date 
## Data Type: Unclassified
## No statistical summary available
## 
## >> Feature: Severity 
## Data Type: Categorical Indicator
## feature_vector
## Critical     High      Low   Medium 
##        7        8        8        7 
## 
## END OF My REPORT SUMMARY

QUESTION 6: FUNCTIONAL PROGRAMMING WITH THE APPLY FAMILY

Demonstrating lapply, sapply, vapply, and mapply on a real-world dataset

# let keep Using vulnerability_scans dataset
sec_records <- vulnerability_scans

# 1. lapply() returns a LIST: Get unique values per column
unique_tracker <- lapply(sec_records, function(col_data) {
  unique(col_data)
})
print(unique_tracker)
## $Host_IP
##  [1] "192.168.1.10" "10.0.0.5"     "172.16.0.8"   "192.168.1.15" "10.0.0.12"   
##  [6] "172.16.0.20"  "192.168.1.18" "10.0.0.7"     "172.16.0.25"  "192.168.1.30"
## [11] "10.0.0.9"     "172.16.0.40"  "192.168.1.44" "10.0.0.15"    "172.16.0.18" 
## [16] "192.168.1.55" "10.0.0.22"    "172.16.0.60"  "192.168.1.61" "10.0.0.33"   
## [21] "172.16.0.72"  "192.168.1.75" "10.0.0.41"    "172.16.0.85"  "192.168.1.80"
## [26] "10.0.0.50"    "172.16.0.90"  "192.168.1.95" "10.0.0.60"   
## 
## $Port
##  [1]  443   22   80   21 3389   25 3306 8080  445   53  110  995 1433 5900 8443
## [16]  161 1521
## 
## $Scan_Date
##  [1] "2026-05-20" "2026-05-21" "2026-05-22" "2026-05-23" "2026-05-24"
##  [6] "2026-05-25" "2026-05-26" "2026-05-27" "2026-05-28" "2026-05-29"
## [11] "2026-05-30"
## 
## $Severity
## [1] "High"     "Low"      "Critical" "Medium"
# 2. sapply() simplifies output to vector/matrix : Count number of unique values per column
uniqueness_score <- sapply(sec_records, function(col_data) {
  length(unique(col_data))
})
print(uniqueness_score)
##   Host_IP      Port Scan_Date  Severity 
##        29        17        11         4
# 3. vapply() safer version of sapply (fixed output type): Count NA values per column (expected numeric output)
na_audit <- vapply(sec_records, function(col_data) {
  sum(is.na(col_data))
}, FUN.VALUE = numeric(1))
print(na_audit)
##   Host_IP      Port Scan_Date  Severity 
##         0         0         0         0
# 4. mapply() multiple inputs at the same time: Compute risk score from Severity + Port weight

# Convert severity into numeric weight
severity_weight <- function(level) {
  if (level == "Low") return(1)
  else if (level == "Medium") return(2)
  else if (level == "High") return(3)
  else return(4)  # Critical
}

risk_scores <- mapply(
  function(sev, port) {
    severity_weight(sev) * port
  },
  sec_records$Severity,
  sec_records$Port
)

print(risk_scores)
##     High      Low Critical   Medium     High      Low Critical   Medium 
##     1329       22      320       42    10167       25    13224    16160 
##     High      Low Critical   Medium      Low   Medium     High Critical 
##     1329       22     1780      106      110     1990     4299    23600 
##      Low     High   Medium Critical      Low     High Critical   Medium 
##     8080    25329      322     6084       22     1329    13556      160 
##      Low     High Critical   Medium      Low     High 
##       25     9918    32320      890       21    17700

QUESTION 7: DATA VISUALIZATION WITH GGPLOT2

Demonstrating the conceptual differences between bar, col, and smooth geoms

library(ggplot2)
library(dplyr)
# let again work on our dataset of vulnerablitysans
sec_records <- vulnerability_scans

# 1. geom_bar(): automatic counting
ggplot(sec_records, aes(x = Severity)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Frequency of Vulnerability Severity Levels 'GEOMBAR'",
    x = "Severity",
    y = "Count"
  )

# 2. geom_col() requires precomputed values, FIXED aggregation method
severity_counts <- sec_records %>%
  group_by(Severity) %>%
  summarise(Total = n())
ggplot(severity_counts, aes(x = Severity, y = Total)) +
  geom_col(fill = "darkred") +
  labs(
    title = "Precomputed Severity Distribution 'GEOMCOL'",
    x = "Severity",
    y = "Total Count"
  )

# 3. geom_smooth() trend analysis; FIXED mapping issue
ggplot(sec_records, aes(x = Port, y = as.numeric(factor(Severity)))) +
  geom_point(color = "black") +
  geom_smooth(method = "loess") +
  labs(
    title = "Trend of Severity Across Ports 'GEOMSMOOTH'",
    x = "Port",
    y = "Severity (numeric encoding)"
  )
## `geom_smooth()` using formula = 'y ~ x'