We just need to Load necessary libraries for the modern ecosystem and DB connectors
library(readxl) # For Excel
library(haven) # For SAS, SPSS, Stata
library(readr) # For ASCII/Text
library(xml2) # For XML
library(rvest) # For Web scraping
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(DBI) # For Database management systems
library(RSQLite) # Example DB driver
# 1. Data from Excel
excel_data <- read_excel("Burera new.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...5`
## • `` -> `...6`
csv_data <- read.csv("Student_Marks.csv")
#displaying samples from the dataset
head(excel_data, 3)
## # A tibble: 3 × 6
## `District coordinator` ...2 ...3 MAGEZI Celestin - 07883…¹ ...5 ...6
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 No UMURENGE AKAGALI SITE Y'ITORA AMAZ… TELE…
## 2 1 Bungwe Bungwe G.S Regina Pacis Bungwe UWIN… 2507…
## 3 2 Bungwe Bushenya G.S.Bushenya KAMI… 2507…
## # ℹ abbreviated name: ¹​`MAGEZI Celestin - 0788356646`
tail(csv_data, 3)
## number_courses time_study Marks
## 98 4 7.163 41.444
## 99 7 0.309 12.027
## 100 3 6.335 32.357
# 2. Data from Statistical Packages (SAS, SPSS, Stata)
sas7bdat_data <- read_sas("sas7bdatfile.sas7bdat")
stata_data <- read_dta("dtafile.dta")
head(sas7bdat_data,5)
## # A tibble: 5 × 78
## id age race sex mar ed prvarrst curcrim crime yrsent depart relyr
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10 34.3 1 1 2 5 NA 71 72 0.5 3 76
## 2 10 34.3 1 1 2 5 NA 71 72 0.5 3 76
## 3 11 44.2 1 1 2 6 NA NA 52 0.5 3 76
## 4 19 22.1 2 1 1 8 4 NA 1302 3 1 76
## 5 19 22.1 2 1 1 8 4 NA 1302 3 1 76
## # ℹ 66 more variables: relmo <dbl>, expgrp <dbl>, sexint <dbl>, paro <dbl>,
## # higrade <dbl>, age1st <dbl>, numarst <dbl>, numconv <dbl>, emp1 <dbl>,
## # emp2 <dbl>, emp3 <dbl>, emp4 <dbl>, emp5 <dbl>, emp6 <dbl>, emp7 <dbl>,
## # emp8 <dbl>, emp9 <dbl>, emp10 <dbl>, emp11 <dbl>, emp12 <dbl>, emp13 <dbl>,
## # emp19 <dbl>, frstarrmo <dbl>, frstarrday <dbl>, frstarryr <dbl>,
## # frstcharge <dbl>, arr2mo <dbl>, arr2dy <dbl>, arr2yr <dbl>, arr3mo <dbl>,
## # arr3dy <dbl>, arr3yr <dbl>, arr4mo <dbl>, arr4dy <dbl>, arr4yr <dbl>, …
head(stata_data,5)
## # A tibble: 5 × 3
## smoke educ male
## <dbl> <dbl> <dbl>
## 1 1 10 1
## 2 1 11 1
## 3 0 9 1
## 4 0 9 1
## 5 0 8 1
# 3. Data from Text files (ASCII, XML, Webscraping)
text_data <- data <- read.table("textfile.txt", header = TRUE)
# XML
xml_data <- read_xml("xmlfile.xml")
# Webscraping (HTML)
html_page <- read_html("https://testbook.com/question-answer/the-table-below-gives-the-number-of-students-passi--613f28e1fb752a828a256563")
page_table <- html_table(html_nodes(html_page, "table")[[1]])
head(text_data)
## gradrat lenroll rmbrd private stufac csat act
## 1 15 4.007333 4.120 1 11.9 972 20
## 2 NA 6.833032 3.590 0 10.0 961 22
## 3 39 4.488636 4.764 0 9.5 NA NA
## 4 NA 7.057898 5.120 0 13.7 881 20
## 5 40 6.891626 2.550 0 14.3 NA 17
## 6 55 5.187386 3.250 1 32.8 NA 20
head(xml_data)
## $node
## <pointer: 0x0000020509182380>
##
## $doc
## <pointer: 0x0000020509180470>
head(page_table)
## # A tibble: 4 × 3
## X1 X2 X3
## <chr> <chr> <chr>
## 1 Year Girls Boys
## 2 2016 128734 115526
## 3 2017 130567 124313
## 4 2018 129209 122131
# 4. Data from Database Management Systems (SQL, MySQL, Oracle, Access)
db_conn <- dbConnect(RSQLite::SQLite(), dbname = "institutional_db.sqlite")
# Load Dataset A
vulnerability_scans <- read.csv("vulnerability_scans.txt")
# Load Dataset B
patch_logs <- read.csv("patch_logs.txt")
#see the head of our datasets to be aware of nature of data
head(vulnerability_scans)
## Host_IP Port Scan_Date Severity
## 1 192.168.1.10 443 2026-05-20 High
## 2 192.168.1.10 22 2026-05-20 Low
## 3 10.0.0.5 80 2026-05-21 Critical
## 4 172.16.0.8 21 2026-05-21 Medium
## 5 192.168.1.15 3389 2026-05-22 High
## 6 10.0.0.12 25 2026-05-22 Low
head(patch_logs)
## Host_IP Port Scan_Date Patch_Status
## 1 192.168.1.10 443 2026-05-20 Applied
## 2 192.168.1.10 22 2026-05-20 Pending
## 3 10.0.0.5 8080 2026-05-21 Failed
## 4 172.16.0.8 21 2026-05-21 Applied
## 5 192.168.1.15 3389 2026-05-22 Applied
## 6 10.0.0.12 25 2026-05-22 Pending
# Convert dates properly to make them real date in nature
vulnerability_scans$Scan_Date <- as.Date(vulnerability_scans$Scan_Date)
patch_logs$Scan_Date <- as.Date(patch_logs$Scan_Date)
# Merge datasets using compound keys
merged_data <- merge(
vulnerability_scans,
patch_logs,
by = c("Host_IP", "Port", "Scan_Date")
)
# then Display merged dataset
print(merged_data)
## Host_IP Port Scan_Date Severity Patch_Status
## 1 10.0.0.12 25 2026-05-22 Low Pending
## 2 10.0.0.15 1433 2026-05-25 High Failed
## 3 10.0.0.22 8443 2026-05-26 High Applied
## 4 10.0.0.33 22 2026-05-27 Low Pending
## 5 10.0.0.41 80 2026-05-28 Medium Applied
## 6 10.0.0.7 443 2026-05-23 High Pending
## 7 10.0.0.9 53 2026-05-24 Medium Applied
## 8 172.16.0.18 5900 2026-05-25 Critical Applied
## 9 172.16.0.20 3306 2026-05-22 Critical Failed
## 10 172.16.0.25 22 2026-05-23 Low Applied
## 11 172.16.0.40 110 2026-05-24 Low Pending
## 12 172.16.0.60 161 2026-05-26 Medium Failed
## 13 172.16.0.72 443 2026-05-27 High Applied
## 14 172.16.0.8 21 2026-05-21 Medium Applied
## 15 172.16.0.90 445 2026-05-29 Medium Applied
## 16 192.168.1.10 22 2026-05-20 Low Pending
## 17 192.168.1.10 443 2026-05-20 High Applied
## 18 192.168.1.15 3389 2026-05-22 High Applied
## 19 192.168.1.18 8080 2026-05-23 Medium Applied
## 20 192.168.1.30 445 2026-05-24 Critical Failed
## 21 192.168.1.44 995 2026-05-25 Medium Applied
## 22 192.168.1.61 1521 2026-05-27 Critical Applied
## 23 192.168.1.80 3306 2026-05-29 High Applied
## 24 192.168.1.95 21 2026-05-30 Low Pending
# A. BASE R EXTRACTION USING $,
severity_column <- vulnerability_scans$Severity #Extract only the Severity column
print(severity_column) # Display extracted column
## [1] "High" "Low" "Critical" "Medium" "High" "Low"
## [7] "Critical" "Medium" "High" "Low" "Critical" "Medium"
## [13] "Low" "Medium" "High" "Critical" "Low" "High"
## [19] "Medium" "Critical" "Low" "High" "Critical" "Medium"
## [25] "Low" "High" "Critical" "Medium" "Low" "High"
hosts <- vulnerability_scans$Host_IP # Extract Host_IP column
print(hosts) # Display Host_IPs
## [1] "192.168.1.10" "192.168.1.10" "10.0.0.5" "172.16.0.8" "192.168.1.15"
## [6] "10.0.0.12" "172.16.0.20" "192.168.1.18" "10.0.0.7" "172.16.0.25"
## [11] "192.168.1.30" "10.0.0.9" "172.16.0.40" "192.168.1.44" "10.0.0.15"
## [16] "172.16.0.18" "192.168.1.55" "10.0.0.22" "172.16.0.60" "192.168.1.61"
## [21] "10.0.0.33" "172.16.0.72" "192.168.1.75" "10.0.0.41" "172.16.0.85"
## [26] "192.168.1.80" "10.0.0.50" "172.16.0.90" "192.168.1.95" "10.0.0.60"
critical_count <- sum(vulnerability_scans$Severity == "Critical") # Count how many "Critical"
print(critical_count) # Display result
## [1] 7
# B. PIPELINE ARCHITECTURE USING %>% group_by()
library(dplyr) # Load dplyr package
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Group vulnerabilities by Severity level then count total
severity_summary <- vulnerability_scans %>%
group_by(Severity) %>% # Group rows by Severity
summarise(
Total = n() # Count rows in each group
)
# Display grouped summary
print(severity_summary)
## # A tibble: 4 × 2
## Severity Total
## <chr> <int>
## 1 Critical 7
## 2 High 8
## 3 Low 8
## 4 Medium 7
# We can again use the pipe line in the way Show only High and Critical in our dataset
high_risk <- vulnerability_scans %>%
filter(Severity %in% c("High", "Critical")) %>%
select(Host_IP, Port, Severity)
print(high_risk) # Display filtered records
## Host_IP Port Severity
## 1 192.168.1.10 443 High
## 2 10.0.0.5 80 Critical
## 3 192.168.1.15 3389 High
## 4 172.16.0.20 3306 Critical
## 5 10.0.0.7 443 High
## 6 192.168.1.30 445 Critical
## 7 10.0.0.15 1433 High
## 8 172.16.0.18 5900 Critical
## 9 10.0.0.22 8443 High
## 10 192.168.1.61 1521 Critical
## 11 172.16.0.72 443 High
## 12 192.168.1.75 3389 Critical
## 13 192.168.1.80 3306 High
## 14 10.0.0.50 8080 Critical
## 15 10.0.0.60 5900 High
# 4.1. TRACE() DEBUGGING WITHOUT CHANGING THE FUNCTION
# Create a base function to test
calculate_risk <- function(severity_score, assets_exposed) {
return(severity_score * assets_exposed)
}
# Inject debugging code into the function temporarily
trace(
"calculate_risk",
tracer = quote({
cat("\n DEBUG MODE STARTED\n")
cat("Function calculate_risk is running\n")
})
)
## [1] "calculate_risk"
# Run function
test_risk <- calculate_risk(3, 5)
## Tracing calculate_risk(3, 5) on entry
##
## DEBUG MODE STARTED
## Function calculate_risk is running
print(paste("Calculated Risk:", test_risk))
## [1] "Calculated Risk: 15"
untrace("calculate_risk")
# 4.2. RECOVER() INTERACTIVE DEBUGGING ON ERROR
options(error = recover)
calculate_patch_rate <- function(applied) {
# This will cause a fatal error: non-numeric argument to binary operator
rate <- applied + "total_servers"
return(rate)
}
options(error = NULL)
# 3. TRY CATCH ERROR HANDLING (PREVENT SCRIPT CRASH)
result <- tryCatch(
{
calculate_patch_rate(5) # We attempt to run the broken function from above
},
error = function(e) {
cat("\nERROR HANDLED SUCCESSFULLY\n")
cat("R intercepted this error message:", e$message, "\n")
cat("Returning NA instead of crashing.\n")
return(NA) # this Return a safe value to keep the script moving
}
)
##
## ERROR HANDLED SUCCESSFULLY
## R intercepted this error message: non-numeric argument to binary operator
## Returning NA instead of crashing.
print(paste("Result after tryCatch:", result))
## [1] "Result after tryCatch: NA"
cat("\nScript finished executing without crashing!\n")
##
## Script finished executing without crashing!
cyber_summary_engine <- function(sec_dataset) {
# Validate input type
if (!is.data.frame(sec_dataset)) {
stop("Error: Input must be a valid data frame")
}
cat("CYBERSECURITY DATA PROFILER REPORT\n\n")
# Loop through each feature/column
for (feature_name in names(sec_dataset)) {
feature_vector <- sec_dataset[[feature_name]]
cat(">> Feature:", feature_name, "\n")
# NUMERIC FEATURE ANALYSIS
if (is.numeric(feature_vector)) {
cat("Data Type: Numeric Metric\n")
cat("Lowest Value :", min(feature_vector, na.rm = TRUE), "\n")
cat("Highest Value:", max(feature_vector, na.rm = TRUE), "\n")
cat("Average Value:", mean(feature_vector, na.rm = TRUE), "\n")
cat("Mid Value :", median(feature_vector, na.rm = TRUE), "\n\n")
}
# CATEGORICAL FEATURE ANALYSIS
else if (is.factor(feature_vector) || is.character(feature_vector)) {
cat("Data Type: Categorical Indicator\n")
breach_distribution <- table(feature_vector)
print(breach_distribution)
cat("\n")
}
# UNSUPPORTED DATA TYPE
else {
cat("Data Type: Unclassified\n")
cat("No statistical summary available\n\n")
}
}
cat("END OF My REPORT SUMMARY \n")
}
# TEST THE FUNCTION
cyber_summary_engine(vulnerability_scans)
## CYBERSECURITY DATA PROFILER REPORT
##
## >> Feature: Host_IP
## Data Type: Categorical Indicator
## feature_vector
## 10.0.0.12 10.0.0.15 10.0.0.22 10.0.0.33 10.0.0.41 10.0.0.5
## 1 1 1 1 1 1
## 10.0.0.50 10.0.0.60 10.0.0.7 10.0.0.9 172.16.0.18 172.16.0.20
## 1 1 1 1 1 1
## 172.16.0.25 172.16.0.40 172.16.0.60 172.16.0.72 172.16.0.8 172.16.0.85
## 1 1 1 1 1 1
## 172.16.0.90 192.168.1.10 192.168.1.15 192.168.1.18 192.168.1.30 192.168.1.44
## 1 2 1 1 1 1
## 192.168.1.55 192.168.1.61 192.168.1.75 192.168.1.80 192.168.1.95
## 1 1 1 1 1
##
## >> Feature: Port
## Data Type: Numeric Metric
## Lowest Value : 21
## Highest Value: 8443
## Average Value: 2156.1
## Mid Value : 444
##
## >> Feature: Scan_Date
## Data Type: Unclassified
## No statistical summary available
##
## >> Feature: Severity
## Data Type: Categorical Indicator
## feature_vector
## Critical High Low Medium
## 7 8 8 7
##
## END OF My REPORT SUMMARY
# let keep Using vulnerability_scans dataset
sec_records <- vulnerability_scans
# 1. lapply() returns a LIST: Get unique values per column
unique_tracker <- lapply(sec_records, function(col_data) {
unique(col_data)
})
print(unique_tracker)
## $Host_IP
## [1] "192.168.1.10" "10.0.0.5" "172.16.0.8" "192.168.1.15" "10.0.0.12"
## [6] "172.16.0.20" "192.168.1.18" "10.0.0.7" "172.16.0.25" "192.168.1.30"
## [11] "10.0.0.9" "172.16.0.40" "192.168.1.44" "10.0.0.15" "172.16.0.18"
## [16] "192.168.1.55" "10.0.0.22" "172.16.0.60" "192.168.1.61" "10.0.0.33"
## [21] "172.16.0.72" "192.168.1.75" "10.0.0.41" "172.16.0.85" "192.168.1.80"
## [26] "10.0.0.50" "172.16.0.90" "192.168.1.95" "10.0.0.60"
##
## $Port
## [1] 443 22 80 21 3389 25 3306 8080 445 53 110 995 1433 5900 8443
## [16] 161 1521
##
## $Scan_Date
## [1] "2026-05-20" "2026-05-21" "2026-05-22" "2026-05-23" "2026-05-24"
## [6] "2026-05-25" "2026-05-26" "2026-05-27" "2026-05-28" "2026-05-29"
## [11] "2026-05-30"
##
## $Severity
## [1] "High" "Low" "Critical" "Medium"
# 2. sapply() simplifies output to vector/matrix : Count number of unique values per column
uniqueness_score <- sapply(sec_records, function(col_data) {
length(unique(col_data))
})
print(uniqueness_score)
## Host_IP Port Scan_Date Severity
## 29 17 11 4
# 3. vapply() safer version of sapply (fixed output type): Count NA values per column (expected numeric output)
na_audit <- vapply(sec_records, function(col_data) {
sum(is.na(col_data))
}, FUN.VALUE = numeric(1))
print(na_audit)
## Host_IP Port Scan_Date Severity
## 0 0 0 0
# 4. mapply() multiple inputs at the same time: Compute risk score from Severity + Port weight
# Convert severity into numeric weight
severity_weight <- function(level) {
if (level == "Low") return(1)
else if (level == "Medium") return(2)
else if (level == "High") return(3)
else return(4) # Critical
}
risk_scores <- mapply(
function(sev, port) {
severity_weight(sev) * port
},
sec_records$Severity,
sec_records$Port
)
print(risk_scores)
## High Low Critical Medium High Low Critical Medium
## 1329 22 320 42 10167 25 13224 16160
## High Low Critical Medium Low Medium High Critical
## 1329 22 1780 106 110 1990 4299 23600
## Low High Medium Critical Low High Critical Medium
## 8080 25329 322 6084 22 1329 13556 160
## Low High Critical Medium Low High
## 25 9918 32320 890 21 17700
library(ggplot2)
library(dplyr)
# let again work on our dataset of vulnerablitysans
sec_records <- vulnerability_scans
# 1. geom_bar(): automatic counting
ggplot(sec_records, aes(x = Severity)) +
geom_bar(fill = "steelblue") +
labs(
title = "Frequency of Vulnerability Severity Levels 'GEOMBAR'",
x = "Severity",
y = "Count"
)
# 2. geom_col() requires precomputed values, FIXED aggregation method
severity_counts <- sec_records %>%
group_by(Severity) %>%
summarise(Total = n())
ggplot(severity_counts, aes(x = Severity, y = Total)) +
geom_col(fill = "darkred") +
labs(
title = "Precomputed Severity Distribution 'GEOMCOL'",
x = "Severity",
y = "Total Count"
)
# 3. geom_smooth() trend analysis; FIXED mapping issue
ggplot(sec_records, aes(x = Port, y = as.numeric(factor(Severity)))) +
geom_point(color = "black") +
geom_smooth(method = "loess") +
labs(
title = "Trend of Severity Across Ports 'GEOMSMOOTH'",
x = "Port",
y = "Severity (numeric encoding)"
)
## `geom_smooth()` using formula = 'y ~ x'