Assignement 1: Importing Data From Different Sources

Importing data from different sources allows analysts to combine and work with information stored in various formats and systems for analysis and reporting. In this assignment, We will imported data from a created data frame, a CSV file, and a PostgreSQL database to demonstrate different data access methods in R programming language.

Creating and consuming a dataframe with R

#1 Creating a dataset with 100,000 rows (simulating "big data")
n <- 100000  

# Generate variables
data <- data.frame(
  id = 1:n,   # Unique ID for each row
  
  age = sample(18:70, n, replace = TRUE),  
  # Random ages between 18 and 70
  
  salary = rnorm(n, mean = 50000, sd = 15000),  
  # Normally distributed salaries
  
  department = sample(c("IT", "HR", "Finance", "Sales"), n, replace = TRUE),  
  # Random department categories
  
  experience = sample(0:40, n, replace = TRUE)  
  # Years of experience
)
kable(head(data))
id age salary department experience
1 48 35105.06 Finance 17
2 58 20879.84 Finance 20
3 25 39407.48 Sales 26
4 48 42250.00 HR 9
5 35 37878.72 HR 9
6 24 48283.05 HR 6
kable(tail(data))
id age salary department experience
99995 99995 68 54057.77 HR 15
99996 99996 62 60363.95 IT 15
99997 99997 62 56260.00 IT 35
99998 99998 60 53618.39 HR 6
99999 99999 35 34713.28 IT 8
100000 100000 30 24685.59 Finance 2

Importing Data From a Database

Importing from a database (PostgreSQL) will enables direct access to structured, live data for efficient querying and analysis.

Before working with a database, all required libraries and packages must be imported and loaded to establish and manage the connection.

#install.packages("DBI")
#install.packages("RSQLite")
#install.packages("dplyr")

# Load packages into memory
library(DBI)
library(RSQLite)
library(dplyr)
library(knitr)
library(ggplot2)


# Create PostgreSQL connection

conn <- dbConnect(
    RPostgres::Postgres(),
    dbname = "system_audit_db",
    host = "localhost",
    port = 5432,
    user = "postgres",
    password = "arthur"
)

# Read Data from PostgreSQL

logs <- dbGetQuery(
    conn,  "SELECT * FROM audit_logs"
)

head(logs)
##   log_id user_id department          login_time  status access_level
## 1      1    1001         IT 2026-05-07 13:20:49  FAILED         USER
## 2      2    1002    Finance 2026-05-07 13:20:49 SUCCESS        ADMIN
## 3      3    1003      Audit 2026-05-07 13:20:49  FAILED         USER
## 4      4    1001         IT 2026-05-07 13:20:49  FAILED         USER
## 5      5    1001         IT 2026-05-07 13:20:49  FAILED         USER
##     ip_address
## 1 192.168.1.10
## 2     10.0.0.5
## 3   172.16.0.8
## 4 192.168.1.10
## 5 192.168.1.10
tail(logs)
##   log_id user_id department          login_time  status access_level
## 1      1    1001         IT 2026-05-07 13:20:49  FAILED         USER
## 2      2    1002    Finance 2026-05-07 13:20:49 SUCCESS        ADMIN
## 3      3    1003      Audit 2026-05-07 13:20:49  FAILED         USER
## 4      4    1001         IT 2026-05-07 13:20:49  FAILED         USER
## 5      5    1001         IT 2026-05-07 13:20:49  FAILED         USER
##     ip_address
## 1 192.168.1.10
## 2     10.0.0.5
## 3   172.16.0.8
## 4 192.168.1.10
## 5 192.168.1.10

Importing Data From a XML file

#install.packages("xml2")
library(xml2)

#Reading the XML file
xml_data <- read_xml("C:/Users/abutera/Documents/data.xml")

#Viewing the structure
xml_data
## {xml_document}
## <employees>
## [1] <employee>\n  <id>1</id>\n  <name>Arthur</name>\n  <role>Analyst</role>\n ...
## [2] <employee>\n  <id>2</id>\n  <name>Jane</name>\n  <role>Manager</role>\n</ ...
#Extracting the values into a dataframe
employees <- xml_find_all(xml_data, "//employee")

ids <- xml_text(xml_find_all(employees, "id"))
names <- xml_text(xml_find_all(employees, "name"))
roles <- xml_text(xml_find_all(employees, "role"))

df <- data.frame(id = ids, name = names, role = roles)

kable(df)
id name role
1 Arthur Analyst
2 Jane Manager

Importing Data From a statistical Package

library(haven)
data <- read_sav("C:/Users/abutera/Documents/audit_logs.sav")
kable(data)
user_id department status login_time ip_address
U001 IT SUCCESS 08:00 192.168.1.10
U002 Finance FAILED 08:10 10.0.0.5
U003 HR FAILED 09:00 172.16.0.8
U001 IT FAILED 09:15 192.168.1.10
U004 Finance SUCCESS 10:00 10.0.0.9
U002 FAILED 10:05 10.0.0.5
U005 IT FAILED 11:00 172.16.0.3
U003 HR SUCCESS 11:20 192.168.1.11
U001 IT FAILED 12:00 192.168.1.10

ASSIGNMENT 2: MERGING 2 DATASETS(LOGS AND EMPLOYEES) USING DIFFERENT VARIABLES

First loading a second dataset (Employees) from system_audit_db database to merge with logs dataset

emp <- dbGetQuery(conn, "SELECT * FROM employees")

# First rows
kable(logs)
log_id user_id department login_time status access_level ip_address
1 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10
2 1002 Finance 2026-05-07 13:20:49 SUCCESS ADMIN 10.0.0.5
3 1003 Audit 2026-05-07 13:20:49 FAILED USER 172.16.0.8
4 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10
5 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10
# Summary
summary(logs)
##      log_id     user_id         department   login_time                 
##  Min.   :1   Min.   :1001   Length   :5    Min.   :2026-05-07 13:20:49  
##  1st Qu.:2   1st Qu.:1001   N.unique :3    1st Qu.:2026-05-07 13:20:49  
##  Median :3   Median :1001   N.blank  :0    Median :2026-05-07 13:20:49  
##  Mean   :3   Mean   :1002   Min.nchar:2    Mean   :2026-05-07 13:20:49  
##  3rd Qu.:4   3rd Qu.:1002   Max.nchar:7    3rd Qu.:2026-05-07 13:20:49  
##  Max.   :5   Max.   :1003                  Max.   :2026-05-07 13:20:49  
##        status     access_level     ip_address
##  Length   :5   Length   :5     Length   : 5  
##  N.unique :2   N.unique :2     N.unique : 3  
##  N.blank  :0   N.blank  :0     N.blank  : 0  
##  Min.nchar:6   Min.nchar:4     Min.nchar: 8  
##  Max.nchar:7   Max.nchar:5     Max.nchar:12  
## 
# viewing all database table data
kable(logs)
log_id user_id department login_time status access_level ip_address
1 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10
2 1002 Finance 2026-05-07 13:20:49 SUCCESS ADMIN 10.0.0.5
3 1003 Audit 2026-05-07 13:20:49 FAILED USER 172.16.0.8
4 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10
5 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10

Performing an inner join between logs and emp using user_id as the matching key.

glimpse(logs)
## Rows: 5
## Columns: 7
## $ log_id       <int> 1, 2, 3, 4, 5
## $ user_id      <int> 1001, 1002, 1003, 1001, 1001
## $ department   <chr> "IT", "Finance", "Audit", "IT", "IT"
## $ login_time   <dttm> 2026-05-07 13:20:49, 2026-05-07 13:20:49, 2026-05-07 13:2…
## $ status       <chr> "FAILED", "SUCCESS", "FAILED", "FAILED", "FAILED"
## $ access_level <chr> "USER", "ADMIN", "USER", "USER", "USER"
## $ ip_address   <chr> "192.168.1.10", "10.0.0.5", "172.16.0.8", "192.168.1.10",…
glimpse(emp)
## Rows: 5
## Columns: 8
## $ employee_id  <int> 1, 2, 3, 4, 5
## $ user_id      <int> 1001, 1002, 1003, 1004, 1005
## $ full_name    <chr> "Alice Nkusi", "Bob Habimana", "Claire Uwera", "Denis Mug…
## $ department   <chr> "IT", "Finance", "Audit", "HR", "Marketing"
## $ job_title    <chr> "Sysadmin", "Analyst", "Auditor", "HR Officer", "Designer"
## $ access_level <chr> "USER", "ADMIN", "USER", "USER", "USER"
## $ hire_date    <date> 2021-03-15, 2019-07-01, 2022-01-20, 2020-09-10, 2023-05-0…
## $ active       <lgl> TRUE, TRUE, TRUE, TRUE, FALSE
# SQL: INNER JOIN employees e ON a.user_id = e.user_id
inner_one <- inner_join(
  logs,   # left table  (audit_logs)
  emp,    # right table (employees)
  by = "user_id"   # the single matching column
)

kable(inner_one)
log_id user_id department.x login_time status access_level.x ip_address employee_id full_name department.y job_title access_level.y hire_date active
1 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10 1 Alice Nkusi IT Sysadmin USER 2021-03-15 TRUE
2 1002 Finance 2026-05-07 13:20:49 SUCCESS ADMIN 10.0.0.5 2 Bob Habimana Finance Analyst ADMIN 2019-07-01 TRUE
3 1003 Audit 2026-05-07 13:20:49 FAILED USER 172.16.0.8 3 Claire Uwera Audit Auditor USER 2022-01-20 TRUE
4 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10 1 Alice Nkusi IT Sysadmin USER 2021-03-15 TRUE
5 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10 1 Alice Nkusi IT Sysadmin USER 2021-03-15 TRUE

Merging two datasets by matching records where both user_id and department are the same in each table.

# SQL equivalent: ON a.user_id = e.user_id AND a.department = e.department (using 2 variables)
inner_two <- inner_join(
  logs,
  emp,
  by = c("user_id", "department")  # BOTH must match simultaneously
)
kable(inner_two)
log_id user_id department login_time status access_level.x ip_address employee_id full_name job_title access_level.y hire_date active
1 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10 1 Alice Nkusi Sysadmin USER 2021-03-15 TRUE
2 1002 Finance 2026-05-07 13:20:49 SUCCESS ADMIN 10.0.0.5 2 Bob Habimana Analyst ADMIN 2019-07-01 TRUE
3 1003 Audit 2026-05-07 13:20:49 FAILED USER 172.16.0.8 3 Claire Uwera Auditor USER 2022-01-20 TRUE
4 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10 1 Alice Nkusi Sysadmin USER 2021-03-15 TRUE
5 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10 1 Alice Nkusi Sysadmin USER 2021-03-15 TRUE

Performing an inner join to merge logs and emp datasets by matching user_id, department, and access_level.

# SQL equivalent: ON user_id AND department AND access_level all match (using 3 variables)
inner_three <- inner_join(
  logs,
  emp,
  by = c("user_id", "department", "access_level")
)

kable(inner_three)
log_id user_id department login_time status access_level ip_address employee_id full_name job_title hire_date active
1 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10 1 Alice Nkusi Sysadmin 2021-03-15 TRUE
2 1002 Finance 2026-05-07 13:20:49 SUCCESS ADMIN 10.0.0.5 2 Bob Habimana Analyst 2019-07-01 TRUE
3 1003 Audit 2026-05-07 13:20:49 FAILED USER 172.16.0.8 3 Claire Uwera Auditor 2022-01-20 TRUE
4 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10 1 Alice Nkusi Sysadmin 2021-03-15 TRUE
5 1001 IT 2026-05-07 13:20:49 FAILED USER 192.168.1.10 1 Alice Nkusi Sysadmin 2021-03-15 TRUE

Selecting columns before joining

inner_selected <- logs |>
  select(user_id, department, status, access_level, ip_address) |>  # keep only what you need from logs
  inner_join(
    emp |> select(user_id, full_name, job_title, hire_date),         # keep only what you need from emp
    by = "user_id"
  )

kable(inner_selected)
user_id department status access_level ip_address full_name job_title hire_date
1001 IT FAILED USER 192.168.1.10 Alice Nkusi Sysadmin 2021-03-15
1002 Finance SUCCESS ADMIN 10.0.0.5 Bob Habimana Analyst 2019-07-01
1003 Audit FAILED USER 172.16.0.8 Claire Uwera Auditor 2022-01-20
1001 IT FAILED USER 192.168.1.10 Alice Nkusi Sysadmin 2021-03-15
1001 IT FAILED USER 192.168.1.10 Alice Nkusi Sysadmin 2021-03-15

ASSIGNMENT 3: USING GROUPBY() AND %>%

GROUP BY is used to organize data into groups based on one or more columns, so that operations like counting, summing, or averaging can be performed on each group separately.

The pipe operator %>% is used to pass the result of one step directly into the next step, making code easier to read and follow like a sequence of actions.

# Finding failed attemps
failed_logins <- logs %>%
    filter(status == "FAILED")



#Presentation of suspicious users by attemps

# Count failed attempts by user
suspicious_users <- failed_logins %>%
    group_by(user_id) %>%
    summarise(
      failed_attempts = n()
    ) %>%
  
  arrange(desc(failed_attempts))


print(suspicious_users)
## # A tibble: 2 × 2
##   user_id failed_attempts
##     <int>           <int>
## 1    1001               3
## 2    1003               1
# Disable scientific notation
options(scipen = 999)
# Barplot

ggplot(suspicious_users, aes(x = user_id, y = failed_attempts)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Users with Failed Logins",
    x = "User ID",
    y = "Failed Login Attempts"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Login Status Distribution

#library(ggplot2)
ggplot(logs, aes(x = status, fill = status)) +
  geom_bar() +
  labs(
    title = "Distribution of Login Status",
    x = "Login Status",
    y = "Count"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Login Count per Department

library(dplyr)

logins_by_dept <- logs %>%
  group_by(department) %>%
  summarise(total_logins = n())

logins_by_dept
## # A tibble: 3 × 2
##   department total_logins
##   <chr>             <int>
## 1 Audit                 1
## 2 Finance               1
## 3 IT                    3
library(ggplot2)

ggplot(logins_by_dept, 
       aes(x = reorder(department, -total_logins), y = total_logins)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Number of Logins per Department",
    x = "Department",
    y = "Total Logins"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Assignment 4: USING trace() AND recover()

The trace() function is used to monitor function execution by inserting debugging messages without modifying the original code.

The recover() function allows debugging by pausing execution when an error occurs and enabling inspection of the program state.

# Demonstrating trace()
trace(dplyr::inner_join, tracer = quote(cat("Running join...\n")), print = FALSE)
## Tracing function "inner_join" in package "dplyr"
## [1] "inner_join"
inner_one <- inner_join(logs, emp, by = "user_id")
## Running join...
untrace(dplyr::inner_join)
## Untracing function "inner_join" in package "dplyr"
# Demonstrating recover()
options(error = recover)
# Example error (intentional)
# inner_error <- inner_join(logs, emp, by = "wrong_column")
options(error = NULL)

These debugging tools help identify issues in data processing and ensure correctness in data merging operations.

ASSIGNMENT 5: CREATING A FUNCTION TO CALCULATE THE MEAN, MEDINA AND STANDARD DEVIATION

MEAN()

This function calculates the average of a set of values. It first removes missing values and then divides the total sum by the number of remaining observations.

my_mean <- function(x) {
  x <- x[!is.na(x)]   # remove missing values
  sum(x) / length(x)
}
# Testing the function
# sample data
test_data <- c(10, 20, 30, 40)

# calculate mean
my_mean(test_data)
## [1] 25
# Testing the function including missing values

# sample data
test_data <- c(10, 20, 30, NA, 40)

# calculate mean
my_mean(test_data)
## [1] 25

MEDIAN

This function calculates the median by sorting the values and selecting the middle value (or averaging the two middle values if the dataset size is even).

my_median <- function(x) {
  x <- sort(x)      # sortING values
  n <- length(x)
  
  if (n %% 2 == 1) {
    # odd number of values
    return(x[(n + 1) / 2])
  } else {
    # even number of values
    mid1 <- x[n / 2]
    mid2 <- x[(n / 2) + 1]
    return((mid1 + mid2) / 2)
  }
}
# Testing the function
my_median(c(10, 20, 30, 40, 50))
## [1] 30

STANDARD DEVIATION

This function computes the standard deviation by measuring how far values are spread from the mean.

my_sd <- function(x) {
  m <- my_mean(x) 
  n <- length(x)
  
  variance <- sum((x - m)^2) / (n - 1)
  result <- sqrt(variance)
  
  return(result)
}
# Testing the standard deviation function
my_sd(c(10, 20, 30, 40, 50))
## [1] 15.81139

ASSIGNMENT 6: USING lapply(), sapply(), vapply(), and map() FUCNTIONS

The apply functions are used in R to perform operations on multiple elements efficiently without using loops.

# Example small dataset

numbers <- list(a = 1:5, b = 6:10, c = c(2, 4, 6, 8))

Using lapply()

The below example applies the mean() function to each element and returns the results as a list.

lapply(numbers, mean)
## $a
## [1] 3
## 
## $b
## [1] 8
## 
## $c
## [1] 5

Using sapply()

This is Similar to lapply() but simplifies the output into a vector.

sapply(numbers, mean)
## a b c 
## 3 8 5

Using vapply()

It is a version of sapply() where the output type (numeric) is explicitly defined.

vapply(numbers, mean, numeric(1))
## a b c 
## 3 8 5

Using map()

library(purrr)
map(numbers, mean)
## $a
## [1] 3
## 
## $b
## [1] 8
## 
## $c
## [1] 5

Comparison and Conclusion

Comparison: These functions perform similar tasks but differ in how they return results and how strictly they enforce output types.

Conclusion: Using these functions improves efficiency and readability when applying operations across multiple datasets.