ALL ASSIGNMENTS - R For Data Science

Assignement 1: Importing Data From Different Sources

Importing data from different sources allows analysts to combine and work with information stored in various formats and systems for analysis and reporting. In this assignment, We will imported data from a created data frame, a CSV file, and a PostgreSQL database to demonstrate different data access methods in R programming language.

Creating and consuming a dataframe with R

#1 Creating a dataset with 100,000 rows (simulating "big data")
n <- 100000  

# Generate variables
data <- data.frame(
  id = 1:n,   # Unique ID for each row
  
  age = sample(18:70, n, replace = TRUE),  
  # Random ages between 18 and 70
  
  salary = rnorm(n, mean = 50000, sd = 15000),  
  # Normally distributed salaries
  
  department = sample(c("IT", "HR", "Finance", "Sales"), n, replace = TRUE),  
  # Random department categories
  
  experience = sample(0:40, n, replace = TRUE)  
  # Years of experience
)
kable(head(data))

id	age	salary	department	experience
1	48	35105.06	Finance	17
2	58	20879.84	Finance	20
3	25	39407.48	Sales	26
4	48	42250.00	HR	9
5	35	37878.72	HR	9
6	24	48283.05	HR	6

kable(tail(data))

	id	age	salary	department	experience
99995	99995	68	54057.77	HR	15
99996	99996	62	60363.95	IT	15
99997	99997	62	56260.00	IT	35
99998	99998	60	53618.39	HR	6
99999	99999	35	34713.28	IT	8
100000	100000	30	24685.59	Finance	2

Importing Data From a Database

Importing from a database (PostgreSQL) will enables direct access to structured, live data for efficient querying and analysis.

Before working with a database, all required libraries and packages must be imported and loaded to establish and manage the connection.

#install.packages("DBI")
#install.packages("RSQLite")
#install.packages("dplyr")

# Load packages into memory
library(DBI)
library(RSQLite)
library(dplyr)
library(knitr)
library(ggplot2)


# Create PostgreSQL connection

conn <- dbConnect(
    RPostgres::Postgres(),
    dbname = "system_audit_db",
    host = "localhost",
    port = 5432,
    user = "postgres",
    password = "arthur"
)

# Read Data from PostgreSQL

logs <- dbGetQuery(
    conn,  "SELECT * FROM audit_logs"
)

head(logs)

##   log_id user_id department          login_time  status access_level
## 1      1    1001         IT 2026-05-07 13:20:49  FAILED         USER
## 2      2    1002    Finance 2026-05-07 13:20:49 SUCCESS        ADMIN
## 3      3    1003      Audit 2026-05-07 13:20:49  FAILED         USER
## 4      4    1001         IT 2026-05-07 13:20:49  FAILED         USER
## 5      5    1001         IT 2026-05-07 13:20:49  FAILED         USER
##     ip_address
## 1 192.168.1.10
## 2     10.0.0.5
## 3   172.16.0.8
## 4 192.168.1.10
## 5 192.168.1.10

tail(logs)

##   log_id user_id department          login_time  status access_level
## 1      1    1001         IT 2026-05-07 13:20:49  FAILED         USER
## 2      2    1002    Finance 2026-05-07 13:20:49 SUCCESS        ADMIN
## 3      3    1003      Audit 2026-05-07 13:20:49  FAILED         USER
## 4      4    1001         IT 2026-05-07 13:20:49  FAILED         USER
## 5      5    1001         IT 2026-05-07 13:20:49  FAILED         USER
##     ip_address
## 1 192.168.1.10
## 2     10.0.0.5
## 3   172.16.0.8
## 4 192.168.1.10
## 5 192.168.1.10

Importing Data From a XML file

#install.packages("xml2")
library(xml2)

#Reading the XML file
xml_data <- read_xml("C:/Users/abutera/Documents/data.xml")

#Viewing the structure
xml_data

## {xml_document}
## <employees>
## [1] <employee>\n  <id>1</id>\n  <name>Arthur</name>\n  <role>Analyst</role>\n ...
## [2] <employee>\n  <id>2</id>\n  <name>Jane</name>\n  <role>Manager</role>\n</ ...

#Extracting the values into a dataframe
employees <- xml_find_all(xml_data, "//employee")

ids <- xml_text(xml_find_all(employees, "id"))
names <- xml_text(xml_find_all(employees, "name"))
roles <- xml_text(xml_find_all(employees, "role"))

df <- data.frame(id = ids, name = names, role = roles)

kable(df)

id	name	role
1	Arthur	Analyst
2	Jane	Manager

Importing Data From a statistical Package

library(haven)
data <- read_sav("C:/Users/abutera/Documents/audit_logs.sav")
kable(data)

user_id	department	status	login_time	ip_address
U001	IT	SUCCESS	08:00	192.168.1.10
U002	Finance	FAILED	08:10	10.0.0.5
U003	HR	FAILED	09:00	172.16.0.8
U001	IT	FAILED	09:15	192.168.1.10
U004	Finance	SUCCESS	10:00	10.0.0.9
U002		FAILED	10:05	10.0.0.5
U005	IT	FAILED	11:00	172.16.0.3
U003	HR	SUCCESS	11:20	192.168.1.11
U001	IT	FAILED	12:00	192.168.1.10

ASSIGNMENT 2: MERGING 2 DATASETS(LOGS AND EMPLOYEES) USING DIFFERENT VARIABLES

First loading a second dataset (Employees) from system_audit_db database to merge with logs dataset

emp <- dbGetQuery(conn, "SELECT * FROM employees")

# First rows
kable(logs)

log_id	user_id	department	login_time	status	access_level	ip_address
1	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10
2	1002	Finance	2026-05-07 13:20:49	SUCCESS	ADMIN	10.0.0.5
3	1003	Audit	2026-05-07 13:20:49	FAILED	USER	172.16.0.8
4	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10
5	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10

# Summary
summary(logs)

##      log_id     user_id         department   login_time                 
##  Min.   :1   Min.   :1001   Length   :5    Min.   :2026-05-07 13:20:49  
##  1st Qu.:2   1st Qu.:1001   N.unique :3    1st Qu.:2026-05-07 13:20:49  
##  Median :3   Median :1001   N.blank  :0    Median :2026-05-07 13:20:49  
##  Mean   :3   Mean   :1002   Min.nchar:2    Mean   :2026-05-07 13:20:49  
##  3rd Qu.:4   3rd Qu.:1002   Max.nchar:7    3rd Qu.:2026-05-07 13:20:49  
##  Max.   :5   Max.   :1003                  Max.   :2026-05-07 13:20:49  
##        status     access_level     ip_address
##  Length   :5   Length   :5     Length   : 5  
##  N.unique :2   N.unique :2     N.unique : 3  
##  N.blank  :0   N.blank  :0     N.blank  : 0  
##  Min.nchar:6   Min.nchar:4     Min.nchar: 8  
##  Max.nchar:7   Max.nchar:5     Max.nchar:12  
##

# viewing all database table data
kable(logs)

log_id	user_id	department	login_time	status	access_level	ip_address
1	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10
2	1002	Finance	2026-05-07 13:20:49	SUCCESS	ADMIN	10.0.0.5
3	1003	Audit	2026-05-07 13:20:49	FAILED	USER	172.16.0.8
4	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10
5	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10

Performing an inner join between logs and emp using user_id as the matching key.

glimpse(logs)

## Rows: 5
## Columns: 7
## $ log_id       <int> 1, 2, 3, 4, 5
## $ user_id      <int> 1001, 1002, 1003, 1001, 1001
## $ department   <chr> "IT", "Finance", "Audit", "IT", "IT"
## $ login_time   <dttm> 2026-05-07 13:20:49, 2026-05-07 13:20:49, 2026-05-07 13:2…
## $ status       <chr> "FAILED", "SUCCESS", "FAILED", "FAILED", "FAILED"
## $ access_level <chr> "USER", "ADMIN", "USER", "USER", "USER"
## $ ip_address   <chr> "192.168.1.10", "10.0.0.5", "172.16.0.8", "192.168.1.10",…

glimpse(emp)

## Rows: 5
## Columns: 8
## $ employee_id  <int> 1, 2, 3, 4, 5
## $ user_id      <int> 1001, 1002, 1003, 1004, 1005
## $ full_name    <chr> "Alice Nkusi", "Bob Habimana", "Claire Uwera", "Denis Mug…
## $ department   <chr> "IT", "Finance", "Audit", "HR", "Marketing"
## $ job_title    <chr> "Sysadmin", "Analyst", "Auditor", "HR Officer", "Designer"
## $ access_level <chr> "USER", "ADMIN", "USER", "USER", "USER"
## $ hire_date    <date> 2021-03-15, 2019-07-01, 2022-01-20, 2020-09-10, 2023-05-0…
## $ active       <lgl> TRUE, TRUE, TRUE, TRUE, FALSE

# SQL: INNER JOIN employees e ON a.user_id = e.user_id
inner_one <- inner_join(
  logs,   # left table  (audit_logs)
  emp,    # right table (employees)
  by = "user_id"   # the single matching column
)

kable(inner_one)

log_id	user_id	department.x	login_time	status	access_level.x	ip_address	employee_id	full_name	department.y	job_title	access_level.y	hire_date	active
1	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10	1	Alice Nkusi	IT	Sysadmin	USER	2021-03-15	TRUE
2	1002	Finance	2026-05-07 13:20:49	SUCCESS	ADMIN	10.0.0.5	2	Bob Habimana	Finance	Analyst	ADMIN	2019-07-01	TRUE
3	1003	Audit	2026-05-07 13:20:49	FAILED	USER	172.16.0.8	3	Claire Uwera	Audit	Auditor	USER	2022-01-20	TRUE
4	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10	1	Alice Nkusi	IT	Sysadmin	USER	2021-03-15	TRUE
5	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10	1	Alice Nkusi	IT	Sysadmin	USER	2021-03-15	TRUE

Merging two datasets by matching records where both user_id and department are the same in each table.

# SQL equivalent: ON a.user_id = e.user_id AND a.department = e.department (using 2 variables)
inner_two <- inner_join(
  logs,
  emp,
  by = c("user_id", "department")  # BOTH must match simultaneously
)
kable(inner_two)

log_id	user_id	department	login_time	status	access_level.x	ip_address	employee_id	full_name	job_title	access_level.y	hire_date	active
1	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10	1	Alice Nkusi	Sysadmin	USER	2021-03-15	TRUE
2	1002	Finance	2026-05-07 13:20:49	SUCCESS	ADMIN	10.0.0.5	2	Bob Habimana	Analyst	ADMIN	2019-07-01	TRUE
3	1003	Audit	2026-05-07 13:20:49	FAILED	USER	172.16.0.8	3	Claire Uwera	Auditor	USER	2022-01-20	TRUE
4	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10	1	Alice Nkusi	Sysadmin	USER	2021-03-15	TRUE
5	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10	1	Alice Nkusi	Sysadmin	USER	2021-03-15	TRUE

Performing an inner join to merge logs and emp datasets by matching user_id, department, and access_level.

# SQL equivalent: ON user_id AND department AND access_level all match (using 3 variables)
inner_three <- inner_join(
  logs,
  emp,
  by = c("user_id", "department", "access_level")
)

kable(inner_three)

log_id	user_id	department	login_time	status	access_level	ip_address	employee_id	full_name	job_title	hire_date	active
1	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10	1	Alice Nkusi	Sysadmin	2021-03-15	TRUE
2	1002	Finance	2026-05-07 13:20:49	SUCCESS	ADMIN	10.0.0.5	2	Bob Habimana	Analyst	2019-07-01	TRUE
3	1003	Audit	2026-05-07 13:20:49	FAILED	USER	172.16.0.8	3	Claire Uwera	Auditor	2022-01-20	TRUE
4	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10	1	Alice Nkusi	Sysadmin	2021-03-15	TRUE
5	1001	IT	2026-05-07 13:20:49	FAILED	USER	192.168.1.10	1	Alice Nkusi	Sysadmin	2021-03-15	TRUE

Selecting columns before joining

inner_selected <- logs |>
  select(user_id, department, status, access_level, ip_address) |>  # keep only what you need from logs
  inner_join(
    emp |> select(user_id, full_name, job_title, hire_date),         # keep only what you need from emp
    by = "user_id"
  )

kable(inner_selected)

user_id	department	status	access_level	ip_address	full_name	job_title	hire_date
1001	IT	FAILED	USER	192.168.1.10	Alice Nkusi	Sysadmin	2021-03-15
1002	Finance	SUCCESS	ADMIN	10.0.0.5	Bob Habimana	Analyst	2019-07-01
1003	Audit	FAILED	USER	172.16.0.8	Claire Uwera	Auditor	2022-01-20
1001	IT	FAILED	USER	192.168.1.10	Alice Nkusi	Sysadmin	2021-03-15
1001	IT	FAILED	USER	192.168.1.10	Alice Nkusi	Sysadmin	2021-03-15

ASSIGNMENT 3: USING GROUPBY() AND %>%

GROUP BY is used to organize data into groups based on one or more columns, so that operations like counting, summing, or averaging can be performed on each group separately.

The pipe operator %>% is used to pass the result of one step directly into the next step, making code easier to read and follow like a sequence of actions.

# Finding failed attemps
failed_logins <- logs %>%
    filter(status == "FAILED")



#Presentation of suspicious users by attemps

# Count failed attempts by user
suspicious_users <- failed_logins %>%
    group_by(user_id) %>%
    summarise(
      failed_attempts = n()
    ) %>%
  
  arrange(desc(failed_attempts))


print(suspicious_users)

## # A tibble: 2 × 2
##   user_id failed_attempts
##     <int>           <int>
## 1    1001               3
## 2    1003               1

# Disable scientific notation
options(scipen = 999)
# Barplot

ggplot(suspicious_users, aes(x = user_id, y = failed_attempts)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Users with Failed Logins",
    x = "User ID",
    y = "Failed Login Attempts"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Login Status Distribution

#library(ggplot2)
ggplot(logs, aes(x = status, fill = status)) +
  geom_bar() +
  labs(
    title = "Distribution of Login Status",
    x = "Login Status",
    y = "Count"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Login Count per Department

library(dplyr)

logins_by_dept <- logs %>%
  group_by(department) %>%
  summarise(total_logins = n())

logins_by_dept

## # A tibble: 3 × 2
##   department total_logins
##   <chr>             <int>
## 1 Audit                 1
## 2 Finance               1
## 3 IT                    3

library(ggplot2)

ggplot(logins_by_dept, 
       aes(x = reorder(department, -total_logins), y = total_logins)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Number of Logins per Department",
    x = "Department",
    y = "Total Logins"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Assignment 4: USING trace() AND recover()

The trace() function is used to monitor function execution by inserting debugging messages without modifying the original code.

The recover() function allows debugging by pausing execution when an error occurs and enabling inspection of the program state.

# Demonstrating trace()
trace(dplyr::inner_join, tracer = quote(cat("Running join...\n")), print = FALSE)

## Tracing function "inner_join" in package "dplyr"

## [1] "inner_join"

inner_one <- inner_join(logs, emp, by = "user_id")

## Running join...

untrace(dplyr::inner_join)

## Untracing function "inner_join" in package "dplyr"

# Demonstrating recover()
options(error = recover)
# Example error (intentional)
# inner_error <- inner_join(logs, emp, by = "wrong_column")
options(error = NULL)

These debugging tools help identify issues in data processing and ensure correctness in data merging operations.

ASSIGNMENT 5: CREATING A FUNCTION TO CALCULATE THE MEAN, MEDINA AND STANDARD DEVIATION

MEAN()

This function calculates the average of a set of values. It first removes missing values and then divides the total sum by the number of remaining observations.

my_mean <- function(x) {
  x <- x[!is.na(x)]   # remove missing values
  sum(x) / length(x)
}

# Testing the function
# sample data
test_data <- c(10, 20, 30, 40)

# calculate mean
my_mean(test_data)

## [1] 25

# Testing the function including missing values

# sample data
test_data <- c(10, 20, 30, NA, 40)

# calculate mean
my_mean(test_data)

## [1] 25

MEDIAN

This function calculates the median by sorting the values and selecting the middle value (or averaging the two middle values if the dataset size is even).

my_median <- function(x) {
  x <- sort(x)      # sortING values
  n <- length(x)
  
  if (n %% 2 == 1) {
    # odd number of values
    return(x[(n + 1) / 2])
  } else {
    # even number of values
    mid1 <- x[n / 2]
    mid2 <- x[(n / 2) + 1]
    return((mid1 + mid2) / 2)
  }
}

# Testing the function
my_median(c(10, 20, 30, 40, 50))

## [1] 30

STANDARD DEVIATION

This function computes the standard deviation by measuring how far values are spread from the mean.

my_sd <- function(x) {
  m <- my_mean(x) 
  n <- length(x)
  
  variance <- sum((x - m)^2) / (n - 1)
  result <- sqrt(variance)
  
  return(result)
}

# Testing the standard deviation function
my_sd(c(10, 20, 30, 40, 50))

## [1] 15.81139

ASSIGNMENT 6: USING lapply(), sapply(), vapply(), and map() FUCNTIONS

The apply functions are used in R to perform operations on multiple elements efficiently without using loops.

# Example small dataset

numbers <- list(a = 1:5, b = 6:10, c = c(2, 4, 6, 8))

Using lapply()

The below example applies the mean() function to each element and returns the results as a list.

lapply(numbers, mean)

## $a
## [1] 3
## 
## $b
## [1] 8
## 
## $c
## [1] 5

Using sapply()

This is Similar to lapply() but simplifies the output into a vector.

sapply(numbers, mean)

## a b c 
## 3 8 5

Using vapply()

It is a version of sapply() where the output type (numeric) is explicitly defined.

vapply(numbers, mean, numeric(1))

## a b c 
## 3 8 5

Using map()

library(purrr)
map(numbers, mean)

## $a
## [1] 3
## 
## $b
## [1] 8
## 
## $c
## [1] 5

Comparison and Conclusion

Comparison: These functions perform similar tasks but differ in how they return results and how strictly they enforce output types.

Conclusion: Using these functions improves efficiency and readability when applying operations across multiple datasets.