# 20251MBI040_Dieudonne

#####. Homework 1: Import data from statistical package and from data management system.
library(readxl)
library(readr)
library(dplyr)
library(DBI)
library(RMariaDB)
library(tidyverse)

# Import the Ethics final-grade workbook.
finalethics <- read_excel("final grade - Ethics 2026 Jan -students.xlsx")

# View() opens a GUI data viewer, so only call it when someone is at the
# console; a batch (Rscript / knit) run then goes straight to the summary.
if (interactive()) {
  View(finalethics)
}
summary(finalethics)

##       ...1               ID           PR           Quizzes      
##  Min.   : 1.00   Length   :46   Min.   :0.000   Min.   : 6.200  
##  1st Qu.:12.25   N.unique :46   1st Qu.:3.000   1st Qu.: 9.025  
##  Median :23.50   N.blank  : 0   Median :3.000   Median : 9.500  
##  Mean   :23.50   Min.nchar:11   Mean   :3.337   Mean   : 9.293  
##  3rd Qu.:34.75   Max.nchar:11   3rd Qu.:4.000   3rd Qu.: 9.800  
##  Max.   :46.00                  Max.   :5.000   Max.   :10.000  
##       Pres           MT/30           FE/40         Total/100    
##  Min.   : 8.00   Min.   :15.27   Min.   :23.11   Min.   :63.81  
##  1st Qu.:11.00   1st Qu.:21.27   1st Qu.:29.33   1st Qu.:75.08  
##  Median :12.00   Median :23.45   Median :31.56   Median :78.96  
##  Mean   :11.46   Mean   :23.12   Mean   :31.23   Mean   :78.44  
##  3rd Qu.:12.00   3rd Qu.:25.09   3rd Qu.:32.89   3rd Qu.:83.50  
##  Max.   :14.00   Max.   :27.82   Max.   :37.33   Max.   :88.08  
##    Average/20   
##  Min.   :12.76  
##  1st Qu.:15.02  
##  Median :15.79  
##  Mean   :15.69  
##  3rd Qu.:16.70  
##  Max.   :17.62

variable.names(finalethics)

## [1] "...1"       "ID"         "PR"         "Quizzes"    "Pres"      
## [6] "MT/30"      "FE/40"      "Total/100"  "Average/20"

##### dataset 2:

# Import the second grade workbook.
advanced <- read_excel("advanced.xlsx")

# View() opens a GUI data viewer, so only call it when someone is at the
# console; a batch (Rscript / knit) run then goes straight to the summary.
if (interactive()) {
  View(advanced)
}
summary(advanced)

##        #                 ID           Names     Project /20    Presentation /30
##  Min.   : 1.00   Length   :46   Length   :46   Min.   :15.00   Min.   :26.00   
##  1st Qu.:12.25   N.unique :46   N.unique :46   1st Qu.:15.00   1st Qu.:26.00   
##  Median :23.50   N.blank  : 0   N.blank  : 0   Median :16.00   Median :27.00   
##  Mean   :23.50   Min.nchar:11   Min.nchar:11   Mean   :15.59   Mean   :26.85   
##  3rd Qu.:34.75   Max.nchar:11   Max.nchar:28   3rd Qu.:16.00   3rd Qu.:27.00   
##  Max.   :46.00                                 Max.   :16.00   Max.   :28.00   
##  Application /10  Final Exam /40    Total /100     Average /20   
##  Min.   : 8.000   Min.   :28.00   Min.   :78.00   Min.   :15.60  
##  1st Qu.: 9.000   1st Qu.:30.25   1st Qu.:82.00   1st Qu.:16.40  
##  Median :10.000   Median :34.50   Median :86.00   Median :17.20  
##  Mean   : 9.674   Mean   :33.43   Mean   :85.54   Mean   :17.11  
##  3rd Qu.:10.000   3rd Qu.:36.00   3rd Qu.:89.00   3rd Qu.:17.80  
##  Max.   :10.000   Max.   :39.00   Max.   :91.00   Max.   :18.20

## data set 3:
# Open a connection to the local MariaDB database `user_dieudonne`.
# NOTE(review): the credentials are hard-coded (user "root", empty password);
# consider supplying them another way before this script is shared.
con <- dbConnect(
  RMariaDB::MariaDB(),
  user     = "root",
  password = "",
  dbname   = "user_dieudonne",
  host     = "localhost",
  port     = 3306
)
dbListTables(con)

## [1] "students"

dbListFields(con, "students")

## [1] "Identity" "Names"    "Marks"    "Class"

# Copy the whole `students` table into an R data frame
my_tb <- dbReadTable(con, "students")

# The table is now in memory, so release the connection instead of leaving it
# open for the rest of the session
dbDisconnect(con)

# View() opens a GUI data viewer, so only call it in an interactive session
if (interactive()) {
  View(my_tb)
}

## data set 4: world population
# Import the world population CSV.
world_population <- read_csv("world_population.csv")

# View() opens a GUI data viewer, so only call it when someone is at the
# console; a batch (Rscript / knit) run then goes straight to the summary.
if (interactive()) {
  View(world_population)
}
summary(world_population)

##       Rank               CCA3     Country/Territory      Capital   
##  Min.   :  1.00   Length   :234   Length   :234     Length   :234  
##  1st Qu.: 59.25   N.unique :234   N.unique :234     N.unique :234  
##  Median :117.50   N.blank  :  0   N.blank  :  0     N.blank  :  0  
##  Mean   :117.50   Min.nchar:  3   Min.nchar:  4     Min.nchar:  4  
##  3rd Qu.:175.75   Max.nchar:  3   Max.nchar: 32     Max.nchar: 19  
##  Max.   :234.00                                                    
##      Continent   2022 Population     2020 Population     2015 Population    
##  Length   :234   Min.   :5.100e+02   Min.   :5.200e+02   Min.   :5.640e+02  
##  N.unique :  6   1st Qu.:4.197e+05   1st Qu.:4.153e+05   1st Qu.:4.047e+05  
##  N.blank  :  0   Median :5.560e+06   Median :5.493e+06   Median :5.307e+06  
##  Min.nchar:  4   Mean   :3.407e+07   Mean   :3.350e+07   Mean   :3.173e+07  
##  Max.nchar: 13   3rd Qu.:2.248e+07   3rd Qu.:2.145e+07   3rd Qu.:1.973e+07  
##                  Max.   :1.426e+09   Max.   :1.425e+09   Max.   :1.394e+09  
##  2010 Population     2000 Population     1990 Population    
##  Min.   :5.960e+02   Min.   :6.510e+02   Min.   :7.000e+02  
##  1st Qu.:3.931e+05   1st Qu.:3.272e+05   1st Qu.:2.641e+05  
##  Median :4.943e+06   Median :4.293e+06   Median :3.825e+06  
##  Mean   :2.985e+07   Mean   :2.627e+07   Mean   :2.271e+07  
##  3rd Qu.:1.916e+07   3rd Qu.:1.576e+07   3rd Qu.:1.187e+07  
##  Max.   :1.348e+09   Max.   :1.264e+09   Max.   :1.154e+09  
##  1980 Population     1970 Population       Area (km²)       Density (per km²)  
##  Min.   :      733   Min.   :      752   Min.   :       1   Min.   :2.610e-02  
##  1st Qu.:   229614   1st Qu.:   155997   1st Qu.:    2650   1st Qu.:3.842e+01  
##  Median :  3141146   Median :  2604830   Median :   81200   Median :9.535e+01  
##  Mean   : 18984617   Mean   : 15786909   Mean   :  581449   Mean   :4.521e+02  
##  3rd Qu.:  9826054   3rd Qu.:  8817329   3rd Qu.:  430426   3rd Qu.:2.389e+02  
##  Max.   :982372466   Max.   :822534450   Max.   :17098242   Max.   :2.317e+04  
##   Growth Rate    World Population Percentage
##  Min.   :0.912   Min.   : 0.0000            
##  1st Qu.:1.002   1st Qu.: 0.0100            
##  Median :1.008   Median : 0.0700            
##  Mean   :1.010   Mean   : 0.4271            
##  3rd Qu.:1.017   3rd Qu.: 0.2800            
##  Max.   :1.069   Max.   :17.8800

# List the column names of the imported table
variable.names(world_population)

##  [1] "Rank"                        "CCA3"                       
##  [3] "Country/Territory"           "Capital"                    
##  [5] "Continent"                   "2022 Population"            
##  [7] "2020 Population"             "2015 Population"            
##  [9] "2010 Population"             "2000 Population"            
## [11] "1990 Population"             "1980 Population"            
## [13] "1970 Population"             "Area (km²)"                 
## [15] "Density (per km²)"           "Growth Rate"                
## [17] "World Population Percentage"

# Count the columns by taking the length of that name vector
length(variable.names(world_population))

## [1] 17

head(world_population, n = 5)

## # A tibble: 5 × 17
##    Rank CCA3  `Country/Territory` Capital          Continent `2022 Population`
##   <dbl> <chr> <chr>               <chr>            <chr>                 <dbl>
## 1    36 AFG   Afghanistan         Kabul            Asia               41128771
## 2   138 ALB   Albania             Tirana           Europe              2842321
## 3    34 DZA   Algeria             Algiers          Africa             44903225
## 4   213 ASM   American Samoa      Pago Pago        Oceania               44273
## 5   203 AND   Andorra             Andorra la Vella Europe                79824
## # ℹ 11 more variables: `2020 Population` <dbl>, `2015 Population` <dbl>,
## #   `2010 Population` <dbl>, `2000 Population` <dbl>, `1990 Population` <dbl>,
## #   `1980 Population` <dbl>, `1970 Population` <dbl>, `Area (km²)` <dbl>,
## #   `Density (per km²)` <dbl>, `Growth Rate` <dbl>,
## #   `World Population Percentage` <dbl>

tail(world_population, n = 10)

## # A tibble: 10 × 17
##     Rank CCA3  `Country/Territory` Capital      Continent     `2022 Population`
##    <dbl> <chr> <chr>               <chr>        <chr>                     <dbl>
##  1    43 UZB   Uzbekistan          Tashkent     Asia                   34627652
##  2   181 VUT   Vanuatu             Port-Vila    Oceania                  326740
##  3   234 VAT   Vatican City        Vatican City Europe                      510
##  4    51 VEN   Venezuela           Caracas      South America          28301696
##  5    16 VNM   Vietnam             Hanoi        Asia                   98186856
##  6   226 WLF   Wallis and Futuna   Mata-Utu     Oceania                   11572
##  7   172 ESH   Western Sahara      El Aaiún     Africa                   575986
##  8    46 YEM   Yemen               Sanaa        Asia                   33696614
##  9    63 ZMB   Zambia              Lusaka       Africa                 20017675
## 10    74 ZWE   Zimbabwe            Harare       Africa                 16320537
## # ℹ 11 more variables: `2020 Population` <dbl>, `2015 Population` <dbl>,
## #   `2010 Population` <dbl>, `2000 Population` <dbl>, `1990 Population` <dbl>,
## #   `1980 Population` <dbl>, `1970 Population` <dbl>, `Area (km²)` <dbl>,
## #   `Density (per km²)` <dbl>, `Growth Rate` <dbl>,
## #   `World Population Percentage` <dbl>

# Storage type of the table as a whole (reported as "list" in the output below)
typeof(world_population)

## [1] "list"

# Storage type of a numeric column
typeof(world_population$Rank)

## [1] "double"

# Storage type of a text column
typeof(world_population$CCA3)

## [1] "character"

# Number of rows
nrow(world_population)

## [1] 234

# Number of columns
ncol(world_population)

## [1] 17

# Rows and columns together
dim(world_population)

## [1] 234  17

glimpse(world_population)

## Rows: 234
## Columns: 17
## $ Rank                          <dbl> 36, 138, 34, 213, 203, 42, 224, 201, 33,…
## $ CCA3                          <chr> "AFG", "ALB", "DZA", "ASM", "AND", "AGO"…
## $ `Country/Territory`           <chr> "Afghanistan", "Albania", "Algeria", "Am…
## $ Capital                       <chr> "Kabul", "Tirana", "Algiers", "Pago Pago…
## $ Continent                     <chr> "Asia", "Europe", "Africa", "Oceania", "…
## $ `2022 Population`             <dbl> 41128771, 2842321, 44903225, 44273, 7982…
## $ `2020 Population`             <dbl> 38972230, 2866849, 43451666, 46189, 7770…
## $ `2015 Population`             <dbl> 33753499, 2882481, 39543154, 51368, 7174…
## $ `2010 Population`             <dbl> 28189672, 2913399, 35856344, 54849, 7151…
## $ `2000 Population`             <dbl> 19542982, 3182021, 30774621, 58230, 6609…
## $ `1990 Population`             <dbl> 10694796, 3295066, 25518074, 47818, 5356…
## $ `1980 Population`             <dbl> 12486631, 2941651, 18739378, 32886, 3561…
## $ `1970 Population`             <dbl> 10752971, 2324731, 13795915, 27075, 1986…
## $ `Area (km²)`                  <dbl> 652230, 28748, 2381741, 199, 468, 124670…
## $ `Density (per km²)`           <dbl> 63.0587, 98.8702, 18.8531, 222.4774, 170…
## $ `Growth Rate`                 <dbl> 1.0257, 0.9957, 1.0164, 0.9831, 1.0100, …
## $ `World Population Percentage` <dbl> 0.52, 0.04, 0.56, 0.00, 0.00, 0.45, 0.00…

# Show the rows flagged by duplicated(); the output below is an empty tibble,
# i.e. no row was flagged
world_population[duplicated(world_population), ]

## # A tibble: 0 × 17
## # ℹ 17 variables: Rank <dbl>, CCA3 <chr>, Country/Territory <chr>,
## #   Capital <chr>, Continent <chr>, 2022 Population <dbl>,
## #   2020 Population <dbl>, 2015 Population <dbl>, 2010 Population <dbl>,
## #   2000 Population <dbl>, 1990 Population <dbl>, 1980 Population <dbl>,
## #   1970 Population <dbl>, Area (km²) <dbl>, Density (per km²) <dbl>,
## #   Growth Rate <dbl>, World Population Percentage <dbl>

# Same duplicate check, scanning from the last row and reduced to a count
sum(duplicated(world_population, fromLast = TRUE))

## [1] 0

# Count missing values in four of the columns, one column per call
sum(is.na(world_population$Rank))

## [1] 0

sum(is.na(world_population$CCA3))

## [1] 0

sum(is.na(world_population$Capital))

## [1] 0

sum(is.na(world_population$Continent))

## [1] 0

#####. Homework 2: Merging data sets
#####. a. Merging finalethics and advanced by ID
# Combine the two grade tables on their shared ID column
lessons <- merge(finalethics, advanced, by = "ID")

# View() opens a GUI data viewer, so only call it in an interactive session
if (interactive()) {
  View(lessons)
}

#####. Homework 3: dplyr verbs on world_population

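# Corrections applied to the pipeline below:
#   1. The data frame is named `world_population` (lower-case "w"). R is
#      case-sensitive, so `World_population` does not refer to it.
#   2. The final step sorts by the summarised column,
#      `arrange(desc(avg_population))`, instead of passing the data frame
#      itself to `desc()` as in `arrange(desc(world_population))`.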

# Per-continent summary of the countries whose 2022 population exceeds
# 50 million, ordered from the largest to the smallest average population.
result <- world_population %>%
  # Steps 1-2: keep the four columns of interest, renaming two on the way
  select(
    Country = `Country/Territory`,
    CCA3,
    Continent,
    Pop_2022 = `2022 Population`
  ) %>%
  # Step 3: large-population countries only
  filter(Pop_2022 > 5e7) %>%
  # Step 4: population expressed in millions
  mutate(Pop_Millions = Pop_2022 / 1e6) %>%
  # Steps 5-6: one row per continent; the average is in millions, the total
  # is in persons
  group_by(Continent) %>%
  summarise(
    avg_population = mean(Pop_Millions, na.rm = TRUE),
    total_population = sum(Pop_2022, na.rm = TRUE),
    countries = n()
  ) %>%
  # Step 7: highest average first
  arrange(desc(avg_population))

# View result
result

## # A tibble: 5 × 4
##   Continent     avg_population total_population countries
##   <chr>                  <dbl>            <dbl>     <int>
## 1 Asia                   324.        4214854597        13
## 2 North America          233.         465793982         2
## 3 South America          134.         267187522         2
## 4 Africa                 104.         731340571         7
## 5 Europe                  83.9        419256195         5

#####. Homework 4: Trace() function
# The trace() function inserts temporary debugging code into a function without
# permanently changing its source code. It is useful when investigating functions
# in external packages or deeply nested scripts.

#' Convert a score into a percentage and a letter grade, and print both
#'
#' Grade bands: "A" for 90% and above, "B" for 80% and above, "C" for 70% and
#' above, otherwise "F".
#'
#' @param score A single non-missing number: the points obtained.
#' @param total A single positive number: the maximum points (default 100).
#' @return Invisibly, a list with `pct` (percentage rounded to one decimal)
#'   and `grade` (the letter). A line of the form
#'   "Score: <pct> % -> Grade: <grade>" is printed as a side effect.
calculate_grade <- function(score, total = 100) {
  # Fail early with a clear message: the grade lookup below uses scalar `if`,
  # which cannot handle vectors or missing values
  if (!is.numeric(score) || length(score) != 1L || is.na(score)) {
    stop("`score` must be a single non-missing number", call. = FALSE)
  }
  # A zero total would turn the percentage into Inf/NaN and a negative one
  # would flip its sign, so reject both
  if (!is.numeric(total) || length(total) != 1L || is.na(total) || total <= 0) {
    stop("`total` must be a single positive number", call. = FALSE)
  }

  pct <- round((score / total) * 100, 1)
  grade <- if (pct >= 90) {
    "A"
  } else if (pct >= 80) {
    "B"
  } else if (pct >= 70) {
    "C"
  } else {
    "F"
  }

  cat("Score:", pct, "% -> Grade:", grade, "\n")
  invisible(list(pct = pct, grade = grade))
}

calculate_grade(3)

## Score: 3 % -> Grade: F

# Attach browser() as the tracer for calculate_grade(); the recorded output
# below shows each later call announcing "Tracing ... on entry" and stepping
# through the function body
trace(calculate_grade, tracer = browser)

## [1] "calculate_grade"

# Test with tracing active
calculate_grade(85)

## Tracing calculate_grade(85) on entry 
## Called from: eval(expr, p)
## debug: {
##     pct <- round((score/total) * 100, 1)
##     grade <- if (pct >= 90) 
##         "A"
##     else if (pct >= 80) 
##         "B"
##     else if (pct >= 70) 
##         "C"
##     else "F"
##     cat("Score:", pct, "% -> Grade:", grade, "\n")
##     return(invisible(list(pct = pct, grade = grade)))
## }
## debug: pct <- round((score/total) * 100, 1)
## debug: grade <- if (pct >= 90) "A" else if (pct >= 80) "B" else if (pct >= 
##     70) "C" else "F"
## debug: if (pct >= 80) "B" else if (pct >= 70) "C" else "F"
## debug: [1] "B"
## debug: cat("Score:", pct, "% -> Grade:", grade, "\n")
## Score: 85 % -> Grade: B 
## debug: return(invisible(list(pct = pct, grade = grade)))

calculate_grade(72)

## Tracing calculate_grade(72) on entry 
## Called from: eval(expr, p)
## debug: {
##     pct <- round((score/total) * 100, 1)
##     grade <- if (pct >= 90) 
##         "A"
##     else if (pct >= 80) 
##         "B"
##     else if (pct >= 70) 
##         "C"
##     else "F"
##     cat("Score:", pct, "% -> Grade:", grade, "\n")
##     return(invisible(list(pct = pct, grade = grade)))
## }
## debug: pct <- round((score/total) * 100, 1)
## debug: grade <- if (pct >= 90) "A" else if (pct >= 80) "B" else if (pct >= 
##     70) "C" else "F"
## debug: if (pct >= 80) "B" else if (pct >= 70) "C" else "F"
## debug: if (pct >= 70) "C" else "F"
## debug: [1] "C"
## debug: cat("Score:", pct, "% -> Grade:", grade, "\n")
## Score: 72 % -> Grade: C 
## debug: return(invisible(list(pct = pct, grade = grade)))

calculate_grade(95)

## Tracing calculate_grade(95) on entry 
## Called from: eval(expr, p)
## debug: {
##     pct <- round((score/total) * 100, 1)
##     grade <- if (pct >= 90) 
##         "A"
##     else if (pct >= 80) 
##         "B"
##     else if (pct >= 70) 
##         "C"
##     else "F"
##     cat("Score:", pct, "% -> Grade:", grade, "\n")
##     return(invisible(list(pct = pct, grade = grade)))
## }
## debug: pct <- round((score/total) * 100, 1)
## debug: grade <- if (pct >= 90) "A" else if (pct >= 80) "B" else if (pct >= 
##     70) "C" else "F"
## debug: [1] "A"
## debug: cat("Score:", pct, "% -> Grade:", grade, "\n")
## Score: 95 % -> Grade: A 
## debug: return(invisible(list(pct = pct, grade = grade)))

# Remove the tracing installed by the earlier trace() call
untrace(calculate_grade)

###### Recover() function
# By executing options(error = recover), R halts on error and presents the call
# stack — the sequence of function calls that led to the crash — letting you
# inspect the environment at each level interactively.

#### Homework 5 & 6

### a. Custom mean function
#' Arithmetic mean of a numeric vector
#'
#' @param x A non-empty numeric (integer or double) vector.
#' @return The sum of `x` divided by its length, as a double. Missing values
#'   are not removed, so the result is `NA` if `x` contains any.
#' @details Stops with an error if `x` is not numeric or has length zero.
data_mean <- function(x) {
  if (!is.numeric(x)) stop("Input must be a numeric vector")
  if (length(x) == 0) stop("Input vector cannot be empty")

  # Add up in double precision: sum() on an integer vector returns NA (with a
  # warning) once the total exceeds the integer range
  sum(as.numeric(x)) / length(x)
}

### b. Statistical summary function
#' One-row table of descriptive statistics for a numeric vector
#'
#' Missing values are ignored in every statistic.
#'
#' @param x A numeric vector.
#' @return A one-row data frame with columns `Mean`, `Median` and `Std_Dev`
#'   (each rounded to two decimals) plus `Min` and `Max` (unrounded). When
#'   `x` has no non-missing values, `Min` and `Max` are `NA`.
#' @details Stops with an error if `x` is not numeric.
my_summary_stats <- function(x) {
  if (!is.numeric(x)) stop("Error: This function only works with numeric variables.")

  # Drop missing values once, up front
  observed <- x[!is.na(x)]
  # min()/max() of an empty vector warn and return Inf/-Inf, which would look
  # like real extremes in the table, so report NA in that case
  has_values <- length(observed) > 0

  data.frame(
    Mean    = round(mean(observed), 2),
    Median  = round(median(observed), 2),
    Std_Dev = round(sd(observed), 2),
    Min     = if (has_values) min(observed) else NA_real_,
    Max     = if (has_values) max(observed) else NA_real_
  )
}

# World population sample dataset
# Small hand-entered sample: six nations with their continent, population
# (column Pop_2023) and land area (column Land_Area_km2).
world_pop_data <- local({
  # Positions line up across the four vectors: element i of each vector
  # describes the same nation
  nation <- c("China", "India", "United States", "Nigeria", "Rwanda", "Kenya")
  continent <- c("Asia", "Asia", "North America", "Africa", "Africa", "Africa")
  population <- c(
    1425671352, 1428627663, 339996563,
    223804632, 14094683, 54027487
  )
  land_area <- c(9596960, 3287263, 9833517, 923768, 26338, 580367)

  data.frame(
    Nation = nation,
    Continent = continent,
    Pop_2023 = population,
    Land_Area_km2 = land_area
  )
})

cat("--- Summary Statistics for 2023 Population ---\n")

## --- Summary Statistics for 2023 Population ---

print(my_summary_stats(world_pop_data$Pop_2023))

##        Mean    Median   Std_Dev      Min        Max
## 1 581037063 281900598 665837096 14094683 1428627663

cat("\n--- Summary Statistics for Land Area (km2) ---\n")

## 
## --- Summary Statistics for Land Area (km2) ---

print(my_summary_stats(world_pop_data$Land_Area_km2))

##      Mean  Median Std_Dev   Min     Max
## 1 4041369 2105516 4534329 26338 9833517

### Custom two-sample t-test function
#' Two-sample t-test condensed into a one-row data frame
#'
#' Runs `t.test(group1, group2)` with its default settings and keeps the
#' headline numbers.
#'
#' @param group1 Observations of the first group, passed to `t.test()`.
#' @param group2 Observations of the second group, passed to `t.test()`.
#' @return A one-row data frame with `T_Statistic` (3 decimals), `P_Value`
#'   (4 significant digits), `Mean_Group1` and `Mean_Group2` (2 decimals),
#'   and `Significant` ("Yes" when the p-value is below 0.05, else "No").
custom_t_test <- function(group1, group2) {
  fit <- t.test(group1, group2)
  p <- fit$p.value
  group_means <- fit$estimate

  summary_row <- data.frame(
    T_Statistic = round(fit$statistic, 3),
    P_Value     = signif(p, 4),
    Mean_Group1 = round(group_means[[1]], 2),
    Mean_Group2 = round(group_means[[2]], 2),
    Significant = ifelse(p < 0.05, "Yes", "No")
  )

  # The named t statistic would otherwise label the row; reset to default
  rownames(summary_row) <- NULL
  summary_row
}

africa_area <- world_pop_data$Land_Area_km2[world_pop_data$Continent == "Africa"]
asia_area   <- world_pop_data$Land_Area_km2[world_pop_data$Continent == "Asia"]

cat("--- Two-Sample T-Test: Land Area (Africa vs. Asia) ---\n")

## --- Two-Sample T-Test: Land Area (Africa vs. Asia) ---

print(custom_t_test(africa_area, asia_area))

##   T_Statistic P_Value Mean_Group1 Mean_Group2 Significant
## 1      -1.874  0.3095    510157.7     6442112          No

### Homework 6: Apply family
numeric_data <- world_pop_data[, c("Pop_2023", "Land_Area_km2")]

# 1. lapply()
cat("--- 1. lapply() Output (Returns a List) ---\n")

## --- 1. lapply() Output (Returns a List) ---

print(lapply(numeric_data, function(x) round(x / 1000000, 2)))

## $Pop_2023
## [1] 1425.67 1428.63  340.00  223.80   14.09   54.03
## 
## $Land_Area_km2
## [1] 9.60 3.29 9.83 0.92 0.03 0.58

# 2. sapply()
cat("\n--- 2. sapply() Output (Returns a Matrix) ---\n")

## 
## --- 2. sapply() Output (Returns a Matrix) ---

print(sapply(numeric_data, function(x) round(x / 1000000, 2)))

##      Pop_2023 Land_Area_km2
## [1,]  1425.67          9.60
## [2,]  1428.63          3.29
## [3,]   340.00          9.83
## [4,]   223.80          0.92
## [5,]    14.09          0.03
## [6,]    54.03          0.58

# 3. vapply()
cat("\n--- 3. vapply() Output (Safe and Strict) ---\n")

## 
## --- 3. vapply() Output (Safe and Strict) ---

# FUN.VALUE is the template every per-column result must match. Its length is
# taken from the data instead of a hard-coded 6, so the call keeps working
# when rows are added to or removed from numeric_data.
print(vapply(
  numeric_data,
  function(x) round(x / 1000000, 2),
  FUN.VALUE = numeric(nrow(numeric_data))
))

##      Pop_2023 Land_Area_km2
## [1,]  1425.67          9.60
## [2,]  1428.63          3.29
## [3,]   340.00          9.83
## [4,]   223.80          0.92
## [5,]    14.09          0.03
## [6,]    54.03          0.58

# 4. tapply()
cat("\n--- 4. tapply() Output (Grouped Calculation) ---\n")

## 
## --- 4. tapply() Output (Grouped Calculation) ---

print(tapply(world_pop_data$Pop_2023, world_pop_data$Continent, mean))

##        Africa          Asia North America 
##      97308934    1427149508     339996563

# 5. mapply()
cat("\n--- 5. mapply() Output (Population Density) ---\n")

## 
## --- 5. mapply() Output (Population Density) ---

density_calc <- mapply(function(pop, area) pop / area,
                       world_pop_data$Pop_2023,
                       world_pop_data$Land_Area_km2)
print(round(density_calc, 2))

## [1] 148.55 434.59  34.58 242.27 535.15  93.09

# 20251MBI040_Dieudonne_Nirembere

# NIREMBERE Dieudonne

# 24/05/2026