#####. Homework 1: Import data from a statistical package and from a database management system.
library(readxl)
library(readr)
library(dplyr)
library(DBI)
library(RMariaDB)
library(tidyverse)
# Dataset 1: read the Ethics final-grade workbook (relative path, so it is
# resolved against the current working directory).
finalethics <- read_excel("final grade - Ethics 2026 Jan -students.xlsx")
# View() opens a data viewer window, which is only meaningful in an
# interactive session; skip it when the script is sourced or rendered.
if (interactive()) {
  View(finalethics)
}
# Per-column summary of the imported table.
summary(finalethics)
## ...1 ID PR Quizzes
## Min. : 1.00 Length :46 Min. :0.000 Min. : 6.200
## 1st Qu.:12.25 N.unique :46 1st Qu.:3.000 1st Qu.: 9.025
## Median :23.50 N.blank : 0 Median :3.000 Median : 9.500
## Mean :23.50 Min.nchar:11 Mean :3.337 Mean : 9.293
## 3rd Qu.:34.75 Max.nchar:11 3rd Qu.:4.000 3rd Qu.: 9.800
## Max. :46.00 Max. :5.000 Max. :10.000
## Pres MT/30 FE/40 Total/100
## Min. : 8.00 Min. :15.27 Min. :23.11 Min. :63.81
## 1st Qu.:11.00 1st Qu.:21.27 1st Qu.:29.33 1st Qu.:75.08
## Median :12.00 Median :23.45 Median :31.56 Median :78.96
## Mean :11.46 Mean :23.12 Mean :31.23 Mean :78.44
## 3rd Qu.:12.00 3rd Qu.:25.09 3rd Qu.:32.89 3rd Qu.:83.50
## Max. :14.00 Max. :27.82 Max. :37.33 Max. :88.08
## Average/20
## Min. :12.76
## 1st Qu.:15.02
## Median :15.79
## Mean :15.69
## 3rd Qu.:16.70
## Max. :17.62
# Column names of the grades table.
colnames(finalethics)
## [1] "...1" "ID" "PR" "Quizzes" "Pres"
## [6] "MT/30" "FE/40" "Total/100" "Average/20"
##### dataset 2:
# Dataset 2: read the "advanced" workbook (relative path, so it is resolved
# against the current working directory).
advanced <- read_excel("advanced.xlsx")
# View() opens a data viewer window, which is only meaningful in an
# interactive session; skip it when the script is sourced or rendered.
if (interactive()) {
  View(advanced)
}
# Per-column summary of the imported table.
summary(advanced)
## # ID Names Project /20 Presentation /30
## Min. : 1.00 Length :46 Length :46 Min. :15.00 Min. :26.00
## 1st Qu.:12.25 N.unique :46 N.unique :46 1st Qu.:15.00 1st Qu.:26.00
## Median :23.50 N.blank : 0 N.blank : 0 Median :16.00 Median :27.00
## Mean :23.50 Min.nchar:11 Min.nchar:11 Mean :15.59 Mean :26.85
## 3rd Qu.:34.75 Max.nchar:11 Max.nchar:28 3rd Qu.:16.00 3rd Qu.:27.00
## Max. :46.00 Max. :16.00 Max. :28.00
## Application /10 Final Exam /40 Total /100 Average /20
## Min. : 8.000 Min. :28.00 Min. :78.00 Min. :15.60
## 1st Qu.: 9.000 1st Qu.:30.25 1st Qu.:82.00 1st Qu.:16.40
## Median :10.000 Median :34.50 Median :86.00 Median :17.20
## Mean : 9.674 Mean :33.43 Mean :85.54 Mean :17.11
## 3rd Qu.:10.000 3rd Qu.:36.00 3rd Qu.:89.00 3rd Qu.:17.80
## Max. :10.000 Max. :39.00 Max. :91.00 Max. :18.20
## data set 3:
# Dataset 3: read the `students` table from a local MariaDB database.
# NOTE(review): credentials are hard-coded (root, empty password) — fine for a
# local exercise, but confirm before reusing this anywhere shared.
con <- dbConnect(
  RMariaDB::MariaDB(),
  user = "root",
  password = "",
  dbname = "user_dieudonne",
  host = "localhost",
  port = 3306
)
# Tables available in the database.
dbListTables(con)
## [1] "students"
# Columns of the `students` table.
dbListFields(con, "students")
## [1] "Identity" "Names" "Marks" "Class"
# Pull the whole table into a data frame.
my_tb <- dbReadTable(con, "students")
# The connection is not used after this point, so release it explicitly
# instead of leaving it open for the rest of the session.
dbDisconnect(con)
# View() is only meaningful in an interactive session.
if (interactive()) {
  View(my_tb)
}
## data set 4: world population
# Dataset 4: read the world population CSV (relative path, so it is resolved
# against the current working directory).
world_population <- read_csv("world_population.csv")
# View() opens a data viewer window, which is only meaningful in an
# interactive session; skip it when the script is sourced or rendered.
if (interactive()) {
  View(world_population)
}
# Per-column summary of the imported table.
summary(world_population)
## Rank CCA3 Country/Territory Capital
## Min. : 1.00 Length :234 Length :234 Length :234
## 1st Qu.: 59.25 N.unique :234 N.unique :234 N.unique :234
## Median :117.50 N.blank : 0 N.blank : 0 N.blank : 0
## Mean :117.50 Min.nchar: 3 Min.nchar: 4 Min.nchar: 4
## 3rd Qu.:175.75 Max.nchar: 3 Max.nchar: 32 Max.nchar: 19
## Max. :234.00
## Continent 2022 Population 2020 Population 2015 Population
## Length :234 Min. :5.100e+02 Min. :5.200e+02 Min. :5.640e+02
## N.unique : 6 1st Qu.:4.197e+05 1st Qu.:4.153e+05 1st Qu.:4.047e+05
## N.blank : 0 Median :5.560e+06 Median :5.493e+06 Median :5.307e+06
## Min.nchar: 4 Mean :3.407e+07 Mean :3.350e+07 Mean :3.173e+07
## Max.nchar: 13 3rd Qu.:2.248e+07 3rd Qu.:2.145e+07 3rd Qu.:1.973e+07
## Max. :1.426e+09 Max. :1.425e+09 Max. :1.394e+09
## 2010 Population 2000 Population 1990 Population
## Min. :5.960e+02 Min. :6.510e+02 Min. :7.000e+02
## 1st Qu.:3.931e+05 1st Qu.:3.272e+05 1st Qu.:2.641e+05
## Median :4.943e+06 Median :4.293e+06 Median :3.825e+06
## Mean :2.985e+07 Mean :2.627e+07 Mean :2.271e+07
## 3rd Qu.:1.916e+07 3rd Qu.:1.576e+07 3rd Qu.:1.187e+07
## Max. :1.348e+09 Max. :1.264e+09 Max. :1.154e+09
## 1980 Population 1970 Population Area (km²) Density (per km²)
## Min. : 733 Min. : 752 Min. : 1 Min. :2.610e-02
## 1st Qu.: 229614 1st Qu.: 155997 1st Qu.: 2650 1st Qu.:3.842e+01
## Median : 3141146 Median : 2604830 Median : 81200 Median :9.535e+01
## Mean : 18984617 Mean : 15786909 Mean : 581449 Mean :4.521e+02
## 3rd Qu.: 9826054 3rd Qu.: 8817329 3rd Qu.: 430426 3rd Qu.:2.389e+02
## Max. :982372466 Max. :822534450 Max. :17098242 Max. :2.317e+04
## Growth Rate World Population Percentage
## Min. :0.912 Min. : 0.0000
## 1st Qu.:1.002 1st Qu.: 0.0100
## Median :1.008 Median : 0.0700
## Mean :1.010 Mean : 0.4271
## 3rd Qu.:1.017 3rd Qu.: 0.2800
## Max. :1.069 Max. :17.8800
# Column names of the table.
colnames(world_population)
## [1] "Rank" "CCA3"
## [3] "Country/Territory" "Capital"
## [5] "Continent" "2022 Population"
## [7] "2020 Population" "2015 Population"
## [9] "2010 Population" "2000 Population"
## [11] "1990 Population" "1980 Population"
## [13] "1970 Population" "Area (km²)"
## [15] "Density (per km²)" "Growth Rate"
## [17] "World Population Percentage"
# Number of columns, counted from the name vector.
length(colnames(world_population))
## [1] 17
# First five rows.
head(world_population, 5)
## # A tibble: 5 × 17
## Rank CCA3 `Country/Territory` Capital Continent `2022 Population`
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## # ℹ 11 more variables: `2020 Population` <dbl>, `2015 Population` <dbl>,
## # `2010 Population` <dbl>, `2000 Population` <dbl>, `1990 Population` <dbl>,
## # `1980 Population` <dbl>, `1970 Population` <dbl>, `Area (km²)` <dbl>,
## # `Density (per km²)` <dbl>, `Growth Rate` <dbl>,
## # `World Population Percentage` <dbl>
# Last ten rows.
tail(world_population, 10)
## # A tibble: 10 × 17
## Rank CCA3 `Country/Territory` Capital Continent `2022 Population`
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 43 UZB Uzbekistan Tashkent Asia 34627652
## 2 181 VUT Vanuatu Port-Vila Oceania 326740
## 3 234 VAT Vatican City Vatican City Europe 510
## 4 51 VEN Venezuela Caracas South America 28301696
## 5 16 VNM Vietnam Hanoi Asia 98186856
## 6 226 WLF Wallis and Futuna Mata-Utu Oceania 11572
## 7 172 ESH Western Sahara El Aaiún Africa 575986
## 8 46 YEM Yemen Sanaa Asia 33696614
## 9 63 ZMB Zambia Lusaka Africa 20017675
## 10 74 ZWE Zimbabwe Harare Africa 16320537
## # ℹ 11 more variables: `2020 Population` <dbl>, `2015 Population` <dbl>,
## # `2010 Population` <dbl>, `2000 Population` <dbl>, `1990 Population` <dbl>,
## # `1980 Population` <dbl>, `1970 Population` <dbl>, `Area (km²)` <dbl>,
## # `Density (per km²)` <dbl>, `Growth Rate` <dbl>,
## # `World Population Percentage` <dbl>
# Storage type of the table itself and of two representative columns.
typeof(world_population)
## [1] "list"
typeof(world_population[["Rank"]])
## [1] "double"
typeof(world_population[["CCA3"]])
## [1] "character"
# Row count, column count, and both together.
nrow(world_population)
## [1] 234
ncol(world_population)
## [1] 17
dim(world_population)
## [1] 234 17
# Compact column-by-column overview.
glimpse(world_population)
## Rows: 234
## Columns: 17
## $ Rank <dbl> 36, 138, 34, 213, 203, 42, 224, 201, 33,…
## $ CCA3 <chr> "AFG", "ALB", "DZA", "ASM", "AND", "AGO"…
## $ `Country/Territory` <chr> "Afghanistan", "Albania", "Algeria", "Am…
## $ Capital <chr> "Kabul", "Tirana", "Algiers", "Pago Pago…
## $ Continent <chr> "Asia", "Europe", "Africa", "Oceania", "…
## $ `2022 Population` <dbl> 41128771, 2842321, 44903225, 44273, 7982…
## $ `2020 Population` <dbl> 38972230, 2866849, 43451666, 46189, 7770…
## $ `2015 Population` <dbl> 33753499, 2882481, 39543154, 51368, 7174…
## $ `2010 Population` <dbl> 28189672, 2913399, 35856344, 54849, 7151…
## $ `2000 Population` <dbl> 19542982, 3182021, 30774621, 58230, 6609…
## $ `1990 Population` <dbl> 10694796, 3295066, 25518074, 47818, 5356…
## $ `1980 Population` <dbl> 12486631, 2941651, 18739378, 32886, 3561…
## $ `1970 Population` <dbl> 10752971, 2324731, 13795915, 27075, 1986…
## $ `Area (km²)` <dbl> 652230, 28748, 2381741, 199, 468, 124670…
## $ `Density (per km²)` <dbl> 63.0587, 98.8702, 18.8531, 222.4774, 170…
## $ `Growth Rate` <dbl> 1.0257, 0.9957, 1.0164, 0.9831, 1.0100, …
## $ `World Population Percentage` <dbl> 0.52, 0.04, 0.56, 0.00, 0.00, 0.45, 0.00…
# Rows that repeat an earlier row (none in the recorded run).
world_population[which(duplicated(world_population)), ]
## # A tibble: 0 × 17
## # ℹ 17 variables: Rank <dbl>, CCA3 <chr>, Country/Territory <chr>,
## # Capital <chr>, Continent <chr>, 2022 Population <dbl>,
## # 2020 Population <dbl>, 2015 Population <dbl>, 2010 Population <dbl>,
## # 2000 Population <dbl>, 1990 Population <dbl>, 1980 Population <dbl>,
## # 1970 Population <dbl>, Area (km²) <dbl>, Density (per km²) <dbl>,
## # Growth Rate <dbl>, World Population Percentage <dbl>
# Same check scanning from the last row backwards, reported as a count.
sum(duplicated(world_population, fromLast = TRUE))
## [1] 0
# Missing-value counts for four of the columns.
sum(is.na(world_population[["Rank"]]))
## [1] 0
sum(is.na(world_population[["CCA3"]]))
## [1] 0
sum(is.na(world_population[["Capital"]]))
## [1] 0
sum(is.na(world_population[["Continent"]]))
## [1] 0
#####. Homework 2: Merging data sets
#####. a. Merging finalethics and advanced by ID
# Join the two grade tables on their shared "ID" column; merge() keeps only
# the IDs present in both tables by default.
lessons <- merge(finalethics, advanced, by = "ID")
# View() is only meaningful in an interactive session; skip it when the
# script is sourced or rendered.
if (interactive()) {
  View(lessons)
}
#####. Homework 3: dplyr verbs on world_population
# FIX 1: was "World_population" (capital W) — corrected to world_population
# FIX 2: arrange(desc(world_population)) — corrected to arrange(desc(avg_population))
# Per-continent summary of the countries whose 2022 population exceeds
# 50 million, sorted by average population (largest first).
result <- world_population %>%
  # Keep the four columns of interest, renaming two of them on the way.
  select(
    Country = `Country/Territory`,
    CCA3,
    Continent,
    Pop_2022 = `2022 Population`
  ) %>%
  # Large-population countries only.
  filter(Pop_2022 > 5e7) %>%
  # Population expressed in millions.
  mutate(Pop_Millions = Pop_2022 / 1e6) %>%
  # One output row per continent.
  group_by(Continent) %>%
  summarise(
    avg_population = mean(Pop_Millions, na.rm = TRUE),
    total_population = sum(Pop_2022, na.rm = TRUE),
    countries = n()
  ) %>%
  arrange(desc(avg_population))
# Print the summary table.
result
## # A tibble: 5 × 4
## Continent avg_population total_population countries
## <chr> <dbl> <dbl> <int>
## 1 Asia 324. 4214854597 13
## 2 North America 233. 465793982 2
## 3 South America 134. 267187522 2
## 4 Africa 104. 731340571 7
## 5 Europe 83.9 419256195 5
#####. Homework 4: trace() function
# The trace() function inserts temporary debugging code into a function without
# permanently changing its source code. It is useful when investigating functions
# in external packages or deeply nested scripts.
# Convert `score` to a percentage of `total` (rounded to one decimal), map it
# to a letter grade (A >= 90, B >= 80, C >= 70, otherwise F), print a one-line
# report, and invisibly return list(pct = <percentage>, grade = <letter>).
calculate_grade <- function(score, total = 100) {
  percent <- round((score / total) * 100, 1)
  if (pct_is_a <- percent >= 90) {
    letter <- "A"
  } else if (percent >= 80) {
    letter <- "B"
  } else if (percent >= 70) {
    letter <- "C"
  } else {
    letter <- "F"
  }
  cat("Score:", percent, "% -> Grade:", letter, "\n")
  invisible(list(pct = percent, grade = letter))
}
# Baseline call before tracing is enabled: only the grade line is printed.
calculate_grade(3)
## Score: 3 % -> Grade: F
# Attach browser() as the tracer for calculate_grade(); the recorded output
# below shows it being entered at the start of each traced call.
trace(calculate_grade, tracer = browser)
## [1] "calculate_grade"
# Test with tracing active
calculate_grade(85)
## Tracing calculate_grade(85) on entry
## Called from: eval(expr, p)
## debug: {
## pct <- round((score/total) * 100, 1)
## grade <- if (pct >= 90)
## "A"
## else if (pct >= 80)
## "B"
## else if (pct >= 70)
## "C"
## else "F"
## cat("Score:", pct, "% -> Grade:", grade, "\n")
## return(invisible(list(pct = pct, grade = grade)))
## }
## debug: pct <- round((score/total) * 100, 1)
## debug: grade <- if (pct >= 90) "A" else if (pct >= 80) "B" else if (pct >=
## 70) "C" else "F"
## debug: if (pct >= 80) "B" else if (pct >= 70) "C" else "F"
## debug: [1] "B"
## debug: cat("Score:", pct, "% -> Grade:", grade, "\n")
## Score: 85 % -> Grade: B
## debug: return(invisible(list(pct = pct, grade = grade)))
# Traced call with a score that falls through to the "C" branch.
calculate_grade(72)
## Tracing calculate_grade(72) on entry
## Called from: eval(expr, p)
## debug: {
## pct <- round((score/total) * 100, 1)
## grade <- if (pct >= 90)
## "A"
## else if (pct >= 80)
## "B"
## else if (pct >= 70)
## "C"
## else "F"
## cat("Score:", pct, "% -> Grade:", grade, "\n")
## return(invisible(list(pct = pct, grade = grade)))
## }
## debug: pct <- round((score/total) * 100, 1)
## debug: grade <- if (pct >= 90) "A" else if (pct >= 80) "B" else if (pct >=
## 70) "C" else "F"
## debug: if (pct >= 80) "B" else if (pct >= 70) "C" else "F"
## debug: if (pct >= 70) "C" else "F"
## debug: [1] "C"
## debug: cat("Score:", pct, "% -> Grade:", grade, "\n")
## Score: 72 % -> Grade: C
## debug: return(invisible(list(pct = pct, grade = grade)))
# Traced call with a score that takes the first ("A") branch.
calculate_grade(95)
## Tracing calculate_grade(95) on entry
## Called from: eval(expr, p)
## debug: {
## pct <- round((score/total) * 100, 1)
## grade <- if (pct >= 90)
## "A"
## else if (pct >= 80)
## "B"
## else if (pct >= 70)
## "C"
## else "F"
## cat("Score:", pct, "% -> Grade:", grade, "\n")
## return(invisible(list(pct = pct, grade = grade)))
## }
## debug: pct <- round((score/total) * 100, 1)
## debug: grade <- if (pct >= 90) "A" else if (pct >= 80) "B" else if (pct >=
## 70) "C" else "F"
## debug: [1] "A"
## debug: cat("Score:", pct, "% -> Grade:", grade, "\n")
## Score: 95 % -> Grade: A
## debug: return(invisible(list(pct = pct, grade = grade)))
# Remove the tracer installed by trace() above.
untrace(calculate_grade)
###### recover() function
# By executing options(error = recover), R halts on error and presents the call
# stack — the sequence of function calls that led to the crash — letting you
# inspect the environment at each level interactively.
#### Homework 5 & 6
### a. Custom mean function
# Arithmetic mean of a numeric vector: sum of the values divided by their count.
#
# Arguments:
#   x      Non-empty numeric vector.
#   na.rm  If TRUE, missing values are dropped before averaging. With the
#          default FALSE, any NA in `x` makes the result NA.
# Returns: a single numeric value (NaN when na.rm = TRUE leaves no values).
# Errors:  stops if `x` is not numeric or has length zero.
data_mean <- function(x, na.rm = FALSE) {
  if (!is.numeric(x)) stop("Input must be a numeric vector")
  if (length(x) == 0) stop("Input vector cannot be empty")
  if (isTRUE(na.rm)) {
    x <- x[!is.na(x)]
  }
  sum(x) / length(x)
}
### b. Statistical summary function
# One-row data frame of summary statistics for a numeric vector.
#
# Arguments:
#   x  Numeric vector; missing values are ignored.
# Returns: data frame with columns Mean, Median, Std_Dev (each rounded to two
#   decimals), Min and Max. When `x` has no non-missing values, Min and Max
#   are NA rather than Inf / -Inf.
# Errors:  stops if `x` is not numeric.
my_summary_stats <- function(x) {
  if (!is.numeric(x)) stop("Error: This function only works with numeric variables.")
  # min()/max() over zero values warn and return Inf/-Inf, so only call them
  # when at least one non-missing value is present.
  has_values <- any(!is.na(x))
  data.frame(
    Mean = round(mean(x, na.rm = TRUE), 2),
    Median = round(median(x, na.rm = TRUE), 2),
    Std_Dev = round(sd(x, na.rm = TRUE), 2),
    Min = if (has_values) min(x, na.rm = TRUE) else NA_real_,
    Max = if (has_values) max(x, na.rm = TRUE) else NA_real_
  )
}
# World population sample dataset
# Hand-entered sample of six countries: name, continent, 2023 population and
# land area in km2 (as the column names indicate).
world_pop_data <- data.frame(
  Nation = c(
    "China", "India", "United States",
    "Nigeria", "Rwanda", "Kenya"
  ),
  Continent = c(
    "Asia", "Asia", "North America",
    "Africa", "Africa", "Africa"
  ),
  Pop_2023 = c(
    1425671352, 1428627663, 339996563,
    223804632, 14094683, 54027487
  ),
  Land_Area_km2 = c(
    9596960, 3287263, 9833517,
    923768, 26338, 580367
  )
)
# Summary statistics for the population column.
cat("--- Summary Statistics for 2023 Population ---\n")
## --- Summary Statistics for 2023 Population ---
print(my_summary_stats(world_pop_data[["Pop_2023"]]))
## Mean Median Std_Dev Min Max
## 1 581037063 281900598 665837096 14094683 1428627663
# Summary statistics for the land-area column.
cat("\n--- Summary Statistics for Land Area (km2) ---\n")
##
## --- Summary Statistics for Land Area (km2) ---
print(my_summary_stats(world_pop_data[["Land_Area_km2"]]))
## Mean Median Std_Dev Min Max
## 1 4041369 2105516 4534329 26338 9833517
### Custom two-sample t-test function
# Two-sample t-test (t.test() defaults) reported as a one-row data frame.
#
# Arguments:
#   group1, group2  Numeric vectors holding the two samples.
#   alpha           Significance threshold the p-value is compared against
#                   (default 0.05).
# Returns: data frame with T_Statistic (3 decimals), P_Value (4 significant
#   digits), Mean_Group1 / Mean_Group2 (2 decimals) and Significant
#   ("Yes" when the p-value is below `alpha`, otherwise "No").
custom_t_test <- function(group1, group2, alpha = 0.05) {
  stopifnot(is.numeric(alpha), length(alpha) == 1, !is.na(alpha))
  test_results <- t.test(group1, group2)
  p_value <- test_results$p.value
  clean_output <- data.frame(
    T_Statistic = round(test_results$statistic, 3),
    P_Value = signif(p_value, 4),
    Mean_Group1 = round(test_results$estimate[1], 2),
    Mean_Group2 = round(test_results$estimate[2], 2),
    Significant = if (p_value < alpha) "Yes" else "No"
  )
  # The statistic is a named vector ("t"), which data.frame() would otherwise
  # turn into the row name.
  rownames(clean_output) <- NULL
  clean_output
}
# Land areas of the African and Asian countries in the sample.
africa_area <- with(world_pop_data, Land_Area_km2[Continent == "Africa"])
asia_area <- with(world_pop_data, Land_Area_km2[Continent == "Asia"])
cat("--- Two-Sample T-Test: Land Area (Africa vs. Asia) ---\n")
## --- Two-Sample T-Test: Land Area (Africa vs. Asia) ---
print(custom_t_test(africa_area, asia_area))
## T_Statistic P_Value Mean_Group1 Mean_Group2 Significant
## 1 -1.874 0.3095 510157.7 6442112 No
### Homework 6: Apply family
# The two numeric columns of the sample, used by the column-wise examples.
numeric_data <- world_pop_data[, c("Pop_2023", "Land_Area_km2")]
# 1. lapply(): apply the function to each column, always returning a list.
cat("--- 1. lapply() Output (Returns a List) ---\n")
## --- 1. lapply() Output (Returns a List) ---
print(lapply(numeric_data, function(x) round(x / 1000000, 2)))
## $Pop_2023
## [1] 1425.67 1428.63 340.00 223.80 14.09 54.03
##
## $Land_Area_km2
## [1] 9.60 3.29 9.83 0.92 0.03 0.58
# 2. sapply(): same call, simplified to a matrix because every column yields
#    a result of the same length.
cat("\n--- 2. sapply() Output (Returns a Matrix) ---\n")
##
## --- 2. sapply() Output (Returns a Matrix) ---
print(sapply(numeric_data, function(x) round(x / 1000000, 2)))
## Pop_2023 Land_Area_km2
## [1,] 1425.67 9.60
## [2,] 1428.63 3.29
## [3,] 340.00 9.83
## [4,] 223.80 0.92
## [5,] 14.09 0.03
## [6,] 54.03 0.58
# 3. vapply(): like sapply(), but the shape of each result is declared up
#    front. The expected length is taken from the data instead of being
#    hard-coded, so the example keeps working if rows are added or removed.
cat("\n--- 3. vapply() Output (Safe and Strict) ---\n")
##
## --- 3. vapply() Output (Safe and Strict) ---
print(vapply(numeric_data, function(x) round(x / 1000000, 2),
             FUN.VALUE = numeric(nrow(numeric_data))))
## Pop_2023 Land_Area_km2
## [1,] 1425.67 9.60
## [2,] 1428.63 3.29
## [3,] 340.00 9.83
## [4,] 223.80 0.92
## [5,] 14.09 0.03
## [6,] 54.03 0.58
# 4. tapply(): mean population within each continent.
cat("\n--- 4. tapply() Output (Grouped Calculation) ---\n")
##
## --- 4. tapply() Output (Grouped Calculation) ---
print(tapply(world_pop_data$Pop_2023, world_pop_data$Continent, mean))
## Africa Asia North America
## 97308934 1427149508 339996563
# 5. mapply(): walk two vectors in parallel, here population and land area,
#    to get people per km2 for each country.
cat("\n--- 5. mapply() Output (Population Density) ---\n")
##
## --- 5. mapply() Output (Population Density) ---
density_calc <- mapply(function(pop, area) pop / area,
                       world_pop_data$Pop_2023,
                       world_pop_data$Land_Area_km2)
print(round(density_calc, 2))
## [1] 148.55 434.59 34.58 242.27 535.15 93.09