library(DBI) #provides a database interface in R, and RMySQL is the driver for connecting to MySQL.
library(RMySQL)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
# Open a connection to the local MySQL server through the RMySQL driver.
# NOTE(review): the credentials are hard-coded (root user, empty password) and
# no dbDisconnect(con) is visible in this section - confirm this is intended.
con <- dbConnect(
  drv = MySQL(),
  user = "root",
  password = "",
  dbname = "rukundo_patrick",
  host = "127.0.0.1"
)
dbIsValid(con)
## [1] TRUE
dbListTables(con)
## [1] "rukundo_s_child"
Data <- dbReadTable(con, "rukundo_s_child")
View(Data)
data("Titanic")
str(Titanic)
## 'table' num [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
## - attr(*, "dimnames")=List of 4
## ..$ Class : chr [1:4] "1st" "2nd" "3rd" "Crew"
## ..$ Sex : chr [1:2] "Male" "Female"
## ..$ Age : chr [1:2] "Child" "Adult"
## ..$ Survived: chr [1:2] "No" "Yes"
summary(Titanic)
## Number of cases in table: 2201
## Number of factors: 4
## Test for independence of all factors:
## Chisq = 1637.4, df = 25, p-value = 0
## Chi-squared approximation may be incorrect
# List the classifying variables of the Titanic contingency table.
# variable.names() on this table returns the levels of Sex ("Male", "Female")
# rather than the variable names, so read the names of the dimnames instead.
names(dimnames(Titanic))
## [1] "Male" "Female"
# 1.3 Importing data downloaded from Kaggle
df10<-read_csv("C:/Users/PC/Downloads/world_wide_population.csv")
## Rows: 234 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): CCA3, Country/Territory, Capital, Continent
## dbl (13): Rank, 2022 Population, 2020 Population, 2015 Population, 2010 Popu...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(df10)
df20<-read_csv("C:/Users/PC/Downloads/CO2_emissions.csv")
## New names:
## Rows: 215 Columns: 35
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): Country Name, country_code, Region, Indicator Name dbl (31): 1990, 1991,
## 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `2019` -> `2019...34`
## • `2019` -> `2019...35`
View(df20)
variable.names(df10)
## [1] "Rank" "CCA3"
## [3] "Country/Territory" "Capital"
## [5] "Continent" "2022 Population"
## [7] "2020 Population" "2015 Population"
## [9] "2010 Population" "2000 Population"
## [11] "1990 Population" "1980 Population"
## [13] "1970 Population" "Area (km²)"
## [15] "Density (per km²)" "Growth Rate"
## [17] "World Population Percentage"
# Join the population table to the CO2 table on two key pairs:
# Continent <-> Region and Country/Territory <-> Country Name.
# NOTE(review): rows are kept only where both keys match exactly - confirm
# that Region in the CO2 data uses the same labels as Continent.
merged_df <- merge(
  x = df10,
  y = df20,
  by.x = c("Continent", "Country/Territory"),
  by.y = c("Region", "Country Name")
)
View(merged_df)
## 3.1 The use of group_by() and %>%: use them when you want to perform calculations (mean, sum, count) separately for each group.
# Label every country by the size of its 2022 population.
# case_when() tests the conditions in order, so the 100M threshold is checked
# before the 10M one; anything that matches neither gets the default label.
df10 %>%
  mutate(
    size_category = case_when(
      `2022 Population` >= 1e8 ~ "Large (100M+)",
      `2022 Population` >= 1e7 ~ "Medium (10M-100M)",
      .default = "Small (under 10M)"
    )
  )
## # A tibble: 234 × 18
## Rank CCA3 `Country/Territory` Capital Continent `2022 Population`
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## 7 224 AIA Anguilla The Valley North Ame… 15857
## 8 201 ATG Antigua and Barbuda Saint John’s North Ame… 93763
## 9 33 ARG Argentina Buenos Aires South Ame… 45510318
## 10 140 ARM Armenia Yerevan Asia 2780469
## # ℹ 224 more rows
## # ℹ 12 more variables: `2020 Population` <dbl>, `2015 Population` <dbl>,
## # `2010 Population` <dbl>, `2000 Population` <dbl>, `1990 Population` <dbl>,
## # `1980 Population` <dbl>, `1970 Population` <dbl>, `Area (km²)` <dbl>,
## # `Density (per km²)` <dbl>, `Growth Rate` <dbl>,
## # `World Population Percentage` <dbl>, size_category <chr>
## 3.2 The group_by() function
summarise() collapses each continent group into ONE summary row. grouped1 is passed as the first argument (the data to summarise). total_population adds up the 2022 Population of all countries in each continent; na.rm = TRUE means that any missing (NA) values are ignored during the sum. n_countries counts how many rows (countries) exist in each continent group. The result has one row per continent (6 rows in total).
# Step 1: split the population table into one group per continent
grouped1 <- df10 %>%
  group_by(Continent)

# Step 2: collapse each group to its total 2022 population and row count
summary1 <- grouped1 %>%
  summarise(
    total_population = sum(`2022 Population`, na.rm = TRUE),
    n_countries = n()
  )

# Step 3: order the continents from most to least populous and show them
result1 <- summary1 %>%
  arrange(desc(total_population))
result1
## # A tibble: 6 × 3
## Continent total_population n_countries
## <chr> <dbl> <int>
## 1 Asia 4721383274 50
## 2 Africa 1426730932 57
## 3 Europe 743147538 50
## 4 North America 600296136 40
## 5 South America 436816608 14
## 6 Oceania 45038554 23
trace() is helpful for monitoring the flow of a function: it inserts custom code that prints or inspects variables whenever the function is called, making it easier to see which inputs or intermediate values are being processed.
For example, tracing a function with trace("mean", quote(cat("Tracing input:", x, "\n"))) lets you watch the arguments passed to mean().
# Example with trace()
# Add two values.
#
# Args:
#   x: First operand.
#   y: Second operand.
#
# Returns:
#   The result of x + y.
my_sum <- function(x, y) {
  # The value of the last expression is the function's return value
  x + y
}
# Attach a tracer expression to my_sum that prints both arguments on each call
trace(
  what = "my_sum",
  tracer = quote(cat("Tracing: x =", x, "y =", y, "\n"))
)
## [1] "my_sum"
# Call the function
my_sum(5, 10)
## Tracing my_sum(5, 10) on entry
## Tracing: x = 5 y = 10
## [1] 15
## 4.2 Using recover() in debugging, with examples. On the other hand, recover() is used when an error occurs; it lets you jump into the environment where the error happened and inspect variables interactively.
# Function used for the recover() demonstration: adds its two arguments.
# Both operands are supplied by the caller, so the call completes without error.
#
# Args:
#   x: First operand.
#   z: Second operand.
#
# Returns:
#   The result of x + z.
bad_function <- function(x, z) {
  x + z
}
# Set error option to recover (will only trigger if an error occurs)
options(error = recover)
# Call the function
bad_function(2,4)
## [1] 6
## 5 Creating a function to compute the mean
# Compute basic descriptive statistics for a numeric vector.
#
# Args:
#   x: A numeric vector. Missing values (NA) are ignored by every statistic.
#
# Returns:
#   A named list with the elements mean, median, IQR, min and max.
#
# Raises:
#   An error with the message "Input must be numeric" when x is not numeric.
my_summary <- function(x) {
  if (!is.numeric(x)) {
    stop("Input must be numeric")
  }

  # The sum below drops NAs, so the divisor must count only the observed
  # values as well; dividing by length(x) would understate the mean whenever
  # x contains missing values.
  n_observed <- sum(!is.na(x))

  list(
    mean = sum(x, na.rm = TRUE) / n_observed,
    median = median(x, na.rm = TRUE),
    IQR = IQR(x, na.rm = TRUE),
    min = min(x, na.rm = TRUE),
    max = max(x, na.rm = TRUE)
  )
}
# Summarise one column from each data set: the 2022 population figures and
# the `1991` column of the CO2 data
summary_population <- my_summary(df10[["2022 Population"]])
summary_emissions <- my_summary(df20[["1991"]])
# Print results
cat("Population Summary:\n")
## Population Summary:
print(summary_population)
## $mean
## [1] 34074415
##
## $median
## [1] 5559945
##
## $IQR
## [1] 22056766
##
## $min
## [1] 510
##
## $max
## [1] 1425887337
cat("\nCO2 Emissions Summary:\n")
##
## CO2 Emissions Summary:
print(summary_emissions)
## $mean
## [1] 3.711909
##
## $median
## [1] 1.941825
##
## $IQR
## [1] 6.044712
##
## $min
## [1] 0.001158007
##
## $max
## [1] 31.7785
# Perform a two-sample t-test between the 2022 population column and the
# `1991` column of the CO2 data, keeping the result for printing below.
# The call is left exactly as written because t.test() builds the "data:" label
# of its printed output from these argument expressions.
# NOTE(review): the two samples appear to measure different quantities
# (population counts vs. CO2 figures), so the comparison is presumably only an
# illustration of the t.test() call - confirm the intended analysis.
t_test_result <- t.test(df10$`2022 Population`, df20$`1991`)
# Print t-test results
cat("\nTwo-Sample t-test Result:\n")
##
## Two-Sample t-test Result:
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: df10$`2022 Population` and df20$`1991`
## t = 3.8112, df = 233, p-value = 0.0001771
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 16459463 51689358
## sample estimates:
## mean of x mean of y
## 3.407441e+07 4.290648e+00
## 6. sapply(), vapply(), map(), mapply()
sapply() - Role: A simplified version of lapply() that tries to return results as a vector or matrix instead of a list.
When to use: When you want a clean, simplified output (vector/matrix) from applying a function to each element.
vapply() - Role: Similar to sapply(), but safer because you specify the expected output type.
When to use: When you want predictable output and want to avoid surprises (e.g., the result is always numeric).
map() - Role: A tidyverse alternative to lapply(), designed for functional programming.
When to use: When working with tidyverse pipelines (%>%) and you want consistent, readable code.
mapply() - Role: Multivariate apply: applies a function to multiple arguments in parallel.
When to use: When you want to iterate over multiple vectors simultaneously.
We use library(purrr) because the map() function and its variants, such as map_dbl() and map_chr(), are not part of base R. They come from the purrr package, which is part of the tidyverse.
# Example of data
numbers <- list(1:5, 6:10)
# 1. sapply() → Simplifies list results into a vector
cat("sapply() Example:\n")
## sapply() Example:
print(sapply(numbers, mean)) # Output: numeric vector (3, 8)
## [1] 3 8
# 2. vapply() → Safer, enforces output type
cat("\nvapply() Example:\n")
##
## vapply() Example:
print(vapply(numbers, mean, numeric(1))) # Output: numeric vector (3, 8)
## [1] 3 8
# 3. map() → Tidyverse-friendly, flexible outputs
library(purrr)
cat("\nmap() Example:\n")
##
## map() Example:
print(map(numbers, mean)) # Output: list of means
## [[1]]
## [1] 3
##
## [[2]]
## [1] 8
print(map_dbl(numbers, mean)) # Output: numeric vector (3, 8)
## [1] 3 8
# 4. mapply() → Applies a function across multiple vectors in parallel
x <- 1:5
y <- 6:10
cat("\nmapply() Example:\n")
##
## mapply() Example:
print(mapply(sum, x, y)) # Output: element-wise sums (7, 9, 11, 13, 15)
## [1] 7 9 11 13 15