Importing datasets from different sources

1.1. Importing a dataset from a DBMS

library(DBI) #provides a database interface in R, and RMySQL is the driver for connecting to MySQL.
library(RMySQL)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
# Open a connection to the local MySQL server through DBI, with RMySQL as
# the driver.
# NOTE(review): the credentials (root, empty password) are hard-coded;
# consider reading them from environment variables outside a local demo.
con <- dbConnect(MySQL(),
                 user = "root",
                 password = "",
                 dbname = "sserumanya charles",
                 host = "127.0.0.1")
# Confirm the connection is usable before querying it.
dbIsValid(con)
## [1] TRUE
# List the tables available in the connected database.
dbListTables(con)
## [1] "charles"
# Read the whole "charles" table into a data frame.
Data <- dbReadTable(con, "charles")
# The data is now in memory, so close the connection instead of leaving the
# MySQL session open for the rest of the R session.
invisible(dbDisconnect(con))
View(Data)

Assignment 1.2: Importing a dataset built into R

# Load the built-in Titanic dataset.
data("Titanic")
# Inspect its structure: a 4-dimensional table with named dimensions.
str(Titanic)
##  'table' num [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
##  - attr(*, "dimnames")=List of 4
##   ..$ Class   : chr [1:4] "1st" "2nd" "3rd" "Crew"
##   ..$ Sex     : chr [1:2] "Male" "Female"
##   ..$ Age     : chr [1:2] "Child" "Adult"
##   ..$ Survived: chr [1:2] "No" "Yes"
# For a table, summary() reports the case count and a chi-squared test of
# independence of all factors.
summary(Titanic)
## Number of cases in table: 2201 
## Number of factors: 4 
## Test for independence of all factors:
##  Chisq = 1637.4, df = 25, p-value = 0
##  Chi-squared approximation may be incorrect
# Titanic is a table, not a data frame, so variable.names() would only return
# the levels of its second dimension ("Male", "Female"). The variable names
# are the names of its dimnames.
names(dimnames(Titanic))
## [1] "Class"    "Sex"      "Age"      "Survived"

#1.3 Importing data downloaded from Kaggle

# World population per country/territory, read from a local CSV download.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# re-running on another computer.
df10 <- read_csv(file = "C:/Users/PC/Downloads/world_wide_population.csv")
## Rows: 234 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): CCA3, Country/Territory, Capital, Continent
## dbl (13): Rank, 2022 Population, 2020 Population, 2015 Population, 2010 Popu...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(df10)
# CO2 emissions per country, one column per year. The file holds two columns
# named `2019`; readr renames them to `2019...34` and `2019...35` (see the
# "New names" message below).
df20 <- read_csv(file = "C:/Users/PC/Downloads/CO2_emissions.csv")
## New names:
## Rows: 215 Columns: 35
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): Country Name, country_code, Region, Indicator Name dbl (31): 1990, 1991,
## 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `2019` -> `2019...34`
## • `2019` -> `2019...35`
View(df20)

Assignment 2: Merging datasets using 2 to 3 variables

# Column names of the population data, used to choose the merge keys.
colnames(df10)
##  [1] "Rank"                        "CCA3"                       
##  [3] "Country/Territory"           "Capital"                    
##  [5] "Continent"                   "2022 Population"            
##  [7] "2020 Population"             "2015 Population"            
##  [9] "2010 Population"             "2000 Population"            
## [11] "1990 Population"             "1980 Population"            
## [13] "1970 Population"             "Area (km²)"                 
## [15] "Density (per km²)"           "Growth Rate"                
## [17] "World Population Percentage"
# Merge the two datasets on two key pairs: Continent <-> Region and
# Country/Territory <-> Country Name. merge() keeps only rows whose keys
# match in both inputs.
# NOTE(review): confirm that df20's Region values use the same labels as
# df10's Continent values; otherwise few or no rows will match.
merged_df <- merge(
  x = df10,
  y = df20,
  by.x = c("Continent", "Country/Territory"),
  by.y = c("Region", "Country Name")
)
View(merged_df)

##3.1 The use of group_by() and %>%: use them when you want to perform calculations (mean, sum, count) separately for each group.

# Label every country by the size of its 2022 population. case_when() tests
# the conditions in order, so the 10M check only applies to rows below 100M;
# everything left over falls into the default category.
# The result is printed, not assigned, so df10 itself is left unchanged.
df10 %>%
  mutate(
    size_category = case_when(
      `2022 Population` >= 1e8 ~ "Large (100M+)",
      `2022 Population` >= 1e7 ~ "Medium (10M-100M)",
      .default = "Small (under 10M)"
    )
  )
## # A tibble: 234 × 18
##     Rank CCA3  `Country/Territory` Capital          Continent  `2022 Population`
##    <dbl> <chr> <chr>               <chr>            <chr>                  <dbl>
##  1    36 AFG   Afghanistan         Kabul            Asia                41128771
##  2   138 ALB   Albania             Tirana           Europe               2842321
##  3    34 DZA   Algeria             Algiers          Africa              44903225
##  4   213 ASM   American Samoa      Pago Pago        Oceania                44273
##  5   203 AND   Andorra             Andorra la Vella Europe                 79824
##  6    42 AGO   Angola              Luanda           Africa              35588987
##  7   224 AIA   Anguilla            The Valley       North Ame…             15857
##  8   201 ATG   Antigua and Barbuda Saint John’s     North Ame…             93763
##  9    33 ARG   Argentina           Buenos Aires     South Ame…          45510318
## 10   140 ARM   Armenia             Yerevan          Asia                 2780469
## # ℹ 224 more rows
## # ℹ 12 more variables: `2020 Population` <dbl>, `2015 Population` <dbl>,
## #   `2010 Population` <dbl>, `2000 Population` <dbl>, `1990 Population` <dbl>,
## #   `1980 Population` <dbl>, `1970 Population` <dbl>, `Area (km²)` <dbl>,
## #   `Density (per km²)` <dbl>, `Growth Rate` <dbl>,
## #   `World Population Percentage` <dbl>, size_category <chr>

##3.2 The group_by() function

group_by() first splits df10 into one group per continent (stored as grouped1). summarise() then collapses each continent group into ONE summary row; grouped1 is passed as its first argument (the data to summarise). total_population adds up the 2022 Population of all countries per continent, where na.rm = TRUE means any missing (NA) values are ignored during the sum. n_countries counts how many rows (countries) exist in each continent group. The result has one row per continent (6 rows in total).

# Step 1: group the countries by continent
grouped1 <- df10 %>% group_by(Continent)

# Step 2: collapse each continent into one row holding its total 2022
# population (missing values ignored) and its number of countries
summary1 <- grouped1 %>%
  summarise(
    total_population = sum(`2022 Population`, na.rm = TRUE),
    n_countries      = n()
  )

# Step 3: order the continents from most to least populous
result1 <- summary1 %>% arrange(desc(total_population))

result1
## # A tibble: 6 × 3
##   Continent     total_population n_countries
##   <chr>                    <dbl>       <int>
## 1 Asia                4721383274          50
## 2 Africa              1426730932          57
## 3 Europe               743147538          50
## 4 North America        600296136          40
## 5 South America        436816608          14
## 6 Oceania               45038554          23

4.1. Using trace() in debugging, with examples

trace() is helpful for monitoring the flow of a function: it inserts custom code that prints or inspects variables whenever the function is called, making it easier to see which inputs or intermediate values are being processed.

For example, tracing a function with trace("mean", quote(cat("Tracing input:", x, "\n"))) lets you watch the arguments passed to mean().

# Example with trace(): a small two-argument function to attach tracing to.
# Returns the sum of x and y.
my_sum <- function(x, y) {
  x + y
}

# Insert tracing to monitor inputs: the quoted expression runs on entry to
# every call of my_sum() and prints the arguments it received.
trace("my_sum", quote(cat("Tracing: x =", x, "y =", y, "\n")))
## [1] "my_sum"
# Call the function
my_sum(5, 10)
## Tracing my_sum(5, 10) on entry 
## Tracing: x = 5 y = 10
## [1] 15
# Remove the tracing code again so later calls to my_sum() run unmodified.
untrace("my_sum")

##4.2 Using recover() in debugging, with examples. On the other hand, recover() is used when an error occurs: it lets you jump into the environment where the error happened and inspect variables interactively.

# Function that uses both of its arguments: returns x + z.
# Both values are supplied by the caller, so the body runs without error.
bad_function <- function(x, z) {
  x + z
}

# Set error option to recover (will only trigger if an error occurs).
# options() returns the previous setting, which is kept so it can be
# restored afterwards.
old_opts <- options(error = recover)

# Call the function. This call succeeds, so recover() is not entered here.
bad_function(2,4)
## [1] 6

# Restore the previous error handler so the rest of the session is unaffected.
options(old_opts)

##5 Creating a function to compute the mean and other summary statistics

# Define a custom summary function.
#
# Summarises a numeric vector, ignoring missing values throughout.
#
# Args:
#   x: a numeric vector; may contain NA.
#
# Returns:
#   A named list with the elements mean, median, IQR, min and max.
#
# Raises:
#   An error ("Input must be numeric") when x is not numeric.
my_summary <- function(x) {
  if (!is.numeric(x)) {
    stop("Input must be numeric")
  }

  # Count only the observed values. sum(x, na.rm = TRUE) drops the NAs, so
  # dividing by length(x) would still count them and shrink the mean.
  n_observed <- sum(!is.na(x))

  list(
    mean   = sum(x, na.rm = TRUE) / n_observed,
    median = median(x, na.rm = TRUE),
    IQR    = IQR(x, na.rm = TRUE),
    min    = min(x, na.rm = TRUE),
    max    = max(x, na.rm = TRUE)
  )
}

# Summarise one numeric column from each dataset: the 2022 population from
# df10 and the 1991 column from df20.
summary_population <- my_summary(df10[["2022 Population"]])
summary_emissions  <- my_summary(df20[["1991"]])

# Show the population summary under its own heading
cat("Population Summary:\n")
## Population Summary:
print(summary_population)
## $mean
## [1] 34074415
## 
## $median
## [1] 5559945
## 
## $IQR
## [1] 22056766
## 
## $min
## [1] 510
## 
## $max
## [1] 1425887337
cat("\nCO2 Emissions Summary:\n")
## 
## CO2 Emissions Summary:
print(summary_emissions)
## $mean
## [1] 3.711909
## 
## $median
## [1] 1.941825
## 
## $IQR
## [1] 6.044712
## 
## $min
## [1] 0.001158007
## 
## $max
## [1] 31.7785
# Perform a two-sample t-test on the 2022 population column of df10 and the
# 1991 column of df20. The printed result below identifies it as a Welch
# two-sample t-test.
# NOTE(review): the two samples appear to be in different units (population
# counts vs. CO2 emission values) — confirm this comparison is intended.
t_test_result <- t.test(df10$`2022 Population`, df20$`1991`)

# Print t-test results under a heading
cat("\nTwo-Sample t-test Result:\n")
## 
## Two-Sample t-test Result:
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  df10$`2022 Population` and df20$`1991`
## t = 3.8112, df = 233, p-value = 0.0001771
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  16459463 51689358
## sample estimates:
##    mean of x    mean of y 
## 3.407441e+07 4.290648e+00

##6. sapply(), vapply(), map(), mapply()

sapply() — Role: A simplified version of lapply() that tries to return results as a vector or matrix instead of a list.

When to use: When you want a clean, simplified output (vector/matrix) from applying a function to each element.

vapply() — Role: Similar to sapply(), but safer because you specify the expected output type.

When to use: When you want predictable output and want to avoid surprises (e.g., a result that must always be numeric).

map() — Role: A tidyverse alternative to lapply(), designed for functional programming.

When to use: When working with tidyverse pipelines (%>%) and you want consistent, readable code.

mapply() — Role: Multivariate apply: applies a function to multiple arguments in parallel.

When to use: When you want to iterate over multiple vectors simultaneously.

We call library(purrr) because the map() function and its variants, such as map_dbl() and map_chr(), are not part of base R. They come from the purrr package, which is part of the tidyverse.

# Example data: a list of two integer vectors
numbers <- list(1:5, 6:10)

# 1. sapply() → applies mean() to each element and simplifies the result
#    to a vector

cat("sapply() Example:\n")
## sapply() Example:
print(sapply(X = numbers, FUN = mean))   # Output: numeric vector (3, 8)
## [1] 3 8
# 2. vapply() → same computation, but each result must be a single numeric
#    value, as declared by FUN.VALUE

cat("\nvapply() Example:\n")
## 
## vapply() Example:
print(vapply(X = numbers, FUN = mean, FUN.VALUE = numeric(1)))  # Output: numeric vector (3, 8)
## [1] 3 8
# 3. map() → purrr's counterpart to lapply(); map() always returns a list,
#    while map_dbl() returns a numeric vector

library(purrr)

cat("\nmap() Example:\n")
## 
## map() Example:
print(map(.x = numbers, .f = mean))      # Output: list of means
## [[1]]
## [1] 3
## 
## [[2]]
## [1] 8
print(map_dbl(.x = numbers, .f = mean))  # Output: numeric vector (3, 8)
## [1] 3 8
# 4. mapply() → walks x and y together, calling sum() on each pair of
#    corresponding elements
x <- 1:5
y <- 6:10
cat("\nmapply() Example:\n")
## 
## mapply() Example:
print(mapply(FUN = sum, x, y)) # Output: element-wise sums (7, 9, 11, 13, 15)
## [1]  7  9 11 13 15