Assign 1: Importing datasets from different sources

1.1 Importing datasets from a DBMS

library(DBI)
library(RMySQL)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Open a MySQL connection to the `db_rtrial` database on the local host.
# NOTE(review): logging in as root with an empty password is presumably a
# local sandbox setup; confirm, and prefer a dedicated user whose credentials
# are read from the environment for anything shared.
con <- dbConnect(
  MySQL(),
  user = "root",
  password = "",
  dbname = "db_rtrial",
  host = "127.0.0.1"
)

# Confirm the connection is usable before querying it.
dbIsValid(con)
## [1] TRUE

# List the tables available in the database.
dbListTables(con)
## [1] "students"

# Copy the whole `students` table into a data frame.
my_data <- dbReadTable(con, "students")

# Release the connection once the data is in memory so it is not left open.
# invisible() keeps the driver's return value out of the report.
invisible(dbDisconnect(con))

View(my_data)

1.2 Importing a built-in R dataset

# Load the built-in Titanic contingency table into the workspace.
data("Titanic")

# Inspect its structure: a 4-dimensional table of counts.
str(Titanic)
##  'table' num [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
##  - attr(*, "dimnames")=List of 4
##   ..$ Class   : chr [1:4] "1st" "2nd" "3rd" "Crew"
##   ..$ Sex     : chr [1:2] "Male" "Female"
##   ..$ Age     : chr [1:2] "Child" "Adult"
##   ..$ Survived: chr [1:2] "No" "Yes"
summary(Titanic)
## Number of cases in table: 2201 
## Number of factors: 4 
## Test for independence of all factors:
##  Chisq = 1637.4, df = 25, p-value = 0
##  Chi-squared approximation may be incorrect

# Titanic is a 4-d table, not a data frame, so its variables are the names of
# its dimensions. variable.names() falls back to colnames() for such an
# object and would only return the levels of the second dimension
# ("Male", "Female"), so ask for the dimension names directly.
names(dimnames(Titanic))
## [1] "Class"    "Sex"      "Age"      "Survived"

1.3 Importing dataset downloaded from Kaggle

# World population by country: readr guesses the column types and reports
# them in the message below.
df1 <- read_csv(file = "~/FirstRProject/world_population.csv")
## Rows: 234 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): CCA3, Country/Territory, Capital, Continent
## dbl (13): Rank, 2022 Population, 2020 Population, 2015 Population, 2010 Popu...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(df1)

# CO2 emissions by country and year. The header holds `2019` twice, so readr
# repairs the names by position: they become `2019...34` and `2019...35`
# (see the "New names" message below).
df2 <- read_csv(file = "~/FirstRProject/CO2_emission.csv")
## New names:
## Rows: 215 Columns: 35
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): Country Name, country_code, Region, Indicator Name dbl (31): 1990, 1991,
## 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `2019` -> `2019...34`
## • `2019` -> `2019...35`
View(df2)

Assign 2: Merging datasets with 2 or 3 variables

# List the column names of each dataset to find candidate join keys.
names(df1)
##  [1] "Rank"                        "CCA3"                       
##  [3] "Country/Territory"           "Capital"                    
##  [5] "Continent"                   "2022 Population"            
##  [7] "2020 Population"             "2015 Population"            
##  [9] "2010 Population"             "2000 Population"            
## [11] "1990 Population"             "1980 Population"            
## [13] "1970 Population"             "Area (km²)"                 
## [15] "Density (per km²)"           "Growth Rate"                
## [17] "World Population Percentage"
names(df2)
##  [1] "Country Name"   "country_code"   "Region"         "Indicator Name"
##  [5] "1990"           "1991"           "1992"           "1993"          
##  [9] "1994"           "1995"           "1996"           "1997"          
## [13] "1998"           "1999"           "2000"           "2001"          
## [17] "2002"           "2003"           "2004"           "2005"          
## [21] "2006"           "2007"           "2008"           "2009"          
## [25] "2010"           "2011"           "2012"           "2013"          
## [29] "2014"           "2015"           "2016"           "2017"          
## [33] "2018"           "2019...34"      "2019...35"
# Inner-join the population table (x) with the CO2 table (y) on two key
# pairs: Continent == Region and Country/Territory == Country Name.
# The result keeps the key columns under their x-side names.
# NOTE(review): rows survive only when BOTH the continent/region label and the
# country spelling are identical in the two files. The values of `Region` are
# not visible here, so confirm they use the same vocabulary as `Continent`;
# `CCA3` / `country_code` look like a more reliable key. TODO confirm.
merged_data <- merge(
  x = df1,
  y = df2,
  by.x = c("Continent", "Country/Territory"),
  by.y = c("Region", "Country Name")
)

# Column names of the merged table: keys first, then the rest of x, then y.
names(merged_data)
##  [1] "Continent"                   "Country/Territory"          
##  [3] "Rank"                        "CCA3"                       
##  [5] "Capital"                     "2022 Population"            
##  [7] "2020 Population"             "2015 Population"            
##  [9] "2010 Population"             "2000 Population"            
## [11] "1990 Population"             "1980 Population"            
## [13] "1970 Population"             "Area (km²)"                 
## [15] "Density (per km²)"           "Growth Rate"                
## [17] "World Population Percentage" "country_code"               
## [19] "Indicator Name"              "1990"                       
## [21] "1991"                        "1992"                       
## [23] "1993"                        "1994"                       
## [25] "1995"                        "1996"                       
## [27] "1997"                        "1998"                       
## [29] "1999"                        "2000"                       
## [31] "2001"                        "2002"                       
## [33] "2003"                        "2004"                       
## [35] "2005"                        "2006"                       
## [37] "2007"                        "2008"                       
## [39] "2009"                        "2010"                       
## [41] "2011"                        "2012"                       
## [43] "2013"                        "2014"                       
## [45] "2015"                        "2016"                       
## [47] "2017"                        "2018"                       
## [49] "2019...34"                   "2019...35"
View(merged_data)

Assign 3: Discuss the use of group_by and %>%(operator)

In R, group_by() and %>% (the pipe operator) are commonly used with the dplyr package for data manipulation and analysis.

### 3.1 The Pipe Operator %>%

The %>% operator means:

“Take the output from the previous step and pass it to the next step.” We use %>% because it makes code cleaner, easier to read, and easier to understand.

Without %>%

# Nested-call form: the grouping is the innermost call, so the expression has
# to be read inside-out (group first, then aggregate).
summary_data <- summarize(
  .data = group_by(.data = df1, Continent),
  Avg_Population = mean(x = `2022 Population`)
)

print(summary_data)
## # A tibble: 6 × 2
##   Continent     Avg_Population
##   <chr>                  <dbl>
## 1 Africa             25030367.
## 2 Asia               94427665.
## 3 Europe             14862951.
## 4 North America      15007403.
## 5 Oceania             1958198 
## 6 South America      31201186.

With %>%

# Piped form: df1 flows through each verb in turn.
summary_data <- df1 %>%
  group_by(Continent) %>%
  summarize(Avg_Population = mean(`2022 Population`))

# This is easier to read because the operations flow step by step.
print(summary_data)
## # A tibble: 6 × 2
##   Continent     Avg_Population
##   <chr>                  <dbl>
## 1 Africa             25030367.
## 2 Asia               94427665.
## 3 Europe             14862951.
## 4 North America      15007403.
## 5 Oceania             1958198 
## 6 South America      31201186.

3.2 The group_by() Function

group_by() is used to divide data into groups and then perform calculations on each group separately. It is commonly used with the summarize(), mutate(), and filter() functions from the dplyr package.

# Average 2022 population per continent, using group_by().
#   1. df1 is passed into the pipeline.
#   2. group_by() groups the countries according to their Continent.
#   3. summarize() calculates the mean population for each continent.
continent_population <- df1 %>%
  group_by(Continent) %>%
  summarize(Avg_Population = mean(`2022 Population`))
View(continent_population)

4. Using trace() and recover() in debugging with examples

Debugging is the process of finding and fixing errors in a program. R provides debugging tools such as trace() and recover() to help programmers identify problems in functions and code execution.

4.1 using trace()

trace() is used to monitor function execution, insert debugging messages, and understand how functions work.

#Example
# Return the sum of `a` and `b`.
addition <- function(a, b) {
  a + b
}

# Attach a tracer to addition(): the quoted expression is run each time the
# function is entered, before its own body runs.
trace(addition,
      tracer = quote(print("Function is running")))
## [1] "addition"
# Calling the traced function now prints the tracing notice and the tracer's
# message ahead of the normal result.
addition(5, 3)
## Tracing addition(5, 3) on entry 
## [1] "Function is running"
## [1] 8
# Remove the tracer so later calls to addition() run untraced again.
untrace(addition)

4.2 Use of recover() in R

recover() is an interactive debugger that pauses execution when an error occurs and lets you inspect the call stack and variable environments at each frame.

#Example
# Divide `a` by `b` and return the quotient.
div <- function(a, b) {
  quotient <- a / b
  # print(x)  # presumably left here to force an error (x is not defined in
  #           # this function) for the recover() demo; TODO confirm
  quotient
}

# Install recover() as the error handler and keep the previous setting so it
# can be put back afterwards. options() returns the old values it replaces.
old_opts <- options(error = recover)

# This call succeeds, so recover() is never entered here. A failing call such
# as div(10, "a") would open recover()'s frame menu in an interactive session.
div(10, 2)
## [1] 5

# Disable recover mode by restoring the handler that was active before,
# rather than forcing it to NULL.
options(old_opts)

5. Creating a function to compute the mean

#' Arithmetic mean of a numeric vector
#'
#' @param x A non-empty numeric vector.
#' @param na.rm Should missing values be dropped before averaging? Defaults
#'   to `FALSE`, in which case any `NA` in `x` makes the result `NA`.
#' @return A single number: the sum of the values divided by their count.
#' @details Stops with an error when `x` is not numeric, when it is empty, or
#'   when `na.rm = TRUE` leaves no values to average.
data_mean <- function(x, na.rm = FALSE) {
  # Input validation
  if (!is.numeric(x)) {
    stop("Input must be a numeric vector")
  }
  if (length(x) == 0) {
    stop("Input vector cannot be empty")
  }

  if (isTRUE(na.rm)) {
    x <- x[!is.na(x)]
    # Without this guard an all-missing input would give 0 / 0 = NaN.
    if (length(x) == 0) {
      stop("Input vector contains only missing values")
    }
  }

  sum(x) / length(x)
}

# Mean 2022 population across all rows of df1.
data_mean(df1[["2022 Population"]])
## [1] 34074415

6. sapply() and vapply(), map() and mapply()

6.1 sapply(): Simplify Apply

Applies a function over a list/vector and automatically simplifies the result into a vector, matrix, or list. Usage: sapply(X, FUN, …)

# Example: square each number.
# (`numbers^2` would do the same directly; sapply() is used here only to
# illustrate the call.)
numbers <- c(2, 4, 6, 8, 10)

result <- sapply(X = numbers, FUN = function(value) value^2)
result
## [1]   4  16  36  64 100

6.2 vapply(): Safe Apply (Validated Output)

Same as sapply(), but you declare the expected output type and length up front, and it fails loudly if the result doesn’t match. Much safer for production code. Usage: vapply(X, FUN, FUN.VALUE, …)

# Example: get string lengths safely.
fruits <- c("apple", "banana", "kiwi", "mango")

# nchar() returns one integer per string, so the template is integer(1).
# Declaring the exact type lets vapply() reject anything else.
result <- vapply(fruits, nchar, FUN.VALUE = integer(1))
result
##  apple banana   kiwi  mango 
##      5      6      4      5

6.3 map(): purrr’s Consistent Apply

Like sapply(), but always returns a list, so there are no surprises. It also has typed variants for strict control:

library(purrr)
map(.x, .f, …)      # always returns a list
map_dbl(.x, .f, …)  # always returns numeric (double)
map_chr(.x, .f, …)  # always returns character
map_lgl(.x, .f, …)  # always returns logical (TRUE/FALSE)
map_int(.x, .f, …)  # always returns integer
map_df(.x, .f, …)   # always returns a data frame

# Example: basic map() returns a list, one element per input.
library(purrr)

numbers <- list(4, 9, 16, 25)

result <- map(.x = numbers, .f = sqrt)
result
## [[1]]
## [1] 2
## 
## [[2]]
## [1] 3
## 
## [[3]]
## [1] 4
## 
## [[4]]
## [1] 5

# Example: map_dbl() returns a clean numeric vector instead of a list.
result <- map_dbl(.x = numbers, .f = sqrt)
result
## [1] 2 3 4 5

6.4 mapply(): Multivariate Apply

Applies a function over multiple vectors/lists simultaneously, like a parallel sapply() that feeds multiple arguments at once. Usage: mapply(FUN, …, MoreArgs = NULL, SIMPLIFY = TRUE)

# Example: add two vectors element by element.
# mapply() walks x and y in step, passing one element of each per call.
x <- c(1, 2, 3, 4, 5)
y <- c(10, 20, 30, 40, 50)

result <- mapply(FUN = function(lhs, rhs) lhs + rhs, x, y)
result
## [1] 11 22 33 44 55