library(DBI)
library(RMySQL)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Creating a connection object
# Open a connection to the local MySQL database `db_rtrial`.
# NOTE(review): the credentials are hard-coded (root user, empty password);
# consider reading them with Sys.getenv() before sharing this script.
con <- dbConnect(
  MySQL(),
  user = "root",
  password = "",
  dbname = "db_rtrial",
  host = "127.0.0.1"
)
dbIsValid(con)
## [1] TRUE
dbListTables(con)
## [1] "students"
my_data <- dbReadTable(con, "students")
# The table is now held in memory, so release the connection instead of
# leaving it open for the rest of the session.
dbDisconnect(con)
View(my_data)
# Load the built-in Titanic contingency table and inspect its structure.
data("Titanic")
str(Titanic)
## 'table' num [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
## - attr(*, "dimnames")=List of 4
## ..$ Class : chr [1:4] "1st" "2nd" "3rd" "Crew"
## ..$ Sex : chr [1:2] "Male" "Female"
## ..$ Age : chr [1:2] "Child" "Adult"
## ..$ Survived: chr [1:2] "No" "Yes"
summary(Titanic)
## Number of cases in table: 2201
## Number of factors: 4
## Test for independence of all factors:
## Chisq = 1637.4, df = 25, p-value = 0
## Chi-squared approximation may be incorrect
# Titanic is a 4-dimensional table, not a data frame: variable.names() falls
# back to colnames(), which yields the levels of the second dimension
# ("Male", "Female"). The variable names are the names of the dimnames list.
names(dimnames(Titanic))
## [1] "Class" "Sex" "Age" "Survived"
# Load the world population table; readr guesses the column types.
df1 <- read_csv(file = "~/FirstRProject/world_population.csv")
## Rows: 234 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): CCA3, Country/Territory, Capital, Continent
## dbl (13): Rank, 2022 Population, 2020 Population, 2015 Population, 2010 Popu...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the population table in the interactive data viewer.
View(df1)
# Load the CO2 emissions table. The file has two columns named `2019`, which
# readr renames to `2019...34` and `2019...35` (see the message below).
df2 <- read_csv(file = "~/FirstRProject/CO2_emission.csv")
## New names:
## Rows: 215 Columns: 35
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): Country Name, country_code, Region, Indicator Name dbl (31): 1990, 1991,
## 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `2019` -> `2019...34`
## • `2019` -> `2019...35`
# Inspect the emissions table in the interactive data viewer.
View(df2)
# Check the column names of each dataset.
colnames(df1)
## [1] "Rank" "CCA3"
## [3] "Country/Territory" "Capital"
## [5] "Continent" "2022 Population"
## [7] "2020 Population" "2015 Population"
## [9] "2010 Population" "2000 Population"
## [11] "1990 Population" "1980 Population"
## [13] "1970 Population" "Area (km²)"
## [15] "Density (per km²)" "Growth Rate"
## [17] "World Population Percentage"
# Column names of the CO2 emissions table.
colnames(df2)
## [1] "Country Name" "country_code" "Region" "Indicator Name"
## [5] "1990" "1991" "1992" "1993"
## [9] "1994" "1995" "1996" "1997"
## [13] "1998" "1999" "2000" "2001"
## [17] "2002" "2003" "2004" "2005"
## [21] "2006" "2007" "2008" "2009"
## [25] "2010" "2011" "2012" "2013"
## [29] "2014" "2015" "2016" "2017"
## [33] "2018" "2019...34" "2019...35"
# Join the population and emissions tables on continent/region and country.
# merge() defaults to an inner join, so a country is kept only when BOTH keys
# match exactly in the two tables.
# NOTE(review): confirm that `Region` in the CO2 file uses the same labels as
# `Continent` in the population file; any mismatch silently drops rows.
merged_data <- merge(
  x = df1,
  y = df2,
  by.x = c("Continent", "Country/Territory"),
  by.y = c("Region", "Country Name")
)
colnames(merged_data)
## [1] "Continent" "Country/Territory"
## [3] "Rank" "CCA3"
## [5] "Capital" "2022 Population"
## [7] "2020 Population" "2015 Population"
## [9] "2010 Population" "2000 Population"
## [11] "1990 Population" "1980 Population"
## [13] "1970 Population" "Area (km²)"
## [15] "Density (per km²)" "Growth Rate"
## [17] "World Population Percentage" "country_code"
## [19] "Indicator Name" "1990"
## [21] "1991" "1992"
## [23] "1993" "1994"
## [25] "1995" "1996"
## [27] "1997" "1998"
## [29] "1999" "2000"
## [31] "2001" "2002"
## [33] "2003" "2004"
## [35] "2005" "2006"
## [37] "2007" "2008"
## [39] "2009" "2010"
## [41] "2011" "2012"
## [43] "2013" "2014"
## [45] "2015" "2016"
## [47] "2017" "2018"
## [49] "2019...34" "2019...35"
# Open the merged table in the interactive data viewer.
View(merged_data)
In R, group_by() and the pipe operator %>% are commonly used with the dplyr package for data manipulation and analysis.
### 3.1. The Pipe Operator %>%
The %>% operator means:
“Take the output from the previous step and pass it to the next step.” We use %>% because it makes code cleaner, easier to read, and easier to understand.
# Nested (non-piped) form: read it inside-out. group_by() runs first, then
# summarize() computes one mean of `2022 Population` per continent.
summary_data <- summarize(
  group_by(df1, Continent),
  Avg_Population = mean(`2022 Population`)
)
print(summary_data)
## # A tibble: 6 × 2
## Continent Avg_Population
## <chr> <dbl>
## 1 Africa 25030367.
## 2 Asia 94427665.
## 3 Europe 14862951.
## 4 North America 15007403.
## 5 Oceania 1958198
## 6 South America 31201186.
# The same aggregation written as a pipeline: each step feeds the next one.
summary_data <- df1 %>%
  group_by(Continent) %>%
  summarize(Avg_Population = mean(`2022 Population`))
# This is easier to read because the operations flow step by step.
print(summary_data)
## # A tibble: 6 × 2
## Continent Avg_Population
## <chr> <dbl>
## 1 Africa 25030367.
## 2 Asia 94427665.
## 3 Europe 14862951.
## 4 North America 15007403.
## 5 Oceania 1958198
## 6 South America 31201186.
group_by() is used to divide data into groups and then perform calculations on each group separately. It is commonly used with the summarize(), mutate(), and filter() functions in the dplyr library.
# Finding Average Population per Continent using group_by
# Start from df1 and pass it down the pipeline.
continent_population <- df1 %>%
  # Group all countries according to their continent.
  group_by(Continent) %>%
  # Calculate the average 2022 population within each continent.
  summarize(Avg_Population = mean(`2022 Population`))
View(continent_population)
Debugging is the process of finding and fixing errors in a program. R provides debugging tools such as trace() and recover() to help programmers identify problems in functions and code execution.
trace() is used to monitor function execution, insert debugging messages, and understand how functions work.
#Example
# Return the sum of `a` and `b`.
addition <- function(a, b) {
  a + b
}
# Attach a tracer: the quoted expression is evaluated each time addition()
# is entered, before its body runs.
trace(
  addition,
  tracer = quote(print("Function is running"))
)
## [1] "addition"
addition(5, 3)
## Tracing addition(5, 3) on entry
## [1] "Function is running"
## [1] 8
# Remove the tracer so that later calls run the plain function again.
untrace(addition)
recover() is an interactive debugger that pauses execution when an error occurs and lets you inspect the call stack and variable environments at each frame.
#Example
# Divide `a` by `b` and return the quotient.
div <- function(a, b) {
  quotient <- a / b
  # print(x)
  # NOTE(review): `x` is not defined inside this function; presumably the
  # disabled line above is meant to be enabled to raise an error for the
  # recover() demonstration - confirm.
  quotient
}
# Install recover() as the error handler and remember the previous setting;
# options() returns the old values of the options it changes.
old_error_opt <- options(error = recover)
# This call succeeds, so recover() is not triggered here; it only opens the
# frame browser when an error is raised.
div(10, 2)
## [1] 5
# Leave recover mode by restoring whatever handler was active before, rather
# than resetting to NULL and discarding a handler the user may have set.
options(old_error_opt)
# Arithmetic mean of a numeric vector, computed as sum / length.
#
# Args:
#   x: A non-empty numeric (integer or double) vector.
# Returns:
#   A length-one double. Missing values propagate: if `x` contains NA the
#   result is NA.
# Raises:
#   An error if `x` is not numeric or has length zero.
data_mean <- function(x) {
  # Input validation
  if (!is.numeric(x)) {
    stop("Input must be a numeric vector")
  }
  if (length(x) == 0) {
    stop("Input vector cannot be empty")
  }
  # Sum in double precision: sum() on an integer vector returns NA (with a
  # warning) once the total exceeds .Machine$integer.max.
  total <- sum(as.numeric(x))
  total / length(x)
}
# Average 2022 population across all rows of df1.
data_mean(df1[["2022 Population"]])
## [1] 34074415
sapply() applies a function over a list or vector and automatically simplifies the result into a vector, matrix, or list.
Usage: sapply(X, FUN, ...)
#Example: Square each number
# Square every element; sapply() simplifies the results to a numeric vector.
numbers <- c(2, 4, 6, 8, 10)
result <- sapply(X = numbers, FUN = function(value) value^2)
result
## [1] 4 16 36 64 100
vapply() works like sapply(), but you declare the expected output type and length up front, and it raises an error if a result doesn’t match. This makes it much safer for production code.
Usage: vapply(X, FUN, FUN.VALUE, ...)
#Example: Get string lengths safely
# Count the characters of each fruit name.
fruits <- c("apple", "banana", "kiwi", "mango")
# nchar() returns integers, so the template is integer(1). A numeric(1)
# template would also be accepted, but only through silent integer-to-double
# coercion, which hides the actual return type.
result <- vapply(fruits, nchar, FUN.VALUE = integer(1))
result
## apple banana kiwi mango
## 5 6 4 5
map() from the purrr package works like sapply() but always returns a list, so there are no surprises. It has typed variants for strict control over the output:
library(purrr)
map(.x, .f, ...) # always returns a list
map_dbl(.x, .f, ...) # always returns numeric (double)
map_chr(.x, .f, ...) # always returns character
map_lgl(.x, .f, ...) # always returns logical (TRUE/FALSE)
map_int(.x, .f, ...) # always returns integer
map_df(.x, .f, ...) # always returns a data frame
#Example: Basic map() returns a list
library(purrr)
# Take the square root of every element; map() returns a list.
numbers <- list(4, 9, 16, 25)
result <- map(.x = numbers, .f = sqrt)
result
## [[1]]
## [1] 2
##
## [[2]]
## [1] 3
##
## [[3]]
## [1] 4
##
## [[4]]
## [1] 5
#Example: map_dbl() returns a clean numeric vector
# Same computation, collected into a double vector instead of a list.
result <- map_dbl(.x = numbers, .f = sqrt)
result
## [1] 2 3 4 5
mapply() applies a function over multiple vectors/lists simultaneously, like a parallel sapply() that feeds multiple arguments at once.
Usage: mapply(FUN, ..., MoreArgs = NULL, SIMPLIFY = TRUE)
#Example: Basic: Add two vectors element by element
# Walk over x and y in parallel, adding the paired elements.
x <- c(1, 2, 3, 4, 5)
y <- c(10, 20, 30, 40, 50)
result <- mapply(FUN = function(lhs, rhs) lhs + rhs, x, y)
result
## [1] 11 22 33 44 55