library(pdftools)
## Using poppler version 26.04.0
library(readxl)
library(DBI)
library(RSQLite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
# Open the SQLite database file and print every row of the `fishes` table.
con <- dbConnect(drv = RSQLite::SQLite(), dbname = "R-database.db")
dbGetQuery(conn = con, statement = "select * from fishes")
## id fish_name habitat average_weight_kg lifespan_years diet
## 1 1 Tilapia Freshwater 171.0 20 Herbivore
## 2 2 Salmon Brackish 213.0 19 Omnivore
## 3 3 Tuna Brackish 275.0 24 Omnivore
## 4 4 Goldfish Brackish 37.5 26 Carnivore
## 5 5 Catfish Freshwater 243.5 12 Omnivore
## 6 6 Clownfish Marine 124.6 37 Omnivore
## 7 7 Rainbow Trout Freshwater 225.9 33 Planktivore
## 8 8 Sardine Freshwater 28.6 24 Herbivore
## 9 9 Piranha Freshwater 281.7 13 Carnivore
## 10 10 Blue Tang Marine 194.2 25 Carnivore
## 11 11 Mackerel Marine 246.9 38 Planktivore
## 12 12 Swordfish Marine 223.6 24 Carnivore
## 13 13 Snapper Marine 291.0 27 Omnivore
## 14 14 Carp Marine 7.5 9 Omnivore
## 15 15 Barracuda Freshwater 161.2 23 Herbivore
## 16 16 Guppy Marine 136.7 14 Planktivore
## 17 17 Bass Freshwater 58.4 33 Planktivore
## 18 18 Shark Freshwater 84.2 5 Herbivore
## 19 19 Eel Marine 230.9 21 Carnivore
## 20 20 Anchovy Brackish 212.6 29 Herbivore
dbDisconnect(con)
# Load the fish measurements from CSV and preview the first ten rows.
fishes <- read.csv(file = "Fish.csv")
head(fishes, n = 10)
## Species Weight Length1 Length2 Length3 Height Width
## 1 Bream 242 23.2 25.4 30.0 11.5200 4.0200
## 2 Bream 290 24.0 26.3 31.2 12.4800 4.3056
## 3 Bream 340 23.9 26.5 31.1 12.3778 4.6961
## 4 Bream 363 26.3 29.0 33.5 12.7300 4.4555
## 5 Bream 430 26.5 29.0 34.0 12.4440 5.1340
## 6 Bream 450 26.8 29.7 34.7 13.6024 4.9274
## 7 Bream 500 26.8 29.7 34.5 14.1795 5.2785
## 8 Bream 390 27.6 30.0 35.0 12.6700 4.6900
## 9 Bream 450 27.6 30.0 35.1 14.0049 4.8438
## 10 Bream 500 28.5 30.7 36.2 14.2266 4.9594
# Load the same measurements from the Excel workbook (returned as a tibble)
# and preview the first ten rows.
fish_excel <- read_excel(path = "fish_excel.xlsx")
head(fish_excel, n = 10)
## # A tibble: 10 × 7
## Species Weight Length1 Length2 Length3 Height Width
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Bream 242 23.2 25.4 30 11.5 4.02
## 2 Bream 290 24 26.3 31.2 12.5 4.31
## 3 Bream 340 23.9 26.5 31.1 12.4 4.70
## 4 Bream 363 26.3 29 33.5 12.7 4.46
## 5 Bream 430 26.5 29 34 12.4 5.13
## 6 Bream 450 26.8 29.7 34.7 13.6 4.93
## 7 Bream 500 26.8 29.7 34.5 14.2 5.28
## 8 Bream 390 27.6 30 35 12.7 4.69
## 9 Bream 450 27.6 30 35.1 14.0 4.84
## 10 Bream 500 28.5 30.7 36.2 14.2 4.96
# Extract the text of fish_pdf.pdf with pdftools.
fishes_pdf <- pdf_text("fish_pdf.pdf")
# NOTE(review): this previews `fishes` (the data frame read from Fish.csv),
# not `fishes_pdf` -- confirm which object was meant to be shown here.
head(fishes, 20)
## Species Weight Length1 Length2 Length3 Height Width
## 1 Bream 242 23.2 25.4 30.0 11.5200 4.0200
## 2 Bream 290 24.0 26.3 31.2 12.4800 4.3056
## 3 Bream 340 23.9 26.5 31.1 12.3778 4.6961
## 4 Bream 363 26.3 29.0 33.5 12.7300 4.4555
## 5 Bream 430 26.5 29.0 34.0 12.4440 5.1340
## 6 Bream 450 26.8 29.7 34.7 13.6024 4.9274
## 7 Bream 500 26.8 29.7 34.5 14.1795 5.2785
## 8 Bream 390 27.6 30.0 35.0 12.6700 4.6900
## 9 Bream 450 27.6 30.0 35.1 14.0049 4.8438
## 10 Bream 500 28.5 30.7 36.2 14.2266 4.9594
## 11 Bream 475 28.4 31.0 36.2 14.2628 5.1042
## 12 Bream 500 28.7 31.0 36.2 14.3714 4.8146
## 13 Bream 500 29.1 31.5 36.4 13.7592 4.3680
## 14 Bream 340 29.5 32.0 37.3 13.9129 5.0728
## 15 Bream 600 29.4 32.0 37.2 14.9544 5.1708
## 16 Bream 600 29.4 32.0 37.2 15.4380 5.5800
## 17 Bream 700 30.4 33.0 38.3 14.8604 5.2854
## 18 Bream 700 30.4 33.0 38.5 14.9380 5.1975
## 19 Bream 610 30.9 33.5 38.6 15.6330 5.1338
## 20 Bream 650 31.0 33.5 38.7 14.4738 5.7276
# Load GDP per country and year, then preview the first rows.
gdp <- read.csv("countries_gdp.csv")
head(gdp)
## country year gdp_billion_usd
## 1 United States 2018 635.02
## 2 China 2020 6129.85
## 3 Germany 2019 18414.42
## 4 Japan 2022 2182.60
## 5 India 2021 804.25
## 6 Brazil 2018 5473.76
# Load population per country and year, then preview the first rows.
population <- read.csv("countries_population.csv")
head(population)
## country year population
## 1 United States 2020 1350993687
## 2 China 2022 777605304
## 3 Germany 2022 413936598
## 4 India 2018 99407116
## 5 Brazil 2019 622455910
## 6 Canada 2018 500914620
# There are different ways to merge (join) two data frames:
#merge
#inner_join
#left_join
#right_join
#full_join
merge(gdp, population, by = c("country", "year"))
## country year gdp_billion_usd population
## 1 Australia 2020 6954.0 1468907436
## 2 Rwanda 2019 17456.5 1440190226
inner_join(gdp, population, by = c("country", "year"))
## country year gdp_billion_usd population
## 1 Rwanda 2019 17456.5 1440190226
## 2 Australia 2020 6954.0 1468907436
merge(gdp, population, by = c("country", "year"), all.x = TRUE)
## country year gdp_billion_usd population
## 1 Argentina 2018 18246.00 NA
## 2 Australia 2020 6954.00 1468907436
## 3 Brazil 2018 5473.76 NA
## 4 Canada 2022 15054.45 NA
## 5 China 2020 6129.85 NA
## 6 Egypt 2022 3129.41 NA
## 7 France 2022 4978.95 NA
## 8 Germany 2019 18414.42 NA
## 9 India 2021 804.25 NA
## 10 Indonesia 2020 15097.11 NA
## 11 Italy 2020 2564.23 NA
## 12 Japan 2022 2182.60 NA
## 13 Kenya 2018 18972.60 NA
## 14 Mexico 2019 23930.75 NA
## 15 Nigeria 2021 14735.75 NA
## 16 Norway 2021 1979.22 NA
## 17 Rwanda 2019 17456.50 1440190226
## 18 South Africa 2022 10493.80 NA
## 19 Spain 2021 2426.94 NA
## 20 United States 2018 635.02 NA
left_join(gdp, population, by = c("country", "year"))
## country year gdp_billion_usd population
## 1 United States 2018 635.02 NA
## 2 China 2020 6129.85 NA
## 3 Germany 2019 18414.42 NA
## 4 Japan 2022 2182.60 NA
## 5 India 2021 804.25 NA
## 6 Brazil 2018 5473.76 NA
## 7 Canada 2022 15054.45 NA
## 8 France 2022 4978.95 NA
## 9 South Africa 2022 10493.80 NA
## 10 Nigeria 2021 14735.75 NA
## 11 Kenya 2018 18972.60 NA
## 12 Rwanda 2019 17456.50 1440190226
## 13 Australia 2020 6954.00 1468907436
## 14 Mexico 2019 23930.75 NA
## 15 Italy 2020 2564.23 NA
## 16 Spain 2021 2426.94 NA
## 17 Indonesia 2020 15097.11 NA
## 18 Argentina 2018 18246.00 NA
## 19 Egypt 2022 3129.41 NA
## 20 Norway 2021 1979.22 NA
merge(gdp, population, by = c("country", "year"), all.y = TRUE)
## country year gdp_billion_usd population
## 1 Australia 2020 6954.0 1468907436
## 2 Bangladesh 2022 NA 472619987
## 3 Brazil 2019 NA 622455910
## 4 Canada 2018 NA 500914620
## 5 China 2022 NA 777605304
## 6 France 2018 NA 817314859
## 7 Germany 2022 NA 413936598
## 8 India 2018 NA 99407116
## 9 Indonesia 2021 NA 815874363
## 10 Italy 2019 NA 1148056643
## 11 Kenya 2020 NA 763938025
## 12 Mexico 2018 NA 1309098873
## 13 Nigeria 2020 NA 350297012
## 14 Pakistan 2020 NA 1375389012
## 15 Rwanda 2019 17456.5 1440190226
## 16 South Africa 2020 NA 974691209
## 17 Spain 2019 NA 351904183
## 18 Sweden 2019 NA 69934737
## 19 Uganda 2020 NA 121125830
## 20 United States 2020 NA 1350993687
right_join(gdp, population, by = c("country", "year"))
## country year gdp_billion_usd population
## 1 Rwanda 2019 17456.5 1440190226
## 2 Australia 2020 6954.0 1468907436
## 3 United States 2020 NA 1350993687
## 4 China 2022 NA 777605304
## 5 Germany 2022 NA 413936598
## 6 India 2018 NA 99407116
## 7 Brazil 2019 NA 622455910
## 8 Canada 2018 NA 500914620
## 9 France 2018 NA 817314859
## 10 South Africa 2020 NA 974691209
## 11 Nigeria 2020 NA 350297012
## 12 Kenya 2020 NA 763938025
## 13 Mexico 2018 NA 1309098873
## 14 Italy 2019 NA 1148056643
## 15 Spain 2019 NA 351904183
## 16 Indonesia 2021 NA 815874363
## 17 Pakistan 2020 NA 1375389012
## 18 Bangladesh 2022 NA 472619987
## 19 Uganda 2020 NA 121125830
## 20 Sweden 2019 NA 69934737
merge(gdp, population, by = c("country", "year"), all = TRUE)
## country year gdp_billion_usd population
## 1 Argentina 2018 18246.00 NA
## 2 Australia 2020 6954.00 1468907436
## 3 Bangladesh 2022 NA 472619987
## 4 Brazil 2018 5473.76 NA
## 5 Brazil 2019 NA 622455910
## 6 Canada 2018 NA 500914620
## 7 Canada 2022 15054.45 NA
## 8 China 2020 6129.85 NA
## 9 China 2022 NA 777605304
## 10 Egypt 2022 3129.41 NA
## 11 France 2018 NA 817314859
## 12 France 2022 4978.95 NA
## 13 Germany 2019 18414.42 NA
## 14 Germany 2022 NA 413936598
## 15 India 2018 NA 99407116
## 16 India 2021 804.25 NA
## 17 Indonesia 2020 15097.11 NA
## 18 Indonesia 2021 NA 815874363
## 19 Italy 2019 NA 1148056643
## 20 Italy 2020 2564.23 NA
## 21 Japan 2022 2182.60 NA
## 22 Kenya 2018 18972.60 NA
## 23 Kenya 2020 NA 763938025
## 24 Mexico 2018 NA 1309098873
## 25 Mexico 2019 23930.75 NA
## 26 Nigeria 2020 NA 350297012
## 27 Nigeria 2021 14735.75 NA
## 28 Norway 2021 1979.22 NA
## 29 Pakistan 2020 NA 1375389012
## 30 Rwanda 2019 17456.50 1440190226
## 31 South Africa 2020 NA 974691209
## 32 South Africa 2022 10493.80 NA
## 33 Spain 2019 NA 351904183
## 34 Spain 2021 2426.94 NA
## 35 Sweden 2019 NA 69934737
## 36 Uganda 2020 NA 121125830
## 37 United States 2018 635.02 NA
## 38 United States 2020 NA 1350993687
full_join(gdp, population, by = c("country", "year"))
## country year gdp_billion_usd population
## 1 United States 2018 635.02 NA
## 2 China 2020 6129.85 NA
## 3 Germany 2019 18414.42 NA
## 4 Japan 2022 2182.60 NA
## 5 India 2021 804.25 NA
## 6 Brazil 2018 5473.76 NA
## 7 Canada 2022 15054.45 NA
## 8 France 2022 4978.95 NA
## 9 South Africa 2022 10493.80 NA
## 10 Nigeria 2021 14735.75 NA
## 11 Kenya 2018 18972.60 NA
## 12 Rwanda 2019 17456.50 1440190226
## 13 Australia 2020 6954.00 1468907436
## 14 Mexico 2019 23930.75 NA
## 15 Italy 2020 2564.23 NA
## 16 Spain 2021 2426.94 NA
## 17 Indonesia 2020 15097.11 NA
## 18 Argentina 2018 18246.00 NA
## 19 Egypt 2022 3129.41 NA
## 20 Norway 2021 1979.22 NA
## 21 United States 2020 NA 1350993687
## 22 China 2022 NA 777605304
## 23 Germany 2022 NA 413936598
## 24 India 2018 NA 99407116
## 25 Brazil 2019 NA 622455910
## 26 Canada 2018 NA 500914620
## 27 France 2018 NA 817314859
## 28 South Africa 2020 NA 974691209
## 29 Nigeria 2020 NA 350297012
## 30 Kenya 2020 NA 763938025
## 31 Mexico 2018 NA 1309098873
## 32 Italy 2019 NA 1148056643
## 33 Spain 2019 NA 351904183
## 34 Indonesia 2021 NA 815874363
## 35 Pakistan 2020 NA 1375389012
## 36 Bangladesh 2022 NA 472619987
## 37 Uganda 2020 NA 121125830
## 38 Sweden 2019 NA 69934737
group_by() is used to split data into groups so you can perform calculations separately for each group. %>% is called the pipe operator in R. The pipe passes the result from one step into the next step.
# Toy revenue data: one row per country and year.
sales <- data.frame(
  country = c("Rwanda", "Rwanda", "Kenya", "Kenya", "Uganda"),
  year    = c(2020, 2021, 2020, 2021, 2021),
  revenue = c(100, 150, 200, 250, 300)
)
# Per-country revenue summary: total, smallest, largest and average revenue.
sales %>%
  group_by(country) %>%
  summarise(
    total_revenue = sum(revenue),
    min_revenue = min(revenue),
    max_revenue = max(revenue),
    mean_revenue = mean(revenue)
  )
## # A tibble: 3 × 5
## country total_revenue min_revenue max_revenue mean_revenue
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Kenya 450 200 250 225
## 2 Rwanda 250 100 150 125
## 3 Uganda 300 300 300 300
They help you:
These are built into base R.
trace() in R is a debugging tool used to temporarily
insert code into an existing function without modifying the original
source code.
It is useful for:
Think of trace() as:
"Run extra code whenever this function executes."
trace(function_name)
Example:
trace(mean)
Now whenever mean() runs, R enters tracing mode.
untrace(mean)
trace(mean)
mean(c(1,2,3))
## trace: mean(c(1, 2, 3))
## [1] 2
trace(
mean,
tracer = quote(print("mean() was called"))
)
## Tracing function "mean" in package "base"
## [1] "mean"
Now run:
mean(c(1,2,3))
## Tracing mean(c(1, 2, 3)) on entry
## [1] "mean() was called"
## [1] 2
tracer
tracer contains the code that R should execute when the
function runs.
Usually written with:
quote(...)
because R needs unevaluated code.
trace(
mean,
tracer = quote(print(x))
)
## Tracing function "mean" in package "base"
## [1] "mean"
mean(c(10,20,30))
## Tracing mean(c(10, 20, 30)) on entry
## [1] 10 20 30
## [1] 20
# Total cost of an order line.
#
# price:    unit price.
# quantity: number of units.
# Returns price multiplied by quantity.
calculate_total <- function(price, quantity) {
  total <- price * quantity
  total
}
Trace it:
trace(
calculate_total,
tracer = quote({
print(price)
print(quantity)
})
)
## [1] "calculate_total"
Run:
calculate_total(100, 5)
## Tracing calculate_total(100, 5) on entry
## [1] 100
## [1] 5
## [1] 500
trace(
calculate_total,
tracer = quote(print("Before multiplication")),
at = 1
)
## [1] "calculate_total"
at Parameter

| Value | Meaning |
|---|---|
| at = 1 | Beginning of function |
| at = 2 | Second expression |
| etc. | Specific expression position |
You can inspect package functions without editing the package source code.
trace(
my_function,
tracer = quote(cat("Function executed\n"))
)
trace(
my_function,
tracer = quote(print(my_variable))
)
trace() and debug()

| trace() | debug() |
|---|---|
| Inject custom code | Step through interactively |
| Good for logging | Good for detailed debugging |
| Non-interactive | Interactive |
trace() and traceback()

| Tool | Purpose |
|---|---|
| trace() | Add debugging behavior |
| traceback() | Show error call stack |
trace() and recover()

| Tool | Purpose |
|---|---|
| trace() | Observe function execution |
| recover() | Enter failed function environments |
# Mean of `data` after dropping missing values with na.omit().
#
# data: the values to average.
# Returns the result of mean() applied to the NA-free data.
process_data <- function(data) {
  mean(na.omit(data))
}
Add trace:
trace(
process_data,
tracer = quote(print(data))
)
## [1] "process_data"
Now every call prints the input data.
trace() modifies function behavior temporarily during
the session.
It does not permanently change the original function source code.
Always remove traces after debugging:
untrace(process_data)
recover() lets you enter the environment where the error happened and inspect variables interactively.
Enable Recover Mode
options(error = recover)
Now whenever an error occurs, R enters debugging mode.
Example
# Natural logarithm of the ratio x / y.
#
# x: numerator.
# y: denominator.
# Returns log(x / y); the division fails with an error when an argument is
# not numeric.
divide <- function(x, y) {
  log(x / y)
}
# Deliberately failing example used to demonstrate recover(): it calls
# divide() with a character second argument, so the call stops with
# "non-numeric argument to binary operator".
calculate <- function() {
divide(10, "a")
}
calculate()
## Error in x/y: non-numeric argument to binary operator
What Happens
Instead of stopping normally, R shows:
Enter a frame number, or 0 to exit
Example:
1: calculate()
2: divide(10, "a")
You can type:
2
to inspect the divide() function environment.
Inside Recover Mode
You can inspect variables:
x
y
Check objects:
ls()
## [1] "calculate" "calculate_total" "con" "divide"
## [5] "fish_excel" "fishes" "fishes_pdf" "gdp"
## [9] "population" "process_data" "sales"
Run expressions:
class(y)
Exit Recover
Type:
c
## function (...) .Primitive("c")
or:
0
## [1] 0
Disable Recover Mode
options(error = NULL)
lapply(), sapply(), vapply()
These functions are used for iteration and functional programming in R.
lapply() — Returns a List
numbers <- list(1, 2, 3, 4)
result <- lapply(numbers, function(x) x * 2)
print(result)
## [[1]]
## [1] 2
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 6
##
## [[4]]
## [1] 8
sapply() — Simplifies Output
sapply() simplifies the result into vectors, matrices,
or arrays whenever possible.
numbers <- list(1, 2, 3, 4)
result <- sapply(numbers, function(x) x * 2)
print(result)
## [1] 2 4 6 8
numbers <- list(1, 2, 3)
result <- sapply(numbers, function(x) c(x, x^2))
print(result)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 1 4 9
vapply() — Safer Version of sapply()
vapply() requires specifying the expected output
type.
vapply(X, FUN, FUN.VALUE)
numbers <- list(1, 2, 3, 4)
result <- vapply(
numbers,
function(x) x * 2,
numeric(1)
)
print(result)
## [1] 2 4 6 8
words <- list("cat", "dog", "fish")
result <- vapply(
words,
toupper,
character(1)
)
print(result)
## [1] "CAT" "DOG" "FISH"
“CAT” “DOG” “FISH”
sapply() and vapply()

| sapply() | vapply() |
|---|---|
| Automatically guesses output type | Requires fixed output type |
| Easier to write | Safer and more predictable |
| May return unexpected structures | Consistent output |
without using R built-in statistical functions such as:
- mean()
- median()
- quantile()
- Mode()

We will use:
- sapply()
- lapply()
- vapply()

and core R logic.
numbers <- c(10, 20, 30, 40, 50, 20, 30, 30)
\[ Mean = \frac{\sum x}{n} \]
# Arithmetic mean computed without calling mean().
#
# x: numeric vector (a list of single numbers also works).
# Returns the sum of the elements divided by length(x). Empty input gives
# NaN (0 / 0). Missing values are not removed, so an NA in x yields NA.
custom_mean <- function(x) {
  # Fold the elements into a running sum starting at 0. This adds them in the
  # same left-to-right order as before, but without mutating an outer
  # variable via `<<-` or building a throw-away sapply() result.
  total <- Reduce(`+`, x, 0)
  total / length(x)
}
custom_mean(numbers)
## [1] 28.75
# Median computed without calling median().
#
# x: numeric vector. sort() drops missing values, so NAs in x are ignored.
# Returns the middle value of the sorted data (the average of the two middle
# values when the count is even). When no values remain after sorting, a
# single NA of the input's type is returned instead of a zero-length vector.
custom_median <- function(x) {
  sorted <- sort(x)
  n <- length(sorted)
  if (n == 0L) {
    # Indexing with NA yields one NA of the same type as `sorted`.
    return(sorted[NA_integer_])
  }
  half <- n %/% 2L
  if (n %% 2L == 0L) {
    (sorted[half] + sorted[half + 1L]) / 2
  } else {
    # For odd n, (n + 1) / 2 equals n %/% 2 + 1.
    sorted[half + 1L]
  }
}
custom_median(numbers)
## [1] 30
The most frequently occurring value.
# Most frequently occurring value in x.
#
# x: vector of values comparable with `==`.
# Returns the value with the highest count; when several values tie, the one
# that appears first in x wins (which.max() picks the first maximum). Empty
# input returns a zero-length vector. Missing values are never counted, so
# NA is only returned when x contains nothing else.
custom_mode <- function(x) {
  unique_values <- unique(x)
  # One vectorised comparison per distinct value replaces the nested
  # sapply() calls that incremented a counter through `<<-`; na.rm = TRUE
  # keeps an NA in x from aborting the count.
  frequencies <- vapply(
    unique_values,
    function(value) sum(x == value, na.rm = TRUE),
    integer(1)
  )
  unique_values[which.max(frequencies)]
}
custom_mode(numbers)
## [1] 30
Quantiles divide ordered data into intervals.
Example: - 0.25 = first quartile - 0.50 = median - 0.75 = third quartile
\[ Position = (n - 1)p + 1 \]
where: - \(n\) = number of observations - \(p\) = probability
# Quantiles by linear interpolation between order statistics, using
# position = (n - 1) * p + 1 on the sorted data.
#
# x:     numeric vector. sort() drops missing values, so NAs are ignored.
# probs: probabilities in [0, 1], with no missing values.
# Returns a numeric vector with one quantile per element of probs
# (zero-length when probs is empty). If x has no values left after sorting,
# every quantile is NA. Stops with an error when probs is missing or outside
# [0, 1].
custom_quantile <- function(x, probs) {
  if (anyNA(probs) || any(probs < 0 | probs > 1)) {
    stop("'probs' must be between 0 and 1 with no missing values",
         call. = FALSE)
  }
  sorted <- sort(x)
  n <- length(sorted)
  if (n == 0L) {
    return(rep(NA_real_, length(probs)))
  }
  # vapply() pins the result to one number per probability, so the return
  # type no longer depends on the input the way sapply()'s did.
  vapply(
    probs,
    function(p) {
      position <- (n - 1) * p + 1
      lower <- floor(position)
      upper <- ceiling(position)
      if (lower == upper) {
        # The position falls exactly on an observation.
        sorted[lower]
      } else {
        lower_value <- sorted[lower]
        upper_value <- sorted[upper]
        lower_value + (position - lower) * (upper_value - lower_value)
      }
    },
    numeric(1)
  )
}
custom_quantile(
numbers,
probs = c(0.25, 0.5, 0.75)
)
## [1] 20.0 30.0 32.5
vapply() for Safer Computation
Example:
# Square every element of x, one element at a time.
#
# x: vector of numbers.
# Returns a numeric vector of the squares; vapply() raises an error if any
# element does not produce exactly one number.
safe_square <- function(x) {
  square_one <- function(value) value^2
  vapply(X = x, FUN = square_one, FUN.VALUE = numeric(1))
}
safe_square(c(1,2,3,4))
## [1] 1 4 9 16
lapply()
lapply() always returns a list.
lapply(
numbers,
function(x) x * 2
)
## [[1]]
## [1] 20
##
## [[2]]
## [1] 40
##
## [[3]]
## [1] 60
##
## [[4]]
## [1] 80
##
## [[5]]
## [1] 100
##
## [[6]]
## [1] 40
##
## [[7]]
## [1] 60
##
## [[8]]
## [1] 60
| Function | Purpose | Returns |
|---|---|---|
| lapply() | Iteration | List |
| sapply() | Simplified iteration | Vector/Matrix |
| vapply() | Type-safe iteration | Fixed type |
| custom_mean() | Average | Numeric |
| custom_median() | Middle value | Numeric |
| custom_mode() | Most frequent value | Numeric |
| custom_quantile() | Quartiles/percentiles | Vector |
numbers <- c(10, 20, 30, 40, 50, 20, 30, 30)
custom_mean(numbers)
## [1] 28.75
custom_median(numbers)
## [1] 30
custom_mode(numbers)
## [1] 30
custom_quantile(
numbers,
probs = c(0.25, 0.5, 0.75)
)
## [1] 20.0 30.0 32.5