library(pdftools)
## Using poppler version 26.01.0
library(readxl)
library(DBI)
library(RSQLite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
# Load the car listings from the CSV file and preview the first ten rows.
data <- read.csv(file = "usa_cars.csv")
head(data, n = 10)
## X price brand model year title_status mileage color
## 1 0 6300 toyota cruiser 2008 clean vehicle 274117 black
## 2 1 2899 ford se 2011 clean vehicle 190552 silver
## 3 2 5350 dodge mpv 2018 clean vehicle 39590 silver
## 4 3 25000 ford door 2014 clean vehicle 64146 blue
## 5 4 27700 chevrolet 1500 2018 clean vehicle 6654 red
## 6 5 5700 dodge mpv 2018 clean vehicle 45561 white
## 7 6 7300 chevrolet pk 2010 clean vehicle 149050 black
## 8 7 13350 gmc door 2017 clean vehicle 23525 gray
## 9 8 14600 chevrolet malibu 2018 clean vehicle 9371 silver
## 10 9 5250 ford mpv 2017 clean vehicle 63418 black
## vin lot state country condition
## 1 jtezu11f88k007763 159348797 new jersey usa 10 days left
## 2 2fmdk3gc4bbb02217 166951262 tennessee usa 6 days left
## 3 3c4pdcgg5jt346413 167655728 georgia usa 2 days left
## 4 1ftfw1et4efc23745 167753855 virginia usa 22 hours left
## 5 3gcpcrec2jg473991 167763266 florida usa 22 hours left
## 6 2c4rdgeg9jr237989 167655771 texas usa 2 days left
## 7 1gcsksea1az121133 167753872 georgia usa 22 hours left
## 8 1gks2gkc3hr326762 167692494 california usa 20 hours left
## 9 1g1zd5st5jf191860 167763267 florida usa 22 hours left
## 10 2fmpk3j92hbc12542 167656121 texas usa 2 days left
# Load the listings from the Excel workbook with readxl and preview ten rows.
data <- read_excel(path = "usa_cars.xls")
## New names:
## • `` -> `...1`
head(data, n = 10)
## # A tibble: 10 × 13
## ...1 price brand model year title_status mileage color vin lot state
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 0 6300 toyota crui… 2008 clean vehic… 274117 black jtez… 1.59e8 new …
## 2 1 2899 ford se 2011 clean vehic… 190552 silv… 2fmd… 1.67e8 tenn…
## 3 2 5350 dodge mpv 2018 clean vehic… 39590 silv… 3c4p… 1.68e8 geor…
## 4 3 25000 ford door 2014 clean vehic… 64146 blue 1ftf… 1.68e8 virg…
## 5 4 27700 chevro… 1500 2018 clean vehic… 6654 red 3gcp… 1.68e8 flor…
## 6 5 5700 dodge mpv 2018 clean vehic… 45561 white 2c4r… 1.68e8 texas
## 7 6 7300 chevro… pk 2010 clean vehic… 149050 black 1gcs… 1.68e8 geor…
## 8 7 13350 gmc door 2017 clean vehic… 23525 gray 1gks… 1.68e8 cali…
## 9 8 14600 chevro… mali… 2018 clean vehic… 9371 silv… 1g1z… 1.68e8 flor…
## 10 9 5250 ford mpv 2017 clean vehic… 63418 black 2fmp… 1.68e8 texas
## # ℹ 2 more variables: country <chr>, condition <chr>
# Read the whole usa_cars table from the SQLite database file.
# The query runs inside tryCatch(..., finally = ) so the connection is closed
# even when the query fails (for example, when the table does not exist).
con <- dbConnect(RSQLite::SQLite(), "usa-cars.db")
data <- tryCatch(
  dbGetQuery(con, "select * from usa_cars"),
  finally = dbDisconnect(con)
)
head(data, n = 10)
## price_brand_model_year_title_status_mileage_color_vin_lot_state_country_condition
## 1 6300,toyota,cruiser,2008,clean vehicle,274117,black, jtezu11f88k007763,159348797,new jersey, usa,10 days left
## 2 2899,ford,se,2011,clean vehicle,190552,silver, 2fmdk3gc4bbb02217,166951262,tennessee, usa,6 days left
## 3 5350,dodge,mpv,2018,clean vehicle,39590,silver, 3c4pdcgg5jt346413,167655728,georgia, usa,2 days left
## 4 25000,ford,door,2014,clean vehicle,64146,blue, 1ftfw1et4efc23745,167753855,virginia, usa,22 hours left
## 5 27700,chevrolet,1500,2018,clean vehicle,6654,red, 3gcpcrec2jg473991,167763266,florida, usa,22 hours left
## 6 5700,dodge,mpv,2018,clean vehicle,45561,white, 2c4rdgeg9jr237989,167655771,texas, usa,2 days left
## 7 7300,chevrolet,pk,2010,clean vehicle,149050,black, 1gcsksea1az121133,167753872,georgia, usa,22 hours left
## 8 13350,gmc,door,2017,clean vehicle,23525,gray, 1gks2gkc3hr326762,167692494,california, usa,20 hours left
## 9 14600,chevrolet,malibu,2018,clean vehicle,9371,silver, 1g1zd5st5jf191860,167763267,florida, usa,22 hours left
## 10 5250,ford,mpv,2017,clean vehicle,63418,black, 2fmpk3j92hbc12542,167656121,texas, usa,2 days left
# Load the GDP table and preview its first rows.
# Uses `<-` for assignment, like the rest of the file.
gdp <- read.csv("gdp.csv")
head(gdp)
## Country.Name Country.Code Year Value
## 1 Afghanistan AFG 2000 3521418060
## 2 Afghanistan AFG 2001 2813571754
## 3 Afghanistan AFG 2002 3825701439
## 4 Afghanistan AFG 2003 4520946819
## 5 Afghanistan AFG 2004 5224896719
## 6 Afghanistan AFG 2005 6203256539
# Load the population table and preview its first rows.
# Uses `<-` for assignment, like the rest of the file.
population <- read.csv("population.csv")
head(population)
## Country.Name Country.Code Year Value
## 1 Aruba ABW 1960 54922
## 2 Aruba ABW 1961 55578
## 3 Aruba ABW 1962 56320
## 4 Aruba ABW 1963 57002
## 5 Aruba ABW 1964 57619
## 6 Aruba ABW 1965 58190
# Inner join with base R: merge() keeps only the Country.Code/Year pairs
# that occur in both tables.
data <- merge(x = gdp, y = population, by = c("Country.Code", "Year"))
head(data, n = 10)
## Country.Code Year Country.Name.x Value.x Country.Name.y Value.y
## 1 ABW 1986 Aruba 405586592 Aruba 59931
## 2 ABW 1987 Aruba 487709497 Aruba 59159
## 3 ABW 1988 Aruba 596648045 Aruba 59331
## 4 ABW 1989 Aruba 695530726 Aruba 60443
## 5 ABW 1990 Aruba 764804469 Aruba 62753
## 6 ABW 1991 Aruba 872067039 Aruba 65896
## 7 ABW 1992 Aruba 958659218 Aruba 69005
## 8 ABW 1993 Aruba 1083240223 Aruba 73685
## 9 ABW 1994 Aruba 1245810056 Aruba 77595
## 10 ABW 1995 Aruba 1320670391 Aruba 79805
# Inner join with dplyr on the same two key columns.
data <- gdp %>%
  inner_join(population, by = c("Country.Code", "Year"))
head(data, n = 10)
## Country.Name.x Country.Code Year Value.x Country.Name.y Value.y
## 1 Afghanistan AFG 2000 3521418060 Afghanistan 20130327
## 2 Afghanistan AFG 2001 2813571754 Afghanistan 20284307
## 3 Afghanistan AFG 2002 3825701439 Afghanistan 21378117
## 4 Afghanistan AFG 2003 4520946819 Afghanistan 22733049
## 5 Afghanistan AFG 2004 5224896719 Afghanistan 23560654
## 6 Afghanistan AFG 2005 6203256539 Afghanistan 24404567
## 7 Afghanistan AFG 2006 6971758282 Afghanistan 25424094
## 8 Afghanistan AFG 2007 9747886187 Afghanistan 25909852
## 9 Afghanistan AFG 2008 10109297048 Afghanistan 26482622
## 10 Afghanistan AFG 2009 12416152732 Afghanistan 27466101
# Left join with base R: all.x = TRUE keeps every gdp row, filling the
# population columns with NA where there is no match.
data <- merge(
  x = gdp,
  y = population,
  by = c("Country.Code", "Year"),
  all.x = TRUE
)
head(data, n = 10)
## Country.Code Year Country.Name.x Value.x Country.Name.y Value.y
## 1 ABW 1986 Aruba 405586592 Aruba 59931
## 2 ABW 1987 Aruba 487709497 Aruba 59159
## 3 ABW 1988 Aruba 596648045 Aruba 59331
## 4 ABW 1989 Aruba 695530726 Aruba 60443
## 5 ABW 1990 Aruba 764804469 Aruba 62753
## 6 ABW 1991 Aruba 872067039 Aruba 65896
## 7 ABW 1992 Aruba 958659218 Aruba 69005
## 8 ABW 1993 Aruba 1083240223 Aruba 73685
## 9 ABW 1994 Aruba 1245810056 Aruba 77595
## 10 ABW 1995 Aruba 1320670391 Aruba 79805
# Left join with dplyr: every gdp row is kept.
data <- gdp %>%
  left_join(population, by = c("Country.Code", "Year"))
head(data, n = 10)
## Country.Name.x Country.Code Year Value.x Country.Name.y Value.y
## 1 Afghanistan AFG 2000 3521418060 Afghanistan 20130327
## 2 Afghanistan AFG 2001 2813571754 Afghanistan 20284307
## 3 Afghanistan AFG 2002 3825701439 Afghanistan 21378117
## 4 Afghanistan AFG 2003 4520946819 Afghanistan 22733049
## 5 Afghanistan AFG 2004 5224896719 Afghanistan 23560654
## 6 Afghanistan AFG 2005 6203256539 Afghanistan 24404567
## 7 Afghanistan AFG 2006 6971758282 Afghanistan 25424094
## 8 Afghanistan AFG 2007 9747886187 Afghanistan 25909852
## 9 Afghanistan AFG 2008 10109297048 Afghanistan 26482622
## 10 Afghanistan AFG 2009 12416152732 Afghanistan 27466101
# Right join with base R: all.y = TRUE keeps every population row, filling the
# gdp columns with NA where there is no match.
data <- merge(
  x = gdp,
  y = population,
  by = c("Country.Code", "Year"),
  all.y = TRUE
)
head(data, n = 10)
## Country.Code Year Country.Name.x Value.x Country.Name.y Value.y
## 1 ABW 1960 <NA> NA Aruba 54922
## 2 ABW 1961 <NA> NA Aruba 55578
## 3 ABW 1962 <NA> NA Aruba 56320
## 4 ABW 1963 <NA> NA Aruba 57002
## 5 ABW 1964 <NA> NA Aruba 57619
## 6 ABW 1965 <NA> NA Aruba 58190
## 7 ABW 1966 <NA> NA Aruba 58694
## 8 ABW 1967 <NA> NA Aruba 58990
## 9 ABW 1968 <NA> NA Aruba 59069
## 10 ABW 1969 <NA> NA Aruba 59052
# Right join with dplyr: every population row is kept.
data <- gdp %>%
  right_join(population, by = c("Country.Code", "Year"))
head(data, n = 10)
## Country.Name.x Country.Code Year Value.x Country.Name.y Value.y
## 1 Afghanistan AFG 2000 3521418060 Afghanistan 20130327
## 2 Afghanistan AFG 2001 2813571754 Afghanistan 20284307
## 3 Afghanistan AFG 2002 3825701439 Afghanistan 21378117
## 4 Afghanistan AFG 2003 4520946819 Afghanistan 22733049
## 5 Afghanistan AFG 2004 5224896719 Afghanistan 23560654
## 6 Afghanistan AFG 2005 6203256539 Afghanistan 24404567
## 7 Afghanistan AFG 2006 6971758282 Afghanistan 25424094
## 8 Afghanistan AFG 2007 9747886187 Afghanistan 25909852
## 9 Afghanistan AFG 2008 10109297048 Afghanistan 26482622
## 10 Afghanistan AFG 2009 12416152732 Afghanistan 27466101
# Full outer join with base R: all = TRUE keeps the rows of both tables,
# with NA wherever one side has no match.
data <- merge(
  x = gdp,
  y = population,
  by = c("Country.Code", "Year"),
  all = TRUE
)
head(data, n = 10)
## Country.Code Year Country.Name.x Value.x Country.Name.y Value.y
## 1 ABW 1960 <NA> NA Aruba 54922
## 2 ABW 1961 <NA> NA Aruba 55578
## 3 ABW 1962 <NA> NA Aruba 56320
## 4 ABW 1963 <NA> NA Aruba 57002
## 5 ABW 1964 <NA> NA Aruba 57619
## 6 ABW 1965 <NA> NA Aruba 58190
## 7 ABW 1966 <NA> NA Aruba 58694
## 8 ABW 1967 <NA> NA Aruba 58990
## 9 ABW 1968 <NA> NA Aruba 59069
## 10 ABW 1969 <NA> NA Aruba 59052
# Full outer join with dplyr: rows from both tables are kept.
data <- gdp %>%
  full_join(population, by = c("Country.Code", "Year"))
head(data, n = 10)
## Country.Name.x Country.Code Year Value.x Country.Name.y Value.y
## 1 Afghanistan AFG 2000 3521418060 Afghanistan 20130327
## 2 Afghanistan AFG 2001 2813571754 Afghanistan 20284307
## 3 Afghanistan AFG 2002 3825701439 Afghanistan 21378117
## 4 Afghanistan AFG 2003 4520946819 Afghanistan 22733049
## 5 Afghanistan AFG 2004 5224896719 Afghanistan 23560654
## 6 Afghanistan AFG 2005 6203256539 Afghanistan 24404567
## 7 Afghanistan AFG 2006 6971758282 Afghanistan 25424094
## 8 Afghanistan AFG 2007 9747886187 Afghanistan 25909852
## 9 Afghanistan AFG 2008 10109297048 Afghanistan 26482622
## 10 Afghanistan AFG 2009 12416152732 Afghanistan 27466101
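group_by() splits a data frame into groups so that calculations can be performed separately for each group. %>% is the pipe operator in R: it passes the result of one step to the next step as its first argument. In the example below, the revenue is totalled for each company (the company names are stored in the column named country).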
# Example revenue table: the same five companies listed for 2025 and then for
# 2024. The company names are stored in the `country` column.
sales <- data.frame(
  country = rep(
    c("BK Group Plc", "MTN Rwanda", "Bralirwa", "Cimerwa", "Equity Bank Ltd"),
    times = 2
  ),
  year = rep(c(2025, 2024), each = 5),
  revenue = c(
    281, 295, 263, 155, 36, # 2025 rows
    280, 281, 275, 134, 40  # 2024 rows
  )
)
# Sum the revenue within each group of the `country` column.
sales %>%
  group_by(country) %>%
  summarise(total_revenue = sum(revenue))
## # A tibble: 5 × 2
## country total_revenue
## <chr> <dbl>
## 1 BK Group Plc 561
## 2 Bralirwa 538
## 3 Cimerwa 289
## 4 Equity Bank Ltd 76
## 5 MTN Rwanda 576
trace() and recover() are debugging tools. They help you find out where and why code fails.
These are built into base R.
trace() in R is a debugging tool used to temporarily
insert code into an existing function without modifying the original
source code.
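It is useful for seeing when a function is called, and for running extra code at that moment, without editing the function itself.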
Example:
# Attach a tracer to mean(): the quoted print() call is evaluated on entry to
# each mean() call until the tracer is removed with untrace(mean).
trace(mean, quote(print('Trace called')))
## Tracing function "mean" in package "base"
## [1] "mean"
Now, whenever mean() is called, R reports the call and runs the inserted tracer code before the function body.
# While the tracer is attached, the call is reported on entry and the tracer
# expression is evaluated before the result is returned.
mean(c(1,2,3))
## Tracing mean(c(1, 2, 3)) on entry
## [1] "Trace called"
## [1] 2
# Remove the tracer so that mean() runs without the extra output again.
untrace(mean)
## Untracing function "mean" in package "base"
mean(c(1,2,3))
## [1] 2
recover() lets you enter the environment where the error happened and inspect variables interactively.
Enable recover mode:
# Install recover() as the handler that R calls when an error is not caught.
# NOTE: this is a session-wide option; options(error = NULL) restores the
# default behaviour.
options(error = recover)
Now whenever an error occurs, R enters debugging mode.
Example
# Divide x by y.
# The quotient is returned invisibly, so a bare call at the console prints
# nothing. Non-numeric input makes `/` raise an error.
divide <- function(x, y) {
  invisible(x / y)
}
# Deliberate failure: 'a' is not numeric, so x / y raises an error.
divide(1, 'a')
## Error in x/y: non-numeric argument to binary operator
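What happens: instead of stopping normally, an interactive R session lists the active function calls and shows the prompt "Enter a frame number, or 0 to exit". In the rendered (non-interactive) output above, only the error message is printed.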
lapply(), sapply() and vapply()
These functions are used for iteration and functional programming in R.
lapply() — Returns a List
numbers <- list(1, 2, 3, 4)
# Double every element; lapply() returns a list with one entry per input.
result <- lapply(X = numbers, FUN = function(value) value * 2)
print(result)
## [[1]]
## [1] 2
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 6
##
## [[4]]
## [1] 8
sapply() — Simplifies Output
sapply() simplifies the result into a vector, matrix, or array whenever possible.
# Every call returns a single number, so sapply() collapses the results into
# a plain numeric vector.
numbers <- as.list(c(1, 2, 3, 4))
result <- sapply(X = numbers, FUN = function(value) value * 2)
print(result)
## [1] 2 4 6 8
# Every call returns two values (the number and its square), so sapply()
# binds the results as the columns of a matrix.
numbers <- as.list(c(1, 2, 3))
result <- sapply(X = numbers, FUN = function(value) c(value, value^2))
print(result)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 1 4 9
vapply() — Safer Version of sapply()
vapply() requires specifying the expected type and length of each result (the FUN.VALUE argument).
vapply(X, FUN, FUN.VALUE)
# FUN.VALUE = numeric(1) declares that every call must return one number;
# vapply() stops with an error if a call returns anything else.
numbers <- as.list(c(1, 2, 3, 4))
result <- vapply(
  X = numbers,
  FUN = function(value) value * 2,
  FUN.VALUE = numeric(1)
)
print(result)
## [1] 2 4 6 8
sapply() and vapply()

| sapply() | vapply() |
|---|---|
| Automatically guesses output type | Requires fixed output type |
| Easier to write | Safer and more predictable |
| May return unexpected structures | Consistent output |
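In this section the mean, median, mode and quantiles are computed without using R built-in statistical functions such as mean(), median(), quantile() and Mode() (Mode() is not part of base R).
We will use:
- sapply()
- lapply()
- vapply()
and core R logic.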
# Sample data shared by the hand-written statistics below.
numbers <- c(10, 20, 30, 40, 50, 20, 30, 30)
\[ Mean = \frac{\sum x}{n} \]
# Arithmetic mean of x, computed without mean().
#
# x: numeric vector (or list of single numbers).
# Returns the sum of the elements divided by their count. An empty x gives
# NaN (0 / 0). Missing values are not removed, so any NA makes the result NA.
my_mean <- function(x) {
  # Fold the values left to right, starting from 0. This keeps the running
  # total local instead of updating an outer variable with `<<-`.
  total <- Reduce(`+`, x, 0)
  total / length(x)
}
my_mean(numbers)
## [1] 28.75
# Median of x, computed without median().
#
# x: numeric vector.
# Returns the middle value of the sorted data, or the average of the two
# middle values when the count is even. Missing values are ignored because
# sort() drops them. Returns NA_real_ when no values remain (empty or all-NA
# input) instead of a zero-length vector.
my_median <- function(x) {
  sorted <- sort(x)
  n <- length(sorted)
  if (n == 0) {
    return(NA_real_)
  }
  if (n %% 2 == 0) {
    # Even count: average the two values around the centre.
    upper_mid <- n / 2 + 1
    (sorted[upper_mid - 1] + sorted[upper_mid]) / 2
  } else {
    # Odd count: the single central value.
    sorted[(n + 1) / 2]
  }
}
my_median(numbers)
## [1] 30
The most frequently occurring value.
# Mode of x: the value that occurs most often.
#
# x: atomic vector (numeric, character, ...).
# Returns the most frequent value; when several values tie, the one that
# appears first in x is returned. Missing values are dropped before counting.
# An empty input gives a zero-length result.
my_mode <- function(x) {
  x <- x[!is.na(x)]
  candidates <- unique(x)
  # Count the occurrences of each distinct value. vapply() guarantees an
  # integer vector, even when there are no candidates.
  frequencies <- vapply(
    candidates,
    function(value) sum(x == value),
    integer(1),
    USE.NAMES = FALSE
  )
  # which.max() returns the first maximum, which gives the tie-breaking rule.
  candidates[which.max(frequencies)]
}
my_mode(numbers)
## [1] 30
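Quantiles divide ordered data into intervals.
Example:
- 0.25 = first quartile
- 0.50 = median
- 0.75 = third quartile
\[ Position = (n - 1)p + 1 \]
where:
- \(n\) = number of observations
- \(p\) = probability
When the position is not a whole number, the quantile is obtained by linear interpolation between the two neighbouring sorted values.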
# Quantiles of x, computed without quantile().
#
# x:     numeric vector; missing values are ignored because sort() drops them.
# probs: numeric vector of probabilities, each between 0 and 1.
#
# For each probability p the position (n - 1) * p + 1 in the sorted data is
# computed; a fractional position is resolved by linear interpolation between
# the two neighbouring values.
#
# Returns a numeric vector with one quantile per element of probs (NA for
# every element when x has no non-missing values).
# Stops with an error when probs contains NA or values outside [0, 1].
my_quantile <- function(x, probs) {
  if (anyNA(probs) || any(probs < 0 | probs > 1)) {
    stop("'probs' must be between 0 and 1", call. = FALSE)
  }
  sorted <- sort(x)
  n <- length(sorted)
  if (n == 0) {
    return(rep(NA_real_, length(probs)))
  }
  # vapply() guarantees a numeric vector, even when probs is empty.
  vapply(
    probs,
    function(p) {
      position <- (n - 1) * p + 1
      lower <- floor(position)
      upper <- ceiling(position)
      if (lower == upper) {
        # The position falls exactly on an observation.
        return(sorted[lower])
      }
      sorted[lower] + (position - lower) * (sorted[upper] - sorted[lower])
    },
    numeric(1)
  )
}
# Quartiles of the sample data.
my_quantile(numbers, probs = c(0.25, 0.50, 0.75))
## [1] 20.0 30.0 32.5