library(pdftools)
## Using poppler version 26.01.0
library(readxl)
library(DBI)
library(RSQLite)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)

Assignment 1

Extracting from CSV file

data <- read.csv("usa_cars.csv")
head(data,10)
##    X price     brand   model year  title_status mileage  color
## 1  0  6300    toyota cruiser 2008 clean vehicle  274117  black
## 2  1  2899      ford      se 2011 clean vehicle  190552 silver
## 3  2  5350     dodge     mpv 2018 clean vehicle   39590 silver
## 4  3 25000      ford    door 2014 clean vehicle   64146   blue
## 5  4 27700 chevrolet    1500 2018 clean vehicle    6654    red
## 6  5  5700     dodge     mpv 2018 clean vehicle   45561  white
## 7  6  7300 chevrolet      pk 2010 clean vehicle  149050  black
## 8  7 13350       gmc    door 2017 clean vehicle   23525   gray
## 9  8 14600 chevrolet  malibu 2018 clean vehicle    9371 silver
## 10 9  5250      ford     mpv 2017 clean vehicle   63418  black
##                    vin       lot      state country     condition
## 1    jtezu11f88k007763 159348797 new jersey     usa  10 days left
## 2    2fmdk3gc4bbb02217 166951262  tennessee     usa   6 days left
## 3    3c4pdcgg5jt346413 167655728    georgia     usa   2 days left
## 4    1ftfw1et4efc23745 167753855   virginia     usa 22 hours left
## 5    3gcpcrec2jg473991 167763266    florida     usa 22 hours left
## 6    2c4rdgeg9jr237989 167655771      texas     usa   2 days left
## 7    1gcsksea1az121133 167753872    georgia     usa 22 hours left
## 8    1gks2gkc3hr326762 167692494 california     usa 20 hours left
## 9    1g1zd5st5jf191860 167763267    florida     usa 22 hours left
## 10   2fmpk3j92hbc12542 167656121      texas     usa   2 days left

Extracting from Excel file

data <- read_excel("usa_cars.xls")
## New names:
## • `` -> `...1`
head(data,10)
## # A tibble: 10 × 13
##     ...1 price brand   model  year title_status mileage color vin      lot state
##    <dbl> <dbl> <chr>   <chr> <dbl> <chr>          <dbl> <chr> <chr>  <dbl> <chr>
##  1     0  6300 toyota  crui…  2008 clean vehic…  274117 black jtez… 1.59e8 new …
##  2     1  2899 ford    se     2011 clean vehic…  190552 silv… 2fmd… 1.67e8 tenn…
##  3     2  5350 dodge   mpv    2018 clean vehic…   39590 silv… 3c4p… 1.68e8 geor…
##  4     3 25000 ford    door   2014 clean vehic…   64146 blue  1ftf… 1.68e8 virg…
##  5     4 27700 chevro… 1500   2018 clean vehic…    6654 red   3gcp… 1.68e8 flor…
##  6     5  5700 dodge   mpv    2018 clean vehic…   45561 white 2c4r… 1.68e8 texas
##  7     6  7300 chevro… pk     2010 clean vehic…  149050 black 1gcs… 1.68e8 geor…
##  8     7 13350 gmc     door   2017 clean vehic…   23525 gray  1gks… 1.68e8 cali…
##  9     8 14600 chevro… mali…  2018 clean vehic…    9371 silv… 1g1z… 1.68e8 flor…
## 10     9  5250 ford    mpv    2017 clean vehic…   63418 black 2fmp… 1.68e8 texas
## # ℹ 2 more variables: country <chr>, condition <chr>

Extracting from database

# Open the SQLite file, read the whole usa_cars table, and close the
# connection afterwards. `finally` runs whether or not the query succeeds,
# so a failing query cannot leave the connection open.
con <- dbConnect(RSQLite::SQLite(), "usa-cars.db")
data <- tryCatch(
  dbGetQuery(con, "select * from usa_cars"),
  finally = dbDisconnect(con)
)
head(data, 10)
##                                 price_brand_model_year_title_status_mileage_color_vin_lot_state_country_condition
## 1  6300,toyota,cruiser,2008,clean vehicle,274117,black,  jtezu11f88k007763,159348797,new jersey, usa,10 days left
## 2          2899,ford,se,2011,clean vehicle,190552,silver,  2fmdk3gc4bbb02217,166951262,tennessee, usa,6 days left
## 3           5350,dodge,mpv,2018,clean vehicle,39590,silver,  3c4pdcgg5jt346413,167655728,georgia, usa,2 days left
## 4         25000,ford,door,2014,clean vehicle,64146,blue,  1ftfw1et4efc23745,167753855,virginia, usa,22 hours left
## 5       27700,chevrolet,1500,2018,clean vehicle,6654,red,  3gcpcrec2jg473991,167763266,florida, usa,22 hours left
## 6              5700,dodge,mpv,2018,clean vehicle,45561,white,  2c4rdgeg9jr237989,167655771,texas, usa,2 days left
## 7      7300,chevrolet,pk,2010,clean vehicle,149050,black,  1gcsksea1az121133,167753872,georgia, usa,22 hours left
## 8        13350,gmc,door,2017,clean vehicle,23525,gray,  1gks2gkc3hr326762,167692494,california, usa,20 hours left
## 9  14600,chevrolet,malibu,2018,clean vehicle,9371,silver,  1g1zd5st5jf191860,167763267,florida, usa,22 hours left
## 10              5250,ford,mpv,2017,clean vehicle,63418,black,  2fmpk3j92hbc12542,167656121,texas, usa,2 days left

Assignment 2

Merging datasets from 2 to 3 variables

# Load gdp.csv from the working directory and preview its first rows.
gdp <- read.csv("gdp.csv")
head(gdp)
##   Country.Name Country.Code Year      Value
## 1  Afghanistan          AFG 2000 3521418060
## 2  Afghanistan          AFG 2001 2813571754
## 3  Afghanistan          AFG 2002 3825701439
## 4  Afghanistan          AFG 2003 4520946819
## 5  Afghanistan          AFG 2004 5224896719
## 6  Afghanistan          AFG 2005 6203256539
# Load population.csv from the working directory and preview its first rows.
population <- read.csv("population.csv")
head(population)
##   Country.Name Country.Code Year Value
## 1        Aruba          ABW 1960 54922
## 2        Aruba          ABW 1961 55578
## 3        Aruba          ABW 1962 56320
## 4        Aruba          ABW 1963 57002
## 5        Aruba          ABW 1964 57619
## 6        Aruba          ABW 1965 58190
data <- merge(gdp, population, by = c("Country.Code", "Year"))
head(data, 10)
##    Country.Code Year Country.Name.x    Value.x Country.Name.y Value.y
## 1           ABW 1986          Aruba  405586592          Aruba   59931
## 2           ABW 1987          Aruba  487709497          Aruba   59159
## 3           ABW 1988          Aruba  596648045          Aruba   59331
## 4           ABW 1989          Aruba  695530726          Aruba   60443
## 5           ABW 1990          Aruba  764804469          Aruba   62753
## 6           ABW 1991          Aruba  872067039          Aruba   65896
## 7           ABW 1992          Aruba  958659218          Aruba   69005
## 8           ABW 1993          Aruba 1083240223          Aruba   73685
## 9           ABW 1994          Aruba 1245810056          Aruba   77595
## 10          ABW 1995          Aruba 1320670391          Aruba   79805
data <- inner_join(gdp, population, by = c("Country.Code", "Year"))
head(data, 10)
##    Country.Name.x Country.Code Year     Value.x Country.Name.y  Value.y
## 1     Afghanistan          AFG 2000  3521418060    Afghanistan 20130327
## 2     Afghanistan          AFG 2001  2813571754    Afghanistan 20284307
## 3     Afghanistan          AFG 2002  3825701439    Afghanistan 21378117
## 4     Afghanistan          AFG 2003  4520946819    Afghanistan 22733049
## 5     Afghanistan          AFG 2004  5224896719    Afghanistan 23560654
## 6     Afghanistan          AFG 2005  6203256539    Afghanistan 24404567
## 7     Afghanistan          AFG 2006  6971758282    Afghanistan 25424094
## 8     Afghanistan          AFG 2007  9747886187    Afghanistan 25909852
## 9     Afghanistan          AFG 2008 10109297048    Afghanistan 26482622
## 10    Afghanistan          AFG 2009 12416152732    Afghanistan 27466101
data <- merge(gdp, population, by = c("Country.Code", "Year"), all.x = TRUE)
head(data, 10)
##    Country.Code Year Country.Name.x    Value.x Country.Name.y Value.y
## 1           ABW 1986          Aruba  405586592          Aruba   59931
## 2           ABW 1987          Aruba  487709497          Aruba   59159
## 3           ABW 1988          Aruba  596648045          Aruba   59331
## 4           ABW 1989          Aruba  695530726          Aruba   60443
## 5           ABW 1990          Aruba  764804469          Aruba   62753
## 6           ABW 1991          Aruba  872067039          Aruba   65896
## 7           ABW 1992          Aruba  958659218          Aruba   69005
## 8           ABW 1993          Aruba 1083240223          Aruba   73685
## 9           ABW 1994          Aruba 1245810056          Aruba   77595
## 10          ABW 1995          Aruba 1320670391          Aruba   79805
data <- left_join(gdp, population, by = c("Country.Code", "Year"))
head(data, 10)
##    Country.Name.x Country.Code Year     Value.x Country.Name.y  Value.y
## 1     Afghanistan          AFG 2000  3521418060    Afghanistan 20130327
## 2     Afghanistan          AFG 2001  2813571754    Afghanistan 20284307
## 3     Afghanistan          AFG 2002  3825701439    Afghanistan 21378117
## 4     Afghanistan          AFG 2003  4520946819    Afghanistan 22733049
## 5     Afghanistan          AFG 2004  5224896719    Afghanistan 23560654
## 6     Afghanistan          AFG 2005  6203256539    Afghanistan 24404567
## 7     Afghanistan          AFG 2006  6971758282    Afghanistan 25424094
## 8     Afghanistan          AFG 2007  9747886187    Afghanistan 25909852
## 9     Afghanistan          AFG 2008 10109297048    Afghanistan 26482622
## 10    Afghanistan          AFG 2009 12416152732    Afghanistan 27466101
data <- merge(gdp, population, by = c("Country.Code", "Year"), all.y = TRUE)
head(data, 10)
##    Country.Code Year Country.Name.x Value.x Country.Name.y Value.y
## 1           ABW 1960           <NA>      NA          Aruba   54922
## 2           ABW 1961           <NA>      NA          Aruba   55578
## 3           ABW 1962           <NA>      NA          Aruba   56320
## 4           ABW 1963           <NA>      NA          Aruba   57002
## 5           ABW 1964           <NA>      NA          Aruba   57619
## 6           ABW 1965           <NA>      NA          Aruba   58190
## 7           ABW 1966           <NA>      NA          Aruba   58694
## 8           ABW 1967           <NA>      NA          Aruba   58990
## 9           ABW 1968           <NA>      NA          Aruba   59069
## 10          ABW 1969           <NA>      NA          Aruba   59052
data <- right_join(gdp, population, by = c("Country.Code", "Year"))
head(data, 10)
##    Country.Name.x Country.Code Year     Value.x Country.Name.y  Value.y
## 1     Afghanistan          AFG 2000  3521418060    Afghanistan 20130327
## 2     Afghanistan          AFG 2001  2813571754    Afghanistan 20284307
## 3     Afghanistan          AFG 2002  3825701439    Afghanistan 21378117
## 4     Afghanistan          AFG 2003  4520946819    Afghanistan 22733049
## 5     Afghanistan          AFG 2004  5224896719    Afghanistan 23560654
## 6     Afghanistan          AFG 2005  6203256539    Afghanistan 24404567
## 7     Afghanistan          AFG 2006  6971758282    Afghanistan 25424094
## 8     Afghanistan          AFG 2007  9747886187    Afghanistan 25909852
## 9     Afghanistan          AFG 2008 10109297048    Afghanistan 26482622
## 10    Afghanistan          AFG 2009 12416152732    Afghanistan 27466101
data <- merge(gdp, population, by = c("Country.Code", "Year"), all = TRUE)
head(data, 10)
##    Country.Code Year Country.Name.x Value.x Country.Name.y Value.y
## 1           ABW 1960           <NA>      NA          Aruba   54922
## 2           ABW 1961           <NA>      NA          Aruba   55578
## 3           ABW 1962           <NA>      NA          Aruba   56320
## 4           ABW 1963           <NA>      NA          Aruba   57002
## 5           ABW 1964           <NA>      NA          Aruba   57619
## 6           ABW 1965           <NA>      NA          Aruba   58190
## 7           ABW 1966           <NA>      NA          Aruba   58694
## 8           ABW 1967           <NA>      NA          Aruba   58990
## 9           ABW 1968           <NA>      NA          Aruba   59069
## 10          ABW 1969           <NA>      NA          Aruba   59052
data <- full_join(gdp, population, by = c("Country.Code", "Year"))
head(data, 10)
##    Country.Name.x Country.Code Year     Value.x Country.Name.y  Value.y
## 1     Afghanistan          AFG 2000  3521418060    Afghanistan 20130327
## 2     Afghanistan          AFG 2001  2813571754    Afghanistan 20284307
## 3     Afghanistan          AFG 2002  3825701439    Afghanistan 21378117
## 4     Afghanistan          AFG 2003  4520946819    Afghanistan 22733049
## 5     Afghanistan          AFG 2004  5224896719    Afghanistan 23560654
## 6     Afghanistan          AFG 2005  6203256539    Afghanistan 24404567
## 7     Afghanistan          AFG 2006  6971758282    Afghanistan 25424094
## 8     Afghanistan          AFG 2007  9747886187    Afghanistan 25909852
## 9     Afghanistan          AFG 2008 10109297048    Afghanistan 26482622
## 10    Afghanistan          AFG 2009 12416152732    Afghanistan 27466101

Assignment 3

How group_by() works

group_by() is used to split data into groups so you can perform calculations separately for each group. %>% is called the pipe operator in R. The pipe passes the result from one step into the next step.

# Revenue for five entities over two years (2025 rows first, then 2024).
# NOTE(review): the `country` column appears to hold company names.
sales <- data.frame(
  country = rep(
    c("BK Group Plc", "MTN Rwanda", "Bralirwa", "Cimerwa", "Equity Bank Ltd"),
    times = 2
  ),
  year = rep(c(2025, 2024), each = 5),
  revenue = c(281, 295, 263, 155, 36, 280, 281, 275, 134, 40)
)
# One row per group, with revenue summed across both years.
sales %>%
  group_by(country) %>%
  summarise(total_revenue = sum(revenue))
## # A tibble: 5 × 2
##   country         total_revenue
##   <chr>                   <dbl>
## 1 BK Group Plc              561
## 2 Bralirwa                  538
## 3 Cimerwa                   289
## 4 Equity Bank Ltd            76
## 5 MTN Rwanda                576

Assignment 4

How to use trace() & recover()

They help you:

  • see where an error happened
  • inspect function calls
  • debug complex code

These are built into base R.

trace() in R is a debugging tool used to temporarily insert code into an existing function without modifying the original source code.

It is useful for:

  • Debugging package functions
  • Monitoring function calls
  • Inspecting arguments
  • Logging execution flow

Basic Idea

Example:

trace(mean, quote(print('Trace called')))
## Tracing function "mean" in package "base"
## [1] "mean"

Now, whenever mean() is called, R first evaluates the traced expression (here, printing "Trace called") and then runs mean() as usual.

mean(c(1,2,3))
## Tracing mean(c(1, 2, 3)) on entry 
## [1] "Trace called"
## [1] 2

Remove Trace

untrace(mean)
## Untracing function "mean" in package "base"

Check trace removed

mean(c(1,2,3))
## [1] 2

2. recover()

recover() lets you enter the environment where the error happened and inspect variables interactively.

Enable Recover Mode

options(error = recover)

Now whenever an error occurs, R enters debugging mode.

Example

# Divide `x` by `y` and return the quotient.
#
# x, y: values accepted by `/`; a non-numeric argument makes `/` signal an
#   error, which the call that follows uses to demonstrate recover().
# Returns the quotient visibly (the previous version assigned it to an unused
#   local, so the value came back invisibly).
divide <- function(x, y) {
  x / y
}
divide(1, 'a')
## Error in x/y: non-numeric argument to binary operator

What Happens

In an interactive session, instead of stopping normally, R lists the active function calls and prompts: Enter a frame number, or 0 to exit

Assignment 5

  • lapply()
  • sapply()
  • vapply()

These functions are used for iteration and functional programming in R.

1. lapply() — Returns a List

numbers <- list(1, 2, 3, 4)
result <- lapply(numbers, function(x) x * 2)
print(result)
## [[1]]
## [1] 2
## 
## [[2]]
## [1] 4
## 
## [[3]]
## [1] 6
## 
## [[4]]
## [1] 8

2. sapply() — Simplifies Output

sapply() simplifies the result into vectors, matrices, or arrays whenever possible.

Example — Producing a Vector

numbers <- list(1, 2, 3, 4)
result <- sapply(numbers, function(x) x * 2)
print(result)
## [1] 2 4 6 8

Example — Producing a Matrix

numbers <- list(1, 2, 3)
result <- sapply(numbers, function(x) c(x, x^2))
print(result)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    1    4    9

3. vapply() — Safer Version of sapply()

vapply() requires specifying the expected output type.

Syntax

vapply(X, FUN, FUN.VALUE)

Example — Numeric Vector

# Double each element. FUN.VALUE = numeric(1) declares that every call must
# return exactly one number, so the result is always a numeric vector.
numbers <- list(1, 2, 3, 4)
result <- vapply(
  X = numbers,
  FUN = function(x) x * 2,
  FUN.VALUE = numeric(1)
)
print(result)
## [1] 2 4 6 8

Difference Between sapply() and vapply()

| sapply()                          | vapply()                   |
|-----------------------------------|----------------------------|
| Automatically guesses output type | Requires fixed output type |
| Easier to write                   | Safer and more predictable |
| May return unexpected structures  | Consistent output          |

Assignment 6

  • Mean
  • Median
  • Mode
  • Quantiles

The goal is to compute the statistics listed above without using R built-in statistical functions such as:

  • mean()
  • median()
  • quantile()
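  • Mode() (not part of base R; base R's mode() reports an object's storage mode, not the most frequent value)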

We will use: - sapply() - lapply() - vapply()

and core R logic.

Sample Data

numbers <- c(10,20,30,40,50,20,30,30)

1. Mean Function

Formula

\[ Mean = \frac{\sum x}{n} \]

Implementation

# Arithmetic mean of `x`, computed without calling mean().
#
# x: a numeric or logical vector, or a list whose elements are each a single
#   number.
# Returns one double: the sum of the values divided by their count. The
#   result is NaN for zero-length input (0 / 0) and NA if any value is NA.
# Errors if an element is non-numeric or is not exactly one value.
my_mean <- function(x) {
  # `value + 0` applies the same arithmetic check a running total would
  # (non-numeric input errors, logical/integer become double), while
  # vapply() guarantees one number per element. No accumulator has to be
  # mutated with `<<-` from inside the callback.
  values <- vapply(x, function(value) value + 0, numeric(1))
  sum(values) / length(values)
}

Test

my_mean(numbers)
## [1] 28.75

2. Custom Median Function

Steps

  1. Sort values
  2. Find middle position
  3. If even:
    • average middle two values
  4. If odd:
    • return middle value

Implementation

# Median of `x`, computed without calling median().
#
# x: a vector that sort() can order; sort() drops NA values by default.
# Returns the middle sorted value when the count is odd, otherwise the
#   average of the two middle sorted values.
my_median <- function(x) {
  ordered <- sort(x)
  count <- length(ordered)

  # Odd count: a single middle element exists.
  if (count %% 2 != 0) {
    return(ordered[(count + 1) / 2])
  }

  # Even count: average the two elements on either side of the midpoint.
  half <- count / 2
  (ordered[half] + ordered[half + 1]) / 2
}

Test

my_median(numbers)
## [1] 30

3. Custom Mode Function

Mode

The most frequently occurring value.

Implementation

# Statistical mode of `x`: the value that occurs most often.
#
# x: an atomic vector. NA entries are ignored (previously a single NA made
#   the `if (item == value)` comparison fail with an error).
# Returns a length-one vector holding the most frequent value. When several
#   values tie, the one that appears first in `x` wins, because which.max()
#   returns the first maximum. The result is zero-length if `x` has no
#   non-missing values.
my_mode <- function(x) {
  observed <- x[!is.na(x)]
  candidates <- unique(observed)
  # One vectorised comparison per distinct value replaces the nested
  # element-by-element loop and its `<<-` counter.
  frequencies <- vapply(
    candidates,
    function(value) sum(observed == value),
    integer(1)
  )
  candidates[which.max(frequencies)]
}

Test

my_mode(numbers)
## [1] 30

4. Custom Quantile Function

Quantiles

Quantiles divide ordered data into intervals.

Example: - 0.25 = first quartile - 0.50 = median - 0.75 = third quartile

Formula

\[ Position = (n - 1)p + 1 \]

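where:

  • \(n\) = number of observations
  • \(p\) = probability, with \(0 \le p \le 1\)

When the position is not a whole number, the quantile is linearly interpolated between the two neighbouring sorted values. This is the same rule used by the default method (type 7) of R's quantile().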

Implementation

# Sample quantiles of `x`, computed without calling quantile().
#
# x: a numeric vector; NA values are dropped by sort().
# probs: numeric vector of probabilities, each in [0, 1].
# Returns a numeric vector with one quantile per element of `probs`, using
#   position = (n - 1) * p + 1 on the sorted data and linear interpolation
#   between neighbouring values. Every entry is NA when `x` has no
#   non-missing values.
# Errors if `probs` contains NA or a value outside [0, 1]; such values used
#   to index outside the data and silently yield NA or malformed results.
my_quantile <- function(x, probs) {
  if (anyNA(probs) || any(probs < 0 | probs > 1)) {
    stop("'probs' must be non-missing values in [0, 1]", call. = FALSE)
  }

  sorted <- sort(x)
  n <- length(sorted)
  if (n == 0) {
    return(rep(NA_real_, length(probs)))
  }

  # vapply() pins the result to one number per probability.
  vapply(probs, function(p) {
    position <- (n - 1) * p + 1
    lower <- floor(position)
    upper <- ceiling(position)
    if (lower == upper) {
      # Position lands exactly on an observation.
      sorted[lower]
    } else {
      lower_value <- sorted[lower]
      upper_value <- sorted[upper]
      lower_value + (position - lower) * (upper_value - lower_value)
    }
  }, numeric(1))
}

Test

my_quantile(numbers,probs = c(0.25, 0.5, 0.75))
## [1] 20.0 30.0 32.5