ASSIGNMENT.knit

library(pdftools)

## Using poppler version 26.01.0

library(readxl)
library(DBI)
library(RSQLite)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(magrittr)

Assignment 1

Extracting from a CSV file

data <- read.csv("usa_cars.csv")
head(data,10)

##    X price     brand   model year  title_status mileage  color
## 1  0  6300    toyota cruiser 2008 clean vehicle  274117  black
## 2  1  2899      ford      se 2011 clean vehicle  190552 silver
## 3  2  5350     dodge     mpv 2018 clean vehicle   39590 silver
## 4  3 25000      ford    door 2014 clean vehicle   64146   blue
## 5  4 27700 chevrolet    1500 2018 clean vehicle    6654    red
## 6  5  5700     dodge     mpv 2018 clean vehicle   45561  white
## 7  6  7300 chevrolet      pk 2010 clean vehicle  149050  black
## 8  7 13350       gmc    door 2017 clean vehicle   23525   gray
## 9  8 14600 chevrolet  malibu 2018 clean vehicle    9371 silver
## 10 9  5250      ford     mpv 2017 clean vehicle   63418  black
##                    vin       lot      state country     condition
## 1    jtezu11f88k007763 159348797 new jersey     usa  10 days left
## 2    2fmdk3gc4bbb02217 166951262  tennessee     usa   6 days left
## 3    3c4pdcgg5jt346413 167655728    georgia     usa   2 days left
## 4    1ftfw1et4efc23745 167753855   virginia     usa 22 hours left
## 5    3gcpcrec2jg473991 167763266    florida     usa 22 hours left
## 6    2c4rdgeg9jr237989 167655771      texas     usa   2 days left
## 7    1gcsksea1az121133 167753872    georgia     usa 22 hours left
## 8    1gks2gkc3hr326762 167692494 california     usa 20 hours left
## 9    1g1zd5st5jf191860 167763267    florida     usa 22 hours left
## 10   2fmpk3j92hbc12542 167656121      texas     usa   2 days left

Extracting from an Excel file

data <- read_excel("usa_cars.xls")

## New names:
## • `` -> `...1`

head(data,10)

## # A tibble: 10 × 13
##     ...1 price brand   model  year title_status mileage color vin      lot state
##    <dbl> <dbl> <chr>   <chr> <dbl> <chr>          <dbl> <chr> <chr>  <dbl> <chr>
##  1     0  6300 toyota  crui…  2008 clean vehic…  274117 black jtez… 1.59e8 new …
##  2     1  2899 ford    se     2011 clean vehic…  190552 silv… 2fmd… 1.67e8 tenn…
##  3     2  5350 dodge   mpv    2018 clean vehic…   39590 silv… 3c4p… 1.68e8 geor…
##  4     3 25000 ford    door   2014 clean vehic…   64146 blue  1ftf… 1.68e8 virg…
##  5     4 27700 chevro… 1500   2018 clean vehic…    6654 red   3gcp… 1.68e8 flor…
##  6     5  5700 dodge   mpv    2018 clean vehic…   45561 white 2c4r… 1.68e8 texas
##  7     6  7300 chevro… pk     2010 clean vehic…  149050 black 1gcs… 1.68e8 geor…
##  8     7 13350 gmc     door   2017 clean vehic…   23525 gray  1gks… 1.68e8 cali…
##  9     8 14600 chevro… mali…  2018 clean vehic…    9371 silv… 1g1z… 1.68e8 flor…
## 10     9  5250 ford    mpv    2017 clean vehic…   63418 black 2fmp… 1.68e8 texas
## # ℹ 2 more variables: country <chr>, condition <chr>

Extracting from database

# Open the SQLite file, pull the whole usa_cars table into a data frame, and
# close the connection before previewing the first 10 rows.
# NOTE(review): the printed result has a single comma-joined column, so the
# table looks like it was loaded without splitting the CSV fields - confirm.
con <- dbConnect(drv = RSQLite::SQLite(), dbname = "usa-cars.db")
data <- dbGetQuery(conn = con, statement = "select * from usa_cars")
dbDisconnect(conn = con)
head(data, n = 10)

##                                 price_brand_model_year_title_status_mileage_color_vin_lot_state_country_condition
## 1  6300,toyota,cruiser,2008,clean vehicle,274117,black,  jtezu11f88k007763,159348797,new jersey, usa,10 days left
## 2          2899,ford,se,2011,clean vehicle,190552,silver,  2fmdk3gc4bbb02217,166951262,tennessee, usa,6 days left
## 3           5350,dodge,mpv,2018,clean vehicle,39590,silver,  3c4pdcgg5jt346413,167655728,georgia, usa,2 days left
## 4         25000,ford,door,2014,clean vehicle,64146,blue,  1ftfw1et4efc23745,167753855,virginia, usa,22 hours left
## 5       27700,chevrolet,1500,2018,clean vehicle,6654,red,  3gcpcrec2jg473991,167763266,florida, usa,22 hours left
## 6              5700,dodge,mpv,2018,clean vehicle,45561,white,  2c4rdgeg9jr237989,167655771,texas, usa,2 days left
## 7      7300,chevrolet,pk,2010,clean vehicle,149050,black,  1gcsksea1az121133,167753872,georgia, usa,22 hours left
## 8        13350,gmc,door,2017,clean vehicle,23525,gray,  1gks2gkc3hr326762,167692494,california, usa,20 hours left
## 9  14600,chevrolet,malibu,2018,clean vehicle,9371,silver,  1g1zd5st5jf191860,167763267,florida, usa,22 hours left
## 10              5250,ford,mpv,2017,clean vehicle,63418,black,  2fmpk3j92hbc12542,167656121,texas, usa,2 days left

Assignment 2

Merging datasets from 2 to 3 variables

# Load the GDP table and preview its first rows.
gdp <- read.csv("gdp.csv")
head(gdp)

##   Country.Name Country.Code Year      Value
## 1  Afghanistan          AFG 2000 3521418060
## 2  Afghanistan          AFG 2001 2813571754
## 3  Afghanistan          AFG 2002 3825701439
## 4  Afghanistan          AFG 2003 4520946819
## 5  Afghanistan          AFG 2004 5224896719
## 6  Afghanistan          AFG 2005 6203256539

# Load the population table and preview its first rows.
population <- read.csv("population.csv")
head(population)

##   Country.Name Country.Code Year Value
## 1        Aruba          ABW 1960 54922
## 2        Aruba          ABW 1961 55578
## 3        Aruba          ABW 1962 56320
## 4        Aruba          ABW 1963 57002
## 5        Aruba          ABW 1964 57619
## 6        Aruba          ABW 1965 58190

# Inner join with base merge(): keeps only the Country.Code/Year pairs found in
# both tables. The non-key columns that share a name (Country.Name, Value) get
# the .x (gdp) and .y (population) suffixes. merge() sorts the result by the
# key columns by default, which is why the preview starts at ABW.
data <- merge(gdp, population, by = c("Country.Code", "Year"))
head(data, 10)

##    Country.Code Year Country.Name.x    Value.x Country.Name.y Value.y
## 1           ABW 1986          Aruba  405586592          Aruba   59931
## 2           ABW 1987          Aruba  487709497          Aruba   59159
## 3           ABW 1988          Aruba  596648045          Aruba   59331
## 4           ABW 1989          Aruba  695530726          Aruba   60443
## 5           ABW 1990          Aruba  764804469          Aruba   62753
## 6           ABW 1991          Aruba  872067039          Aruba   65896
## 7           ABW 1992          Aruba  958659218          Aruba   69005
## 8           ABW 1993          Aruba 1083240223          Aruba   73685
## 9           ABW 1994          Aruba 1245810056          Aruba   77595
## 10          ABW 1995          Aruba 1320670391          Aruba   79805

# The same inner join with dplyr. The preview starts at Afghanistan, the first
# country in gdp, rather than being sorted by the key columns.
data <- inner_join(gdp, population, by = c("Country.Code", "Year"))
head(data, 10)

##    Country.Name.x Country.Code Year     Value.x Country.Name.y  Value.y
## 1     Afghanistan          AFG 2000  3521418060    Afghanistan 20130327
## 2     Afghanistan          AFG 2001  2813571754    Afghanistan 20284307
## 3     Afghanistan          AFG 2002  3825701439    Afghanistan 21378117
## 4     Afghanistan          AFG 2003  4520946819    Afghanistan 22733049
## 5     Afghanistan          AFG 2004  5224896719    Afghanistan 23560654
## 6     Afghanistan          AFG 2005  6203256539    Afghanistan 24404567
## 7     Afghanistan          AFG 2006  6971758282    Afghanistan 25424094
## 8     Afghanistan          AFG 2007  9747886187    Afghanistan 25909852
## 9     Afghanistan          AFG 2008 10109297048    Afghanistan 26482622
## 10    Afghanistan          AFG 2009 12416152732    Afghanistan 27466101

# Left outer join with base merge(): all.x = TRUE keeps every gdp row and fills
# the population columns with NA where there is no match.
data <- merge(gdp, population, by = c("Country.Code", "Year"), all.x = TRUE)
head(data, 10)

##    Country.Code Year Country.Name.x    Value.x Country.Name.y Value.y
## 1           ABW 1986          Aruba  405586592          Aruba   59931
## 2           ABW 1987          Aruba  487709497          Aruba   59159
## 3           ABW 1988          Aruba  596648045          Aruba   59331
## 4           ABW 1989          Aruba  695530726          Aruba   60443
## 5           ABW 1990          Aruba  764804469          Aruba   62753
## 6           ABW 1991          Aruba  872067039          Aruba   65896
## 7           ABW 1992          Aruba  958659218          Aruba   69005
## 8           ABW 1993          Aruba 1083240223          Aruba   73685
## 9           ABW 1994          Aruba 1245810056          Aruba   77595
## 10          ABW 1995          Aruba 1320670391          Aruba   79805

# dplyr equivalent of the left outer join: every gdp row is kept.
data <- left_join(gdp, population, by = c("Country.Code", "Year"))
head(data, 10)

##    Country.Name.x Country.Code Year     Value.x Country.Name.y  Value.y
## 1     Afghanistan          AFG 2000  3521418060    Afghanistan 20130327
## 2     Afghanistan          AFG 2001  2813571754    Afghanistan 20284307
## 3     Afghanistan          AFG 2002  3825701439    Afghanistan 21378117
## 4     Afghanistan          AFG 2003  4520946819    Afghanistan 22733049
## 5     Afghanistan          AFG 2004  5224896719    Afghanistan 23560654
## 6     Afghanistan          AFG 2005  6203256539    Afghanistan 24404567
## 7     Afghanistan          AFG 2006  6971758282    Afghanistan 25424094
## 8     Afghanistan          AFG 2007  9747886187    Afghanistan 25909852
## 9     Afghanistan          AFG 2008 10109297048    Afghanistan 26482622
## 10    Afghanistan          AFG 2009 12416152732    Afghanistan 27466101

# Right outer join with base merge(): all.y = TRUE keeps every population row.
# Years without GDP data show NA in the gdp columns (e.g. ABW 1960 below).
data <- merge(gdp, population, by = c("Country.Code", "Year"), all.y = TRUE)
head(data, 10)

##    Country.Code Year Country.Name.x Value.x Country.Name.y Value.y
## 1           ABW 1960           <NA>      NA          Aruba   54922
## 2           ABW 1961           <NA>      NA          Aruba   55578
## 3           ABW 1962           <NA>      NA          Aruba   56320
## 4           ABW 1963           <NA>      NA          Aruba   57002
## 5           ABW 1964           <NA>      NA          Aruba   57619
## 6           ABW 1965           <NA>      NA          Aruba   58190
## 7           ABW 1966           <NA>      NA          Aruba   58694
## 8           ABW 1967           <NA>      NA          Aruba   58990
## 9           ABW 1968           <NA>      NA          Aruba   59069
## 10          ABW 1969           <NA>      NA          Aruba   59052

# dplyr equivalent of the right outer join: every population row is kept.
data <- right_join(gdp, population, by = c("Country.Code", "Year"))
head(data, 10)

##    Country.Name.x Country.Code Year     Value.x Country.Name.y  Value.y
## 1     Afghanistan          AFG 2000  3521418060    Afghanistan 20130327
## 2     Afghanistan          AFG 2001  2813571754    Afghanistan 20284307
## 3     Afghanistan          AFG 2002  3825701439    Afghanistan 21378117
## 4     Afghanistan          AFG 2003  4520946819    Afghanistan 22733049
## 5     Afghanistan          AFG 2004  5224896719    Afghanistan 23560654
## 6     Afghanistan          AFG 2005  6203256539    Afghanistan 24404567
## 7     Afghanistan          AFG 2006  6971758282    Afghanistan 25424094
## 8     Afghanistan          AFG 2007  9747886187    Afghanistan 25909852
## 9     Afghanistan          AFG 2008 10109297048    Afghanistan 26482622
## 10    Afghanistan          AFG 2009 12416152732    Afghanistan 27466101

# Full outer join with base merge(): all = TRUE keeps the rows of both tables,
# with NA wherever one side has no matching Country.Code/Year pair.
data <- merge(gdp, population, by = c("Country.Code", "Year"), all = TRUE)
head(data, 10)

##    Country.Code Year Country.Name.x Value.x Country.Name.y Value.y
## 1           ABW 1960           <NA>      NA          Aruba   54922
## 2           ABW 1961           <NA>      NA          Aruba   55578
## 3           ABW 1962           <NA>      NA          Aruba   56320
## 4           ABW 1963           <NA>      NA          Aruba   57002
## 5           ABW 1964           <NA>      NA          Aruba   57619
## 6           ABW 1965           <NA>      NA          Aruba   58190
## 7           ABW 1966           <NA>      NA          Aruba   58694
## 8           ABW 1967           <NA>      NA          Aruba   58990
## 9           ABW 1968           <NA>      NA          Aruba   59069
## 10          ABW 1969           <NA>      NA          Aruba   59052

# dplyr equivalent of the full outer join: rows from both tables are kept.
data <- full_join(gdp, population, by = c("Country.Code", "Year"))
head(data, 10)

##    Country.Name.x Country.Code Year     Value.x Country.Name.y  Value.y
## 1     Afghanistan          AFG 2000  3521418060    Afghanistan 20130327
## 2     Afghanistan          AFG 2001  2813571754    Afghanistan 20284307
## 3     Afghanistan          AFG 2002  3825701439    Afghanistan 21378117
## 4     Afghanistan          AFG 2003  4520946819    Afghanistan 22733049
## 5     Afghanistan          AFG 2004  5224896719    Afghanistan 23560654
## 6     Afghanistan          AFG 2005  6203256539    Afghanistan 24404567
## 7     Afghanistan          AFG 2006  6971758282    Afghanistan 25424094
## 8     Afghanistan          AFG 2007  9747886187    Afghanistan 25909852
## 9     Afghanistan          AFG 2008 10109297048    Afghanistan 26482622
## 10    Afghanistan          AFG 2009 12416152732    Afghanistan 27466101

Assignment 3

How group_by() works

group_by() is used to split data into groups so you can perform calculations separately for each group. %>% is called the pipe operator in R. The pipe passes the result from one step into the next step.

# Revenue per listed company for 2025 and 2024. The grouping column is named
# `country` although its values are company names.
sales <- data.frame(
  country = rep(
    c("BK Group Plc", "MTN Rwanda", "Bralirwa", "Cimerwa", "Equity Bank Ltd"),
    times = 2
  ),
  year = rep(c(2025, 2024), each = 5),
  revenue = c(281, 295, 263, 155, 36, 280, 281, 275, 134, 40)
)

# One output row per company, with revenue summed over both years.
sales %>%
  group_by(country) %>%
  summarise(total_revenue = sum(revenue))

## # A tibble: 5 × 2
##   country         total_revenue
##   <chr>                   <dbl>
## 1 BK Group Plc              561
## 2 Bralirwa                  538
## 3 Cimerwa                   289
## 4 Equity Bank Ltd            76
## 5 MTN Rwanda                576

Assignment 4

How to use trace() and recover()

They help you:

see where an error happened
inspect function calls
debug complex code

These are built into base R.

trace() in R is a debugging tool used to temporarily insert code into an existing function without modifying the original source code.

It is useful for:

Debugging package functions
Monitoring function calls
Inspecting arguments
Logging execution flow

Basic Idea

Example:

trace(mean, quote(print('Trace called')))

## Tracing function "mean" in package "base"

## [1] "mean"

Now whenever mean() is called, R first runs the traced expression (here it prints 'Trace called') and then evaluates the function as usual.

mean(c(1,2,3))

## Tracing mean(c(1, 2, 3)) on entry 
## [1] "Trace called"

## [1] 2

Remove Trace

untrace(mean)

## Untracing function "mean" in package "base"

Check trace removed

mean(c(1,2,3))

## [1] 2

2. recover()

recover() lets you browse the function calls that were active when the error happened and inspect their variables interactively.

Enable Recover Mode

options(error = recover)

Now whenever an error occurs, R enters debugging mode.

Example

# Divide `x` by `y`.
#
# Args:
#   x: Numerator.
#   y: Denominator.
# Returns: `x / y`, returned visibly so the quotient prints when the function
#   is called at the console (the earlier version ended on an assignment and
#   therefore returned its value invisibly).
# A non-numeric argument still fails inside `x / y`, which is the error the
# recover() example relies on.
divide <- function(x, y) {
  x / y
}
divide(1, 'a')

## Error in x/y: non-numeric argument to binary operator

What Happens

In an interactive session, instead of stopping normally, R lists the active function calls and prompts: Enter a frame number, or 0 to exit

Assignment 5

lapply()
sapply()
vapply()

These functions are used for iteration and functional programming in R.

1. `lapply()` — Returns a List

# lapply() returns a list with one result per input element.
numbers <- list(1, 2, 3, 4)
result <- lapply(X = numbers, FUN = function(value) value * 2)
print(result)

## [[1]]
## [1] 2
## 
## [[2]]
## [1] 4
## 
## [[3]]
## [1] 6
## 
## [[4]]
## [1] 8

2. `sapply()` — Simplifies Output

sapply() simplifies the result into vectors, matrices, or arrays whenever possible.

Example — Producing a Vector

# Each call yields a single number, so sapply() collapses the results into a
# plain numeric vector.
numbers <- list(1, 2, 3, 4)
result <- sapply(X = numbers, FUN = function(value) value * 2)
print(result)

## [1] 2 4 6 8

Example — Producing a Matrix

# Each call yields two numbers (the value and its square), so sapply() binds
# the results column-wise into a 2 x 3 matrix.
numbers <- list(1, 2, 3)
result <- sapply(X = numbers, FUN = function(value) c(value, value^2))
print(result)

##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    1    4    9

3. `vapply()` — Safer Version of `sapply()`

vapply() requires specifying the expected output type.

Syntax

vapply(X, FUN, FUN.VALUE)

Example — Numeric Vector

# FUN.VALUE = numeric(1) declares that every call must return exactly one
# number; vapply() raises an error otherwise.
numbers <- list(1, 2, 3, 4)
result <- vapply(X = numbers, FUN = function(value) value * 2, FUN.VALUE = numeric(1))
print(result)

## [1] 2 4 6 8

Difference Between `sapply()` and `vapply()`

sapply()	vapply()
Automatically guesses output type	Requires fixed output type
Easier to write	Safer and more predictable
May return unexpected structures	Consistent output

Assignment 6

Mean
Median
Mode
Quantiles

without using R built-in statistical functions such as:

mean()
median()
quantile()
Mode() (from add-on packages; base R has no built-in statistical mode function)

We will use: - sapply() - lapply() - vapply()

and core R logic.

Sample Data

numbers <- c(10,20,30,40,50,20,30,30)

1. Mean Function

Formula

\[ Mean = \frac{\sum x}{n} \]

Implementation

# Arithmetic mean computed without mean().
#
# Args:
#   x: A numeric, integer or logical vector, or a list of single numbers.
# Returns: The sum of the elements divided by their count. Empty input gives
#   NaN (0 / 0) and any NA element makes the result NA.
my_mean <- function(x) {
  # vapply() checks that every element is a single number and flattens list
  # input. It replaces the earlier `<<-` accumulation inside sapply(), which
  # used sapply() purely for its side effect.
  values <- vapply(x, function(value) value, numeric(1))
  sum(values) / length(values)
}

Test

my_mean(numbers)

## [1] 28.75

2. Custom Median Function

Steps

Sort values
Find middle position
If even:
- average middle two values
If odd:
- return middle value

Implementation

# Median computed without median().
#
# Args:
#   x: A numeric vector.
# Returns: The middle value of the sorted data, or the average of the two
#   middle values when the count is even. Returns NA_real_ when there are no
#   values to work with.
my_median <- function(x) {
  # sort() drops NA values by default, so they are ignored here.
  sorted <- sort(x)
  n <- length(sorted)
  if (n == 0) {
    # Previously this fell into the even branch and returned numeric(0).
    return(NA_real_)
  }
  if (n %% 2 == 0) {
    upper_mid <- n / 2 + 1
    (sorted[upper_mid - 1] + sorted[upper_mid]) / 2
  } else {
    sorted[(n + 1) / 2]
  }
}

Test

my_median(numbers)

## [1] 30

3. Custom Mode Function

Mode

The most frequently occurring value.

Implementation

# Most frequent value of a vector.
#
# Args:
#   x: An atomic vector (numeric, character, ...).
# Returns: The value that occurs most often. When several values tie, the one
#   that appears first in `x` wins. NA values are ignored; the earlier version
#   stopped with an error on them because `if (item == value)` received NA.
#   Empty input gives an empty vector.
my_mode <- function(x) {
  x <- x[!is.na(x)]
  candidates <- unique(x)
  # One vectorised count per distinct value replaces the nested sapply() that
  # incremented a counter through `<<-`.
  frequencies <- vapply(
    candidates,
    function(value) sum(x == value),
    integer(1),
    USE.NAMES = FALSE
  )
  # which.max() returns the first maximum, which gives the tie rule above.
  candidates[which.max(frequencies)]
}

Test

my_mode(numbers)

## [1] 30

4. Custom Quantile Function

Quantiles

Quantiles divide ordered data into intervals.

Example: - 0.25 = first quartile - 0.50 = median - 0.75 = third quartile

Formula

\[ Position = (n - 1)p + 1 \]

where: - \(n\) = number of observations - \(p\) = probability

Implementation

# Quantiles by linear interpolation between order statistics, computed
# without quantile().
#
# Args:
#   x: A numeric vector. NA values are dropped by sort().
#   probs: Numeric vector of probabilities, each between 0 and 1.
# Returns: A numeric vector with one quantile per element of `probs`, using
#   position = (n - 1) * p + 1 in the sorted data. Returns NA for every
#   probability when `x` has no values.
# Raises: An error when `probs` is not numeric, contains NA, or lies outside
#   [0, 1]. Previously such values indexed outside the data and produced NA
#   or, for negative probabilities, results of the wrong length.
my_quantile <- function(x, probs) {
  if (!is.numeric(probs) || anyNA(probs) || any(probs < 0 | probs > 1)) {
    stop("'probs' must be numeric values between 0 and 1", call. = FALSE)
  }
  sorted <- sort(x)
  n <- length(sorted)
  if (n == 0) {
    return(rep(NA_real_, length(probs)))
  }
  # vapply() guarantees a numeric vector even when `probs` is empty, where
  # sapply() would return a list.
  vapply(probs, function(p) {
    position <- (n - 1) * p + 1
    lower <- floor(position)
    upper <- ceiling(position)
    if (lower == upper) {
      # The position falls exactly on an observation.
      return(sorted[lower])
    }
    sorted[lower] + (position - lower) * (sorted[upper] - sorted[lower])
  }, numeric(1))
}

Test

my_quantile(numbers,probs = c(0.25, 0.5, 0.75))

## [1] 20.0 30.0 32.5

ADVENTIST UNIVERSITY OF CENTRAL AFRICA

MASTER OF IT IN BIGDATA ANALYTICS

Daniel DUSHIMIRIMANA

20251MBI028

ALL ASSIGNMENTS OF R PROGRAMMING

2026-05-24

Assignment 1

Extracting from a CSV file

Extracting from an Excel file

Extracting from database

Assignment 2

Merging datasets from 2 to 3 variables

Assignment 3

How group_by() works

Assignment 4

How to use trace() and recover()

Basic Idea

Remove Trace

Check trace removed

2. recover()

Assignment 5

1. lapply() — Returns a List

2. sapply() — Simplifies Output

Example — Producing a Vector

Example — Producing a Matrix

3. vapply() — Safer Version of sapply()

Syntax

Example — Numeric Vector

Difference Between sapply() and vapply()

Assignment 6

Sample Data

1. Mean Function

Formula

Implementation

Test

2. Custom Median Function

Steps

Implementation

Test

3. Custom Mode Function

Mode

Implementation

Test

4. Custom Quantile Function

Quantiles

Formula

Implementation

Test

1. `lapply()` — Returns a List

2. `sapply()` — Simplifies Output

3. `vapply()` — Safer Version of `sapply()`

Difference Between `sapply()` and `vapply()`