R Markdown

Assignment 1

# Setting the Folder where my CSV File is located
library(readr)

#Importing the data 
my_data <- read.csv("C:/Users/user/Downloads/my_data.csv")

print(my_data)
##   Transaction.ID Client.Type Monthly.Revenue Credit.Score Loan.Status
## 1        TXN1001         SME     450,000,000          720     Current
## 2        TXN1002   Corporate   1,200,000,000          810     Current
## 3        TXN1003         SME     120,000,000          640  Delinquent
## 4        TXN1004         SME     310,000,000          690     Current
## 5        TXN1005      Retail       5,000,000          750     Current
## 6        TXN1006         SME     280,000,000          580     Default
## 7        TXN1007   Corporate   2,500,000,000          840     Current
## 8        TXN1008         SME     150,000,000          610  Delinquent
##   Days.Past.Due
## 1             0
## 2             0
## 3            45
## 4             2
## 5             0
## 6            95
## 7             0
## 8            15

Assignment 2 and 3

2. Merging Datasets

3. Group by() %>% Summarise()

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
suppliers <- read.csv("C:/Users/user/Downloads/book_m.csv")
suppliers
##   supplier_id company_name location
## 1         101     Bralirwa  Gisenyi
## 2         102        Rubis   Kigali
## 3         103          MTN   Kigali
## 4         104      Inyange   Masaka
invoices <- data.frame(invoice_no = c("INV_01", "INV_02", "INV_03", "INV_04"), supplier_id = c(101, 101, 102, 105), amount = c(1500000, 2300000, 5000000, 750000))
invoices
##   invoice_no supplier_id  amount
## 1     INV_01         101 1500000
## 2     INV_02         101 2300000
## 3     INV_03         102 5000000
## 4     INV_04         105  750000
final_report <- suppliers %>%
  left_join(invoices, by = "supplier_id")
final_report
##   supplier_id company_name location invoice_no  amount
## 1         101     Bralirwa  Gisenyi     INV_01 1500000
## 2         101     Bralirwa  Gisenyi     INV_02 2300000
## 3         102        Rubis   Kigali     INV_03 5000000
## 4         103          MTN   Kigali       <NA>      NA
## 5         104      Inyange   Masaka       <NA>      NA
# Assignment 3
summary_report <- final_report %>% 
  group_by(company_name) %>%
  summarise(
    total_invoiced = sum(amount, na.rm = TRUE),
    number_of_invoices = n()
  )
summary_report
## # A tibble: 4 × 3
##   company_name total_invoiced number_of_invoices
##   <chr>                 <dbl>              <int>
## 1 Bralirwa            3800000                  2
## 2 Inyange                   0                  1
## 3 MTN                       0                  1
## 4 Rubis               5000000                  1

Assignment 4.a.

4. Trace()

multiplication <- function(a, b){
  return(a * b)
}

trace(multiplication, tracer = quote(print("Spy report: The function just started!")))
## [1] "multiplication"
multiplication(5, 4)
## Tracing multiplication(5, 4) on entry 
## [1] "Spy report: The function just started!"
## [1] 20
multiplication(12, 30)
## Tracing multiplication(12, 30) on entry 
## [1] "Spy report: The function just started!"
## [1] 360
multiplication(54, 23)
## Tracing multiplication(54, 23) on entry 
## [1] "Spy report: The function just started!"
## [1] 1242
# Inject 'browser()' at the start of the function
trace(multiplication, tracer = browser)
## [1] "multiplication"
multiplication(24, 12)
## Tracing multiplication(24, 12) on entry 
## Called from: eval(expr, p)
## debug: {
##     return(a * b)
## }
## debug: return(a * b)
## [1] 288
#stop tracking the function
untrace(multiplication)

Assignment 5

5. Lappy Functions {lappy(),sapply(),vapply(),mapply()}

#recover() is an powerful debugging tool in R that allows you to inspect the state of your code exactly when and where an error occurs. 
#Unlike simple error messages, it pauses execution and lets you "browse" the environment of any function in the active call stack.
options(error = recover)

#Calculating a simple percentage.
calculate_percentage <- function(value, total){
  if (total == 0){
    stop("cannot divide by zero!")# this will force an error
  }
  result <- value / total
  return(result)
}

#calculate_percentage(100, 0)
# lapply(), sapply(), vapply(), and mapply(), the key distinction is about the input and output type and safety.

fruits <- list("banana","mango","apple","kiwi","pear")

#lapply function can be used when you want a list back no matter what.
lapply(fruits, nchar)
## [[1]]
## [1] 6
## 
## [[2]]
## [1] 5
## 
## [[3]]
## [1] 5
## 
## [[4]]
## [1] 4
## 
## [[5]]
## [1] 4
#sapply function can be used to simplify the output automatically.
sapply(fruits, nchar)
## [1] 6 5 5 4 4
#vapply function can be used as sapply but with a declared output type.
vapply(fruits, nchar, FUN.VALUE = integer(1))
## [1] 6 5 5 4 4
#mapply function, applies a function over multiple inputs in parallel

names <- c("Danny","Judith","Yves")
titles <- c("Eng.","Dr.","Chief ")

mapply(function(title, name) paste(title, name), titles,names)
##          Eng.           Dr.        Chief  
##  "Eng. Danny"  "Dr. Judith" "Chief  Yves"
#incase of a data.frame
students_df <- data.frame(
  Name    = c("Alice", "Bob", "Carol"),
  Math    = c(80, 70, 95),
  English = c(92, 57, 98),
  Science = c(78, 60, 100)
)

weights <- c(Math = 0.4, English = 0.3, Science = 0.3)

#in case of lapply
score_co <- students_df[, c("Math","English","Science")]
# We select only the numeric columns and apply across rows using split:
#split() is the key here it breaks the data.frame into a list of rows so lapply can walk over each student.
lapply(split(score_co, students_df$Name), function(scores) {
  list(mean = mean(as.numeric(scores)),
       min = min(as.numeric(scores)),
       max = max(as.numeric(scores)))
})
## $Alice
## $Alice$mean
## [1] 83.33333
## 
## $Alice$min
## [1] 78
## 
## $Alice$max
## [1] 92
## 
## 
## $Bob
## $Bob$mean
## [1] 62.33333
## 
## $Bob$min
## [1] 57
## 
## $Bob$max
## [1] 70
## 
## 
## $Carol
## $Carol$mean
## [1] 97.66667
## 
## $Carol$min
## [1] 95
## 
## $Carol$max
## [1] 100
students_df
##    Name Math English Science
## 1 Alice   80      92      78
## 2   Bob   70      57      60
## 3 Carol   95      98     100
#in case of sapply
#same as lapply we use split to break it down into a list
students_df$Average <- sapply(
  split(score_co, students_df$Name),
  function(scores) mean(as.numeric(scores))
)
students_df
##    Name Math English Science  Average
## 1 Alice   80      92      78 83.33333
## 2   Bob   70      57      60 62.33333
## 3 Carol   95      98     100 97.66667
#in case of vapply
#you have to select the output or indicate what is the expectation
students_df$Passed <- vapply(
  split(score_co, students_df$Name),
  function(scores) mean(as.numeric(scores)) >= 65,
  FUN.VALUE = logical(1)
)
students_df
##    Name Math English Science  Average Passed
## 1 Alice   80      92      78 83.33333   TRUE
## 2   Bob   70      57      60 62.33333  FALSE
## 3 Carol   95      98     100 97.66667   TRUE
#in case of mapply
#mapply walks down three columns in parallel exactly what it's built for. 
#Each row feeds Math, English, and Science simultaneously into the function.
students_df$Weighted_Avg <- mapply(
  function(math, english, science) {
    sum(c(math, english, science) * weights)
  },
  students_df$Math,
  students_df$English,
  students_df$Science
)
students_df
##    Name Math English Science  Average Passed Weighted_Avg
## 1 Alice   80      92      78 83.33333   TRUE         83.0
## 2   Bob   70      57      60 62.33333  FALSE         63.1
## 3 Carol   95      98     100 97.66667   TRUE         97.4

Assignment 6

5. Summary Statistics

#Function that calculates summary statistics and its application

#Creating custom function for summary statistics

# 1. Mean
mean_d <- function(x, na.rm = TRUE){
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  if (length(x) == 0) {
    return(NA)
  }
  sum(x) / length(x)
}

# 2. Median
median_d <- function(x, na.rm = TRUE){
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  if (length(x) == 0) {
    return(NA)
  }
  sorted <- sort(x)
  if (length(x) %% 2 == 0) {
    return((sorted[length(x)/2] + sorted[length(x)/2 + 1]) / 2)
  }else{
    return(sorted[(length(x) + 1) / 2])
  }
}

# 3. Interquartile range
iqr_d <- function(x, na.rm = TRUE){
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  q1 <- median_d(x[x <= median_d(x)])
  q3 <- median_d(x[x >= median_d(x)])
  q3 - q1
}

# 4. Variance
var_d <- function(x, na.rm = TRUE){
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  sum((x - mean_d(x))^2) / (length(x) - 1)
}

#5. Standard Deviation
std_d <- function(x, na.rm = TRUE){
  sqrt(var_d(x, na.rm = na.rm))
}

#6 range
range_d <- function(x, na.rm = TRUE){
  if(na.rm){
    x <- x[!is.na(x)]
  }
  max(x) - min(x)
}

#7. Putting it all together
summary_d <- function(x){
  list(
    mean = mean_d(x),
    median = median_d(x),
    variance = var_d(x),
    sd = std_d(x),
    range = range_d(x),
    iqr = iqr_d(x),
    min = min(x),
    max = max(x)
  )
}
# creating a vector
ages <- c(14,32,45,29,25,11,20,32,44,50,41,60)

#Applying on our vector
summary_d(ages)
## $mean
## [1] 33.58333
## 
## $median
## [1] 32
## 
## $variance
## [1] 221.7197
## 
## $sd
## [1] 14.89026
## 
## $range
## [1] 49
## 
## $iqr
## [1] 19
## 
## $min
## [1] 11
## 
## $max
## [1] 60