R Markdown
Assignment 1
# Setting the Folder where my CSV File is located
library(readr)
#Importing the data
my_data <- read.csv("C:/Users/user/Downloads/my_data.csv")
print(my_data)
## Transaction.ID Client.Type Monthly.Revenue Credit.Score Loan.Status
## 1 TXN1001 SME 450,000,000 720 Current
## 2 TXN1002 Corporate 1,200,000,000 810 Current
## 3 TXN1003 SME 120,000,000 640 Delinquent
## 4 TXN1004 SME 310,000,000 690 Current
## 5 TXN1005 Retail 5,000,000 750 Current
## 6 TXN1006 SME 280,000,000 580 Default
## 7 TXN1007 Corporate 2,500,000,000 840 Current
## 8 TXN1008 SME 150,000,000 610 Delinquent
## Days.Past.Due
## 1 0
## 2 0
## 3 45
## 4 2
## 5 0
## 6 95
## 7 0
## 8 15
Assignment 2 and 3
2. Merging Datasets
3. Group by() %>% Summarise()
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
suppliers <- read.csv("C:/Users/user/Downloads/book_m.csv")
suppliers
## supplier_id company_name location
## 1 101 Bralirwa Gisenyi
## 2 102 Rubis Kigali
## 3 103 MTN Kigali
## 4 104 Inyange Masaka
invoices <- data.frame(invoice_no = c("INV_01", "INV_02", "INV_03", "INV_04"), supplier_id = c(101, 101, 102, 105), amount = c(1500000, 2300000, 5000000, 750000))
invoices
## invoice_no supplier_id amount
## 1 INV_01 101 1500000
## 2 INV_02 101 2300000
## 3 INV_03 102 5000000
## 4 INV_04 105 750000
final_report <- suppliers %>%
left_join(invoices, by = "supplier_id")
final_report
## supplier_id company_name location invoice_no amount
## 1 101 Bralirwa Gisenyi INV_01 1500000
## 2 101 Bralirwa Gisenyi INV_02 2300000
## 3 102 Rubis Kigali INV_03 5000000
## 4 103 MTN Kigali <NA> NA
## 5 104 Inyange Masaka <NA> NA
# Assignment 3
summary_report <- final_report %>%
group_by(company_name) %>%
summarise(
total_invoiced = sum(amount, na.rm = TRUE),
number_of_invoices = n()
)
summary_report
## # A tibble: 4 × 3
## company_name total_invoiced number_of_invoices
## <chr> <dbl> <int>
## 1 Bralirwa 3800000 2
## 2 Inyange 0 1
## 3 MTN 0 1
## 4 Rubis 5000000 1
Assignment 4.a.
4. Trace()
multiplication <- function(a, b){
return(a * b)
}
trace(multiplication, tracer = quote(print("Spy report: The function just started!")))
## [1] "multiplication"
multiplication(5, 4)
## Tracing multiplication(5, 4) on entry
## [1] "Spy report: The function just started!"
## [1] 20
multiplication(12, 30)
## Tracing multiplication(12, 30) on entry
## [1] "Spy report: The function just started!"
## [1] 360
multiplication(54, 23)
## Tracing multiplication(54, 23) on entry
## [1] "Spy report: The function just started!"
## [1] 1242
# Inject 'browser()' at the start of the function
trace(multiplication, tracer = browser)
## [1] "multiplication"
multiplication(24, 12)
## Tracing multiplication(24, 12) on entry
## Called from: eval(expr, p)
## debug: {
## return(a * b)
## }
## debug: return(a * b)
## [1] 288
#stop tracking the function
untrace(multiplication)
Assignment 5
5. Lappy Functions {lappy(),sapply(),vapply(),mapply()}
#recover() is an powerful debugging tool in R that allows you to inspect the state of your code exactly when and where an error occurs.
#Unlike simple error messages, it pauses execution and lets you "browse" the environment of any function in the active call stack.
options(error = recover)
#Calculating a simple percentage.
calculate_percentage <- function(value, total){
if (total == 0){
stop("cannot divide by zero!")# this will force an error
}
result <- value / total
return(result)
}
#calculate_percentage(100, 0)
# lapply(), sapply(), vapply(), and mapply(), the key distinction is about the input and output type and safety.
fruits <- list("banana","mango","apple","kiwi","pear")
#lapply function can be used when you want a list back no matter what.
lapply(fruits, nchar)
## [[1]]
## [1] 6
##
## [[2]]
## [1] 5
##
## [[3]]
## [1] 5
##
## [[4]]
## [1] 4
##
## [[5]]
## [1] 4
#sapply function can be used to simplify the output automatically.
sapply(fruits, nchar)
## [1] 6 5 5 4 4
#vapply function can be used as sapply but with a declared output type.
vapply(fruits, nchar, FUN.VALUE = integer(1))
## [1] 6 5 5 4 4
#mapply function, applies a function over multiple inputs in parallel
names <- c("Danny","Judith","Yves")
titles <- c("Eng.","Dr.","Chief ")
mapply(function(title, name) paste(title, name), titles,names)
## Eng. Dr. Chief
## "Eng. Danny" "Dr. Judith" "Chief Yves"
#incase of a data.frame
students_df <- data.frame(
Name = c("Alice", "Bob", "Carol"),
Math = c(80, 70, 95),
English = c(92, 57, 98),
Science = c(78, 60, 100)
)
weights <- c(Math = 0.4, English = 0.3, Science = 0.3)
#in case of lapply
score_co <- students_df[, c("Math","English","Science")]
# We select only the numeric columns and apply across rows using split:
#split() is the key here it breaks the data.frame into a list of rows so lapply can walk over each student.
lapply(split(score_co, students_df$Name), function(scores) {
list(mean = mean(as.numeric(scores)),
min = min(as.numeric(scores)),
max = max(as.numeric(scores)))
})
## $Alice
## $Alice$mean
## [1] 83.33333
##
## $Alice$min
## [1] 78
##
## $Alice$max
## [1] 92
##
##
## $Bob
## $Bob$mean
## [1] 62.33333
##
## $Bob$min
## [1] 57
##
## $Bob$max
## [1] 70
##
##
## $Carol
## $Carol$mean
## [1] 97.66667
##
## $Carol$min
## [1] 95
##
## $Carol$max
## [1] 100
students_df
## Name Math English Science
## 1 Alice 80 92 78
## 2 Bob 70 57 60
## 3 Carol 95 98 100
#in case of sapply
#same as lapply we use split to break it down into a list
students_df$Average <- sapply(
split(score_co, students_df$Name),
function(scores) mean(as.numeric(scores))
)
students_df
## Name Math English Science Average
## 1 Alice 80 92 78 83.33333
## 2 Bob 70 57 60 62.33333
## 3 Carol 95 98 100 97.66667
#in case of vapply
#you have to select the output or indicate what is the expectation
students_df$Passed <- vapply(
split(score_co, students_df$Name),
function(scores) mean(as.numeric(scores)) >= 65,
FUN.VALUE = logical(1)
)
students_df
## Name Math English Science Average Passed
## 1 Alice 80 92 78 83.33333 TRUE
## 2 Bob 70 57 60 62.33333 FALSE
## 3 Carol 95 98 100 97.66667 TRUE
#in case of mapply
#mapply walks down three columns in parallel exactly what it's built for.
#Each row feeds Math, English, and Science simultaneously into the function.
students_df$Weighted_Avg <- mapply(
function(math, english, science) {
sum(c(math, english, science) * weights)
},
students_df$Math,
students_df$English,
students_df$Science
)
students_df
## Name Math English Science Average Passed Weighted_Avg
## 1 Alice 80 92 78 83.33333 TRUE 83.0
## 2 Bob 70 57 60 62.33333 FALSE 63.1
## 3 Carol 95 98 100 97.66667 TRUE 97.4
Assignment 6
5. Summary Statistics
#Function that calculates summary statistics and its application
#Creating custom function for summary statistics
# 1. Mean
mean_d <- function(x, na.rm = TRUE){
if (na.rm) {
x <- x[!is.na(x)]
}
if (length(x) == 0) {
return(NA)
}
sum(x) / length(x)
}
# 2. Median
median_d <- function(x, na.rm = TRUE){
if (na.rm) {
x <- x[!is.na(x)]
}
if (length(x) == 0) {
return(NA)
}
sorted <- sort(x)
if (length(x) %% 2 == 0) {
return((sorted[length(x)/2] + sorted[length(x)/2 + 1]) / 2)
}else{
return(sorted[(length(x) + 1) / 2])
}
}
# 3. Interquartile range
iqr_d <- function(x, na.rm = TRUE){
if (na.rm) {
x <- x[!is.na(x)]
}
q1 <- median_d(x[x <= median_d(x)])
q3 <- median_d(x[x >= median_d(x)])
q3 - q1
}
# 4. Variance
var_d <- function(x, na.rm = TRUE){
if (na.rm) {
x <- x[!is.na(x)]
}
sum((x - mean_d(x))^2) / (length(x) - 1)
}
#5. Standard Deviation
std_d <- function(x, na.rm = TRUE){
sqrt(var_d(x, na.rm = na.rm))
}
#6 range
range_d <- function(x, na.rm = TRUE){
if(na.rm){
x <- x[!is.na(x)]
}
max(x) - min(x)
}
#7. Putting it all together
summary_d <- function(x){
list(
mean = mean_d(x),
median = median_d(x),
variance = var_d(x),
sd = std_d(x),
range = range_d(x),
iqr = iqr_d(x),
min = min(x),
max = max(x)
)
}
# creating a vector
ages <- c(14,32,45,29,25,11,20,32,44,50,41,60)
#Applying on our vector
summary_d(ages)
## $mean
## [1] 33.58333
##
## $median
## [1] 32
##
## $variance
## [1] 221.7197
##
## $sd
## [1] 14.89026
##
## $range
## [1] 49
##
## $iqr
## [1] 19
##
## $min
## [1] 11
##
## $max
## [1] 60