# read.table reads any delimiter-separated file
class_data <- read.table("data/class_data.txt", header = TRUE, sep = "\t")
head(class_data)
## studentID gender math_score it_score height weight
## 1 1 M 12 13 165 60
## 2 2 F 17 18 158 52
## 3 3 M 10 11 170 75
## 4 4 M 9 10 168 70
## 5 5 F NA 19 155 48
## 6 6 F 15 14 162 55
str(class_data)
## 'data.frame': 10 obs. of 6 variables:
## $ studentID : int 1 2 3 4 5 6 7 8 9 10
## $ gender : chr "M" "F" "M" "M" ...
## $ math_score: int 12 17 10 9 NA 15 20 13 11 18
## $ it_score : int 13 18 11 10 19 14 20 16 12 17
## $ height : int 165 158 170 168 155 162 175 160 172 157
## $ weight : int 60 52 75 70 48 55 80 54 78 50
# read.csv is a shortcut for read.table with sep=","
student_data <- read.csv("data/student_data.csv", header = TRUE)
head(student_data)
## studentID gender math_score it_score height weight bmi
## 1 1 M 12 13 1.65 60 22.0
## 2 2 F 17 18 1.58 52 20.8
## 3 3 M 10 11 1.70 75 26.0
## 4 4 M 9 10 1.68 70 24.8
## 5 5 F NA 19 1.55 48 19.9
## 6 6 F 15 14 1.62 55 20.9
str(student_data)
## 'data.frame': 10 obs. of 7 variables:
## $ studentID : int 1 2 3 4 5 6 7 8 9 10
## $ gender : chr "M" "F" "M" "M" ...
## $ math_score: int 12 17 10 9 NA 15 20 13 11 18
## $ it_score : int 13 18 11 10 19 14 20 16 12 17
## $ height : num 1.65 1.58 1.7 1.68 1.55 1.62 1.75 1.6 1.72 1.57
## $ weight : int 60 52 75 70 48 55 80 54 78 50
## $ bmi : num 22 20.8 26 24.8 19.9 20.9 26.1 21.1 26.4 20.3
install.packages("readxl")
library(readxl)
# datasample.xlsx — student sample data
excel_data <- read_excel("data/datasample.xlsx")
head(excel_data)
## # A tibble: 5 × 5
## Account `First Name` `Last Name` ID Mobile1
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 Jean John 111111 7888888
## 2 2 diane Dina 5555555 788888
## 3 3 diane Dina 5555555 788888
## 4 4 diane Dina 5555555 788888
## 5 5 diane Dina 5555555 788888
str(excel_data)
## tibble [5 × 5] (S3: tbl_df/tbl/data.frame)
## $ Account : num [1:5] 1 2 3 4 5
## $ First Name: chr [1:5] "Jean" "diane" "diane" "diane" ...
## $ Last Name : chr [1:5] "John" "Dina" "Dina" "Dina" ...
## $ ID : num [1:5] 111111 5555555 5555555 5555555 5555555
## $ Mobile1 : num [1:5] 7888888 788888 788888 788888 788888
# OpeningDate.xlsx — opening dates dataset
opening_data <- read_excel("data/OpeningDate.xlsx")
head(opening_data)
## # A tibble: 6 × 12
## `Open Date` `Date Birth` Gender Marital ...5 Status ...7
## <dttm> <dttm> <chr> <chr> <lgl> <chr> <lgl>
## 1 2026-01-12 00:00:00 1999-09-09 00:00:00 Male Single NA Active NA
## 2 2026-02-03 00:00:00 1990-01-01 00:00:00 Male Married NA Active NA
## 3 2026-02-03 00:00:00 1987-12-22 00:00:00 Female Married NA Active NA
## 4 2026-02-03 00:00:00 1990-12-28 00:00:00 Male Single NA Active NA
## 5 2026-02-03 00:00:00 1987-06-03 00:00:00 Female Married NA Active NA
## 6 2026-02-03 00:00:00 2000-01-01 00:00:00 Female Married NA Active NA
## # ℹ 5 more variables: Education <chr>, Post <chr>, Nature <chr>, Salary <dbl>,
## # `Valid Date` <dttm>
str(opening_data)
## tibble [2,492 × 12] (S3: tbl_df/tbl/data.frame)
## $ Open Date : POSIXct[1:2492], format: "2026-01-12" "2026-02-03" ...
## $ Date Birth: POSIXct[1:2492], format: "1999-09-09" "1990-01-01" ...
## $ Gender : chr [1:2492] "Male" "Male" "Female" "Male" ...
## $ Marital : chr [1:2492] "Single" "Married" "Married" "Single" ...
## $ ...5 : logi [1:2492] NA NA NA NA NA NA ...
## $ Status : chr [1:2492] "Active" "Active" "Active" "Active" ...
## $ ...7 : logi [1:2492] NA NA NA NA NA NA ...
## $ Education : chr [1:2492] "Bachelors Degree" "Diploma (A2 level)" "Bachelors Degree" "Diploma (A2 level)" ...
## $ Post : chr [1:2492] "Service and sales workers/Education" "Service and sales workers/Education" "Service and sales workers/Education" "Service and sales workers/Education" ...
## $ Nature : chr [1:2492] "Retail or Individuals Teacher" "Retail or Individuals Teacher" "Retail or Individuals Teacher" "Retail or Individuals Teacher" ...
## $ Salary : num [1:2492] 0 0 0 0 0 0 0 0 0 0 ...
## $ Valid Date: POSIXct[1:2492], format: "2026-01-20" "2026-02-23" ...
install.packages("haven")
library(haven)
# datasample.sav — SPSS format
spss_data <- read_sav("data/datasample.sav")
head(spss_data)
## # A tibble: 6 × 4
## ID Name Age Salary
## <dbl> <chr> <dbl> <dbl>
## 1 1 Jean 25 500
## 2 2 Alice 30 600
## 3 3 Bob 28 550
## 4 4 David 35 700
## 5 5 Emma 22 450
## 6 6 Frank 40 800
str(spss_data)
## tibble [10 × 4] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:10] 1 2 3 4 5 6 7 8 9 10
## ..- attr(*, "format.spss")= chr "F8.0"
## $ Name : chr [1:10] "Jean" "Alice" "Bob" "David" ...
## ..- attr(*, "format.spss")= chr "A5"
## $ Age : num [1:10] 25 30 28 35 22 40 27 33 29 31
## ..- attr(*, "format.spss")= chr "F8.2"
## $ Salary: num [1:10] 500 600 550 700 450 800 520 680 590 610
## ..- attr(*, "format.spss")= chr "F8.2"
# loan_datasample.dta — loan data in Stata format (100 loan records)
stata_data <- read_dta("data/loan_datasample.dta")
head(stata_data)
## # A tibble: 6 × 13
## LoanID ClientID ClientName Gender Age LoanAmount InterestRate LoanTermMonths
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 2415 Client 1 Female 23 10543900 12.4 48
## 2 1002 2463 Client 2 Female 49 3679871 10.6 48
## 3 1003 2179 Client 3 Male 56 12627095 12.4 36
## 4 1004 2526 Client 4 Male 64 19726106 18.9 48
## 5 1005 2195 Client 5 Female 42 13541045 18.1 24
## 6 1006 2938 Client 6 Female 62 8668860 17.5 36
## # ℹ 5 more variables: LoanType <chr>, DisbursementDate <date>, Status <chr>,
## # CollateralValue <dbl>, MonthlyInstallment <dbl>
str(stata_data)
## tibble [100 × 13] (S3: tbl_df/tbl/data.frame)
## $ LoanID : num [1:100] 1001 1002 1003 1004 1005 ...
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ ClientID : num [1:100] 2415 2463 2179 2526 2195 ...
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ ClientName : chr [1:100] "Client 1" "Client 2" "Client 3" "Client 4" ...
## ..- attr(*, "format.stata")= chr "%-9s"
## $ Gender : chr [1:100] "Female" "Female" "Male" "Male" ...
## ..- attr(*, "format.stata")= chr "%-9s"
## $ Age : num [1:100] 23 49 56 64 42 62 40 31 28 41 ...
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ LoanAmount : num [1:100] 10543900 3679871 12627095 19726106 13541045 ...
## ..- attr(*, "format.stata")= chr "%10.0g"
## $ InterestRate : num [1:100] 12.4 10.6 12.4 18.9 18.1 ...
## ..- attr(*, "format.stata")= chr "%10.0g"
## $ LoanTermMonths : num [1:100] 48 48 36 48 24 36 48 24 12 36 ...
## ..- attr(*, "format.stata")= chr "%10.0g"
## $ LoanType : chr [1:100] "Agriculture" "Personal" "Agriculture" "Business" ...
## ..- attr(*, "format.stata")= chr "%-9s"
## $ DisbursementDate : Date[1:100], format: "2023-02-15" "2023-03-05" ...
## $ Status : chr [1:100] "Closed" "Active" "Defaulted" "Pending" ...
## ..- attr(*, "format.stata")= chr "%-9s"
## $ CollateralValue : num [1:100] 22780071 8156706 3467474 880098 24205659 ...
## ..- attr(*, "format.stata")= chr "%10.0g"
## $ MonthlyInstallment: num [1:100] 246925 84775 394141 488632 666445 ...
## ..- attr(*, "format.stata")= chr "%10.0g"
install.packages("DBI")
install.packages("RSQLite")
library(DBI)
library(RSQLite)
# Connect to the existing SQLite database
con_sqlite <- dbConnect(RSQLite::SQLite(), "data/school.db")
# List available tables
dbListTables(con_sqlite)
## [1] "loans" "students"
# Import the students table
students_db <- dbGetQuery(con_sqlite, "SELECT * FROM students")
head(students_db)
## studentID gender math_score it_score height weight bmi
## 1 1 M 12 13 1.65 60 22.0
## 2 2 F 17 18 1.58 52 20.8
## 3 3 M 10 11 1.70 75 26.0
## 4 4 M 9 10 1.68 70 24.8
## 5 5 F NA 19 1.55 48 19.9
## 6 6 F 15 14 1.62 55 20.9
# Import the loans table
loans_db <- dbGetQuery(con_sqlite, "SELECT * FROM loans")
head(loans_db)
## LoanID ClientName Gender Age LoanAmount InterestRate LoanTermMonths Status
## 1 1001 Client 1 Male 28 5000000 15.0 24 Active
## 2 1002 Client 2 Female 34 3000000 12.5 12 Closed
## 3 1003 Client 3 Male 45 8000000 18.0 36 Active
## 4 1004 Client 4 Female 23 1500000 11.0 12 Pending
## 5 1005 Client 5 Male 38 6000000 16.5 24 Defaulted
## 6 1006 Client 6 Female 30 2500000 13.0 12 Active
# Query with a filter — students with math_score >= 15
high_scores <- dbGetQuery(con_sqlite,
"SELECT studentID, gender, math_score, it_score
FROM students
WHERE math_score >= 15")
print(high_scores)
## studentID gender math_score it_score
## 1 2 F 17 18
## 2 6 F 15 14
## 3 7 M 20 20
## 4 10 F 18 17
dbDisconnect(con_sqlite)
install.packages("RPostgres")
library(RPostgres)
con_pg <- dbConnect(
RPostgres::Postgres(),
dbname = "pozo_db",
host = "localhost",
user = "root",
password = "hexakomb",
port = 5432
)
# Customers
customers <- dbGetQuery(con_pg, "SELECT * FROM customer LIMIT 10")
head(customers)
dbDisconnect(con_pg)
The merge() function in R is a fundamental data manipulation tool used to combine two datasets based on one or more common columns, commonly referred to as a key. Its primary application lies in situations where related information is stored in separate tables and needs to be brought together for analysis. This is a very common scenario in real-world data science, where data is rarely collected and stored in a single place. For example, in a hospital setting, patient demographic information such as name and age may be stored in one table while laboratory test results are stored in another. By merging these two tables on a common patient ID column, an analyst can create one complete record that enables a full picture of each patient’s health status. ### 5.1. Inner join (default)
# split mtcars into two separate tables
car_engine <- data.frame(
car_name = rownames(mtcars)[1:10],
hp = mtcars$hp[1:10],
cyl = mtcars$cyl[1:10]
)
car_performance <- data.frame(
car_name = rownames(mtcars)[6:15],
mpg = mtcars$mpg[6:15],
wt = mtcars$wt[6:15]
)
# inner join: only cars that appear in Both tables
inner_result <- merge(car_engine, car_performance, by = "car_name")
print(inner_result)
## car_name hp cyl mpg wt
## 1 Duster 360 245 8 14.3 3.57
## 2 Merc 230 95 4 22.8 3.15
## 3 Merc 240D 62 4 24.4 3.19
## 4 Merc 280 123 6 19.2 3.44
## 5 Valiant 105 6 18.1 3.46
cat("Rows returned:", nrow(inner_result), "\n")
## Rows returned: 5
#Only cars 6 to 10 appear because those are the only ones present in both tables. Cars 1 to 5 are dropped because they only exist in car_engine.
# split mtcars into two separate tables
# left join — keep ALL rows from car_engine
# fill NA where no match found in car_performance
left_result <- merge(car_engine, car_performance,
by = "car_name",
all.x = TRUE)
print(left_result)
## car_name hp cyl mpg wt
## 1 Datsun 710 93 4 NA NA
## 2 Duster 360 245 8 14.3 3.57
## 3 Hornet 4 Drive 110 6 NA NA
## 4 Hornet Sportabout 175 8 NA NA
## 5 Mazda RX4 110 6 NA NA
## 6 Mazda RX4 Wag 110 6 NA NA
## 7 Merc 230 95 4 22.8 3.15
## 8 Merc 240D 62 4 24.4 3.19
## 9 Merc 280 123 6 19.2 3.44
## 10 Valiant 105 6 18.1 3.46
cat("Rows returned:", nrow(left_result), "\n")
## Rows returned: 10
#All 10 cars from car_engine are kept. Cars 1 to 5 have NA in the mpg and wt columns because they have no match in car_performance.
# right join — keep ALL rows from car_performance
right_result <- merge(car_engine, car_performance,
by = "car_name",
all.y = TRUE)
print(right_result)
## car_name hp cyl mpg wt
## 1 Cadillac Fleetwood NA NA 10.4 5.25
## 2 Duster 360 245 8 14.3 3.57
## 3 Merc 230 95 4 22.8 3.15
## 4 Merc 240D 62 4 24.4 3.19
## 5 Merc 280 123 6 19.2 3.44
## 6 Merc 280C NA NA 17.8 3.44
## 7 Merc 450SE NA NA 16.4 4.07
## 8 Merc 450SL NA NA 17.3 3.73
## 9 Merc 450SLC NA NA 15.2 3.78
## 10 Valiant 105 6 18.1 3.46
cat("Rows returned:", nrow(right_result), "\n")
## Rows returned: 10
#All 10 cars from car_performance are kept. Cars 11 to 15 have NA in hp and cyl because they have no match in car_engine.
# full join — keep ALL rows from Both tables
full_result <- merge(car_engine, car_performance,
by = "car_name",
all = TRUE)
print(full_result)
## car_name hp cyl mpg wt
## 1 Cadillac Fleetwood NA NA 10.4 5.25
## 2 Datsun 710 93 4 NA NA
## 3 Duster 360 245 8 14.3 3.57
## 4 Hornet 4 Drive 110 6 NA NA
## 5 Hornet Sportabout 175 8 NA NA
## 6 Mazda RX4 110 6 NA NA
## 7 Mazda RX4 Wag 110 6 NA NA
## 8 Merc 230 95 4 22.8 3.15
## 9 Merc 240D 62 4 24.4 3.19
## 10 Merc 280 123 6 19.2 3.44
## 11 Merc 280C NA NA 17.8 3.44
## 12 Merc 450SE NA NA 16.4 4.07
## 13 Merc 450SL NA NA 17.3 3.73
## 14 Merc 450SLC NA NA 15.2 3.78
## 15 Valiant 105 6 18.1 3.46
cat("Rows returned:", nrow(full_result), "\n")
## Rows returned: 15
#All 15 unique cars appear. Cars only in car_engine have NA for performance columns. Cars only in car_performance have NA for engine columns.
# two tables where the key has different names
students <- data.frame(
student_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Carol", "David", "Eve")
)
grades <- data.frame(
id = c(1, 2, 3, 4, 5),
grade = c("A", "B", "A", "C", "B"),
score = c(85, 72, 91, 65, 78)
)
# by.x = key name in first table
# by.y = key name in second table
result <- merge(students, grades,
by.x = "student_id",
by.y = "id")
print(result)
## student_id name grade score
## 1 1 Alice A 85
## 2 2 Bob B 72
## 3 3 Carol A 91
## 4 4 David C 65
## 5 5 Eve B 78
group_by() is a function from the dplyr package that splits a dataset into groups based on one or more columns, allowing you to perform calculations separately for each group. It is almost always used together with summarise(), mutate(), or other dplyr functions. The %>% operator, called the pipe operator, is used to chain multiple operations together in a clean and readable sequence. Instead of nesting functions inside each other, the pipe passes the result of one function directly into the next, making the code read like a series of steps from top to bottom. Together, group_by() and %>% form the backbone of data summarisation and aggregation in R. ## 6.1 Basic syntax
#dataset %>%
# group_by(column) %>%
# summarise(result = function(column))
# result = the name you give the new column
# function = replace this with a real function like mean, sum, max etc
library(tidyverse)
iris %>%
group_by(Species) %>%
summarise(
count = n(),
average = mean(Petal.Length),
total = sum(Petal.Length),
highest = max(Petal.Length),
lowest = min(Petal.Length),
std_dev = sd(Petal.Length),
middle = median(Petal.Length),
unique_val = n_distinct(Petal.Length)
)
## # A tibble: 3 × 9
## Species count average total highest lowest std_dev middle unique_val
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 setosa 50 1.46 73.1 1.9 1 0.174 1.5 9
## 2 versicolor 50 4.26 213 5.1 3 0.470 4.35 19
## 3 virginica 50 5.55 278. 6.9 4.5 0.552 5.55 20
#This produces one row per species with all eight statistics computed at once , every one of them replacing what was written as function in the template.
trace() and recover() in R trace() is used to insert debugging code into a function temporarily without editing the function itself. It lets you monitor when a function is called, what arguments it receives, and what happens inside it.
# add tracing to a function
#trace(what = function_name, tracer = code_to_run)
# remove tracing when done
#untrace(function_name)
# trace mean() to see every time it is called
trace(mean)
# now call mean
mean(c(1, 2, 3, 4, 5))
## trace: mean(c(1, 2, 3, 4, 5))
## [1] 3
mean(mtcars$mpg)
## trace: mean(mtcars$mpg)
## [1] 20.09062
# output will show
# trace: mean(c(1,2,3,4,5))
# trace: mean(mtcars$mpg)
# remove trace when done
untrace(mean)
recover() pauses execution at the point of error and lets you inspect the environment of each function in the call stack interactively.
options(error = recover)
process_values <- function(values) {
results <- c()
for (i in seq_along(values)) {
result <- sqrt(values[[i]])
results <- c(results, result)
}
return(results)
}
# list with a bad value
my_values <- list(4, 9, -1, 16)
process_values(my_values)
## [1] 2 3 NaN 4
summary_stats <- function(x, var_name = "Variable") {
# remove missing values
x_clean <- x[!is.na(x)]
n_missing <- length(x) - length(x_clean)
# calculate all statistics
n <- length(x_clean)
mean_val <- mean(x_clean)
med_val <- median(x_clean)
sd_val <- sd(x_clean)
var_val <- var(x_clean)
se_val <- sd_val / sqrt(n)
min_val <- min(x_clean)
max_val <- max(x_clean)
range_val <- max_val - min_val
q1 <- quantile(x_clean, 0.25)
q3 <- quantile(x_clean, 0.75)
iqr_val <- IQR(x_clean)
skew_val <- (3 * (mean_val - med_val)) / sd_val
cv_val <- (sd_val / mean_val) * 100
# count outliers using IQR method
lower_fence <- q1 - 1.5 * iqr_val
upper_fence <- q3 + 1.5 * iqr_val
n_outliers <- sum(x_clean < lower_fence | x_clean > upper_fence)
# print results
cat("╔══════════════════════════════════════╗\n")
cat(" Summary Statistics:", var_name, "\n")
cat("╚══════════════════════════════════════╝\n")
cat("Sample size :", n, "\n")
cat("Missing values :", n_missing, "\n")
cat("──────────────────────────────────────\n")
cat("Mean :", round(mean_val, 4),"\n")
cat("Median :", round(med_val, 4),"\n")
cat("Std deviation :", round(sd_val, 4),"\n")
cat("Variance :", round(var_val, 4),"\n")
cat("Std error :", round(se_val, 4),"\n")
cat("──────────────────────────────────────\n")
cat("Minimum :", round(min_val, 4),"\n")
cat("Maximum :", round(max_val, 4),"\n")
cat("Range :", round(range_val,4),"\n")
cat("──────────────────────────────────────\n")
cat("Q1 (25th pct) :", round(q1, 4),"\n")
cat("Q3 (75th pct) :", round(q3, 4),"\n")
cat("IQR :", round(iqr_val, 4),"\n")
cat("──────────────────────────────────────\n")
cat("Skewness :", round(skew_val, 4),"\n")
cat("Coeff of variation :", round(cv_val, 2), "%\n")
cat("Outliers (IQR rule):", n_outliers, "\n")
cat("Lower fence :", round(lower_fence, 4), "\n")
cat("Upper fence :", round(upper_fence, 4), "\n")
cat("══════════════════════════════════════\n")
# return as a list silently so values can be stored
invisible(list(
n = n,
mean = mean_val,
median = med_val,
sd = sd_val,
variance = var_val,
se = se_val,
min = min_val,
max = max_val,
range = range_val,
q1 = q1,
q3 = q3,
iqr = iqr_val,
skewness = skew_val,
cv = cv_val,
n_outliers = n_outliers,
lower_fence = lower_fence,
upper_fence = upper_fence
))
}
#Apply it:
# on a simple vector
summary_stats(mtcars$mpg, "Miles per Gallon")
## ╔══════════════════════════════════════╗
## Summary Statistics: Miles per Gallon
## ╚══════════════════════════════════════╝
## Sample size : 32
## Missing values : 0
## ──────────────────────────────────────
## Mean : 20.0906
## Median : 19.2
## Std deviation : 6.0269
## Variance : 36.3241
## Std error : 1.0654
## ──────────────────────────────────────
## Minimum : 10.4
## Maximum : 33.9
## Range : 23.5
## ──────────────────────────────────────
## Q1 (25th pct) : 15.425
## Q3 (75th pct) : 22.8
## IQR : 7.375
## ──────────────────────────────────────
## Skewness : 0.4433
## Coeff of variation : 30 %
## Outliers (IQR rule): 1
## Lower fence : 4.3625
## Upper fence : 33.8625
## ══════════════════════════════════════
# on your stock data
summary_stats(zee$Close, "Zee Entertainment Close Price")
## ╔══════════════════════════════════════╗
## Summary Statistics: Zee Entertainment Close Price
## ╚══════════════════════════════════════╝
## Sample size : 5306
## Missing values : 0
## ──────────────────────────────────────
## Mean : 273.2336
## Median : 238.125
## Std deviation : 176.0483
## Variance : 30993
## Std error : 2.4168
## ──────────────────────────────────────
## Minimum : 62.3
## Maximum : 1541.7
## Range : 1479.4
## ──────────────────────────────────────
## Q1 (25th pct) : 143.2
## Q3 (75th pct) : 345.5875
## IQR : 202.3875
## ──────────────────────────────────────
## Skewness : 0.5983
## Coeff of variation : 64.43 %
## Outliers (IQR rule): 84
## Lower fence : -160.3813
## Upper fence : 649.1688
## ══════════════════════════════════════
# store the results
results <- summary_stats(iris$Sepal.Length, "Sepal Length")
## ╔══════════════════════════════════════╗
## Summary Statistics: Sepal Length
## ╚══════════════════════════════════════╝
## Sample size : 150
## Missing values : 0
## ──────────────────────────────────────
## Mean : 5.8433
## Median : 5.8
## Std deviation : 0.8281
## Variance : 0.6857
## Std error : 0.0676
## ──────────────────────────────────────
## Minimum : 4.3
## Maximum : 7.9
## Range : 3.6
## ──────────────────────────────────────
## Q1 (25th pct) : 5.1
## Q3 (75th pct) : 6.4
## IQR : 1.3
## ──────────────────────────────────────
## Skewness : 0.157
## Coeff of variation : 14.17 %
## Outliers (IQR rule): 0
## Lower fence : 3.15
## Upper fence : 8.35
## ══════════════════════════════════════
results$mean # access individual values
## [1] 5.843333
results$n_outliers
## [1] 0
###8.1 Two Sample of t-Test
two_sample_ttest <- function(x, y,
group1_name = "Group 1",
group2_name = "Group 2",
alpha = 0.05,
alternative = "two.sided",
var.equal = FALSE) {
# ── Step 1: clean data ──────────────────────────────
x_clean <- x[!is.na(x)]
y_clean <- y[!is.na(y)]
# ── Step 2: descriptive statistics ──────────────────
n1 <- length(x_clean)
n2 <- length(y_clean)
mean1 <- mean(x_clean)
mean2 <- mean(y_clean)
sd1 <- sd(x_clean)
sd2 <- sd(y_clean)
se1 <- sd1 / sqrt(n1)
se2 <- sd2 / sqrt(n2)
# ── Step 3: check normality (Shapiro-Wilk) ──────────
# shapiro test only works for n <= 5000
if (n1 <= 5000) {
norm1 <- shapiro.test(x_clean)
} else {
norm1 <- list(statistic = NA, p.value = NA)
}
if (n2 <= 5000) {
norm2 <- shapiro.test(y_clean)
} else {
norm2 <- list(statistic = NA, p.value = NA)
}
# ── Step 4: check equality of variances (Levene) ────
var_test <- var.test(x_clean, y_clean)
# ── Step 5: decide which test to use ────────────────
if (var_test$p.value < alpha) {
var.equal <- FALSE # variances significantly different
test_type <- "Welch two sample t-test (unequal variances)"
} else {
var.equal <- TRUE # variances not significantly different
test_type <- "Student two sample t-test (equal variances)"
}
# ── Step 6: run the t-test ───────────────────────────
test_result <- t.test(x_clean, y_clean,
alternative = alternative,
var.equal = var.equal,
conf.level = 1 - alpha)
# ── Step 7: compute effect size (Cohen's d) ─────────
pooled_sd <- sqrt(((n1 - 1) * sd1^2 + (n2 - 1) * sd2^2) /
(n1 + n2 - 2))
cohens_d <- (mean1 - mean2) / pooled_sd
# interpret effect size
effect_label <- ifelse(abs(cohens_d) < 0.2, "negligible",
ifelse(abs(cohens_d) < 0.5, "small",
ifelse(abs(cohens_d) < 0.8, "medium", "large")))
# ── Step 8: interpret the result ────────────────────
decision <- ifelse(test_result$p.value < alpha,
"REJECT the null hypothesis",
"FAIL TO REJECT the null hypothesis")
conclusion <- ifelse(test_result$p.value < alpha,
paste("There IS a statistically significant difference between",
group1_name, "and", group2_name),
paste("There is NO statistically significant difference between",
group1_name, "and", group2_name))
# ── Step 9: print full report ───────────────────────
cat("╔══════════════════════════════════════════╗\n")
cat(" TWO SAMPLE T-TEST REPORT \n")
cat("╚══════════════════════════════════════════╝\n")
cat("\n── STEP 1: Hypotheses ─────────────────────\n")
if (alternative == "two.sided") {
cat("H0: mean(", group1_name, ") = mean(", group2_name, ")\n")
cat("H1: mean(", group1_name, ") ≠ mean(", group2_name, ")\n")
} else if (alternative == "greater") {
cat("H0: mean(", group1_name, ") <= mean(", group2_name, ")\n")
cat("H1: mean(", group1_name, ") > mean(", group2_name, ")\n")
} else {
cat("H0: mean(", group1_name, ") >= mean(", group2_name, ")\n")
cat("H1: mean(", group1_name, ") < mean(", group2_name, ")\n")
}
cat("Significance level (alpha):", alpha, "\n")
cat("\n── STEP 2: Descriptive Statistics ─────────\n")
cat(sprintf("%-25s %10s %10s\n", "Statistic", group1_name, group2_name))
cat(rep("─", 47), "\n", sep = "")
cat(sprintf("%-25s %10.4f %10.4f\n", "Sample size", n1, n2))
cat(sprintf("%-25s %10.4f %10.4f\n", "Mean", mean1, mean2))
cat(sprintf("%-25s %10.4f %10.4f\n", "Std deviation", sd1, sd2))
cat(sprintf("%-25s %10.4f %10.4f\n", "Std error", se1, se2))
cat("\n── STEP 3: Normality Test (Shapiro-Wilk) ──\n")
cat(group1_name, "— W =", round(norm1$statistic, 4),
", p =", round(norm1$p.value, 4),
ifelse(norm1$p.value > alpha, "→ normal", "→ not normal"), "\n")
cat(group2_name, "— W =", round(norm2$statistic, 4),
", p =", round(norm2$p.value, 4),
ifelse(norm2$p.value > alpha, "→ normal", "→ not normal"), "\n")
cat("\n── STEP 4: Equality of Variances (F-test) ─\n")
cat("F =", round(var_test$statistic, 4),
", p =", round(var_test$p.value, 4),
ifelse(var_test$p.value > alpha,
"→ variances are equal",
"→ variances are NOT equal"), "\n")
cat("\n── STEP 5: Test Selected ───────────────────\n")
cat(test_type, "\n")
cat("\n── STEP 6: Test Results ────────────────────\n")
cat("t statistic :", round(test_result$statistic, 4), "\n")
cat("Degrees of freedom:", round(test_result$parameter, 4), "\n")
cat("p-value :", round(test_result$p.value, 6), "\n")
cat("95% CI : [",
round(test_result$conf.int[1], 4), ",",
round(test_result$conf.int[2], 4), "]\n")
cat("\n── STEP 7: Effect Size (Cohen's d) ─────────\n")
cat("Cohen's d :", round(cohens_d, 4), "\n")
cat("Interpretation:", effect_label, "effect\n")
cat("\n── STEP 8: Decision ────────────────────────\n")
cat(decision, "\n")
cat(conclusion, "\n")
cat("══════════════════════════════════════════\n")
# return results invisibly
invisible(list(
t_statistic = test_result$statistic,
df = test_result$parameter,
p_value = test_result$p.value,
conf_int = test_result$conf.int,
cohens_d = cohens_d,
effect = effect_label,
decision = decision
))
}
# Apply both functions
# compare mpg between 4 and 6 cylinder cars
four_cyl <- mtcars$mpg[mtcars$cyl == 4]
six_cyl <- mtcars$mpg[mtcars$cyl == 6]
two_sample_ttest(four_cyl, six_cyl,
group1_name = "4 cylinder",
group2_name = "6 cylinder",
alpha = 0.05)
## ╔══════════════════════════════════════════╗
## TWO SAMPLE T-TEST REPORT
## ╚══════════════════════════════════════════╝
##
## ── STEP 1: Hypotheses ─────────────────────
## H0: mean( 4 cylinder ) = mean( 6 cylinder )
## H1: mean( 4 cylinder ) ≠ mean( 6 cylinder )
## Significance level (alpha): 0.05
##
## ── STEP 2: Descriptive Statistics ─────────
## Statistic 4 cylinder 6 cylinder
## ───────────────────────────────────────────────
## Sample size 11.0000 7.0000
## Mean 26.6636 19.7429
## Std deviation 4.5098 1.4536
## Std error 1.3598 0.5494
##
## ── STEP 3: Normality Test (Shapiro-Wilk) ──
## 4 cylinder — W = 0.9124 , p = 0.2606 → normal
## 6 cylinder — W = 0.899 , p = 0.3252 → normal
##
## ── STEP 4: Equality of Variances (F-test) ─
## F = 9.6261 , p = 0.0118 → variances are NOT equal
##
## ── STEP 5: Test Selected ───────────────────
## Welch two sample t-test (unequal variances)
##
## ── STEP 6: Test Results ────────────────────
## t statistic : 4.7191
## Degrees of freedom: 12.956
## p-value : 0.000405
## 95% CI : [ 3.7514 , 10.0902 ]
##
## ── STEP 7: Effect Size (Cohen's d) ─────────
## Cohen's d : 1.8833
## Interpretation: large effect
##
## ── STEP 8: Decision ────────────────────────
## REJECT the null hypothesis
## There IS a statistically significant difference between 4 cylinder and 6 cylinder
## ══════════════════════════════════════════
# compare sepal length between two iris species
setosa <- iris$Sepal.Length[iris$Species == "setosa"]
versicolor <- iris$Sepal.Length[iris$Species == "versicolor"]
two_sample_ttest(setosa, versicolor,
group1_name = "Setosa",
group2_name = "Versicolor",
alpha = 0.05)
## ╔══════════════════════════════════════════╗
## TWO SAMPLE T-TEST REPORT
## ╚══════════════════════════════════════════╝
##
## ── STEP 1: Hypotheses ─────────────────────
## H0: mean( Setosa ) = mean( Versicolor )
## H1: mean( Setosa ) ≠ mean( Versicolor )
## Significance level (alpha): 0.05
##
## ── STEP 2: Descriptive Statistics ─────────
## Statistic Setosa Versicolor
## ───────────────────────────────────────────────
## Sample size 50.0000 50.0000
## Mean 5.0060 5.9360
## Std deviation 0.3525 0.5162
## Std error 0.0498 0.0730
##
## ── STEP 3: Normality Test (Shapiro-Wilk) ──
## Setosa — W = 0.9777 , p = 0.4595 → normal
## Versicolor — W = 0.9778 , p = 0.4647 → normal
##
## ── STEP 4: Equality of Variances (F-test) ─
## F = 0.4663 , p = 0.0087 → variances are NOT equal
##
## ── STEP 5: Test Selected ───────────────────
## Welch two sample t-test (unequal variances)
##
## ── STEP 6: Test Results ────────────────────
## t statistic : -10.521
## Degrees of freedom: 86.538
## p-value : 0
## 95% CI : [ -1.1057 , -0.7543 ]
##
## ── STEP 7: Effect Size (Cohen's d) ─────────
## Cohen's d : -2.1042
## Interpretation: large effect
##
## ── STEP 8: Decision ────────────────────────
## REJECT the null hypothesis
## There IS a statistically significant difference between Setosa and Versicolor
## ══════════════════════════════════════════
# apply summary stats first then test
summary_stats(four_cyl, "4 Cylinder MPG")
## ╔══════════════════════════════════════╗
## Summary Statistics: 4 Cylinder MPG
## ╚══════════════════════════════════════╝
## Sample size : 11
## Missing values : 0
## ──────────────────────────────────────
## Mean : 26.6636
## Median : 26
## Std deviation : 4.5098
## Variance : 20.3385
## Std error : 1.3598
## ──────────────────────────────────────
## Minimum : 21.4
## Maximum : 33.9
## Range : 12.5
## ──────────────────────────────────────
## Q1 (25th pct) : 22.8
## Q3 (75th pct) : 30.4
## IQR : 7.6
## ──────────────────────────────────────
## Skewness : 0.4415
## Coeff of variation : 16.91 %
## Outliers (IQR rule): 0
## Lower fence : 11.4
## Upper fence : 41.8
## ══════════════════════════════════════
summary_stats(six_cyl, "6 Cylinder MPG")
## ╔══════════════════════════════════════╗
## Summary Statistics: 6 Cylinder MPG
## ╚══════════════════════════════════════╝
## Sample size : 7
## Missing values : 0
## ──────────────────────────────────────
## Mean : 19.7429
## Median : 19.7
## Std deviation : 1.4536
## Variance : 2.1129
## Std error : 0.5494
## ──────────────────────────────────────
## Minimum : 17.8
## Maximum : 21.4
## Range : 3.6
## ──────────────────────────────────────
## Q1 (25th pct) : 18.65
## Q3 (75th pct) : 21
## IQR : 2.35
## ──────────────────────────────────────
## Skewness : 0.0885
## Coeff of variation : 7.36 %
## Outliers (IQR rule): 0
## Lower fence : 15.125
## Upper fence : 24.525
## ══════════════════════════════════════
two_sample_ttest(four_cyl, six_cyl,
group1_name = "4 Cylinder",
group2_name = "6 Cylinder")
## ╔══════════════════════════════════════════╗
## TWO SAMPLE T-TEST REPORT
## ╚══════════════════════════════════════════╝
##
## ── STEP 1: Hypotheses ─────────────────────
## H0: mean( 4 Cylinder ) = mean( 6 Cylinder )
## H1: mean( 4 Cylinder ) ≠ mean( 6 Cylinder )
## Significance level (alpha): 0.05
##
## ── STEP 2: Descriptive Statistics ─────────
## Statistic 4 Cylinder 6 Cylinder
## ───────────────────────────────────────────────
## Sample size 11.0000 7.0000
## Mean 26.6636 19.7429
## Std deviation 4.5098 1.4536
## Std error 1.3598 0.5494
##
## ── STEP 3: Normality Test (Shapiro-Wilk) ──
## 4 Cylinder — W = 0.9124 , p = 0.2606 → normal
## 6 Cylinder — W = 0.899 , p = 0.3252 → normal
##
## ── STEP 4: Equality of Variances (F-test) ─
## F = 9.6261 , p = 0.0118 → variances are NOT equal
##
## ── STEP 5: Test Selected ───────────────────
## Welch two sample t-test (unequal variances)
##
## ── STEP 6: Test Results ────────────────────
## t statistic : 4.7191
## Degrees of freedom: 12.956
## p-value : 0.000405
## 95% CI : [ 3.7514 , 10.0902 ]
##
## ── STEP 7: Effect Size (Cohen's d) ─────────
## Cohen's d : 1.8833
## Interpretation: large effect
##
## ── STEP 8: Decision ────────────────────────
## REJECT the null hypothesis
## There IS a statistically significant difference between 4 Cylinder and 6 Cylinder
## ══════════════════════════════════════════
Returns a LIST always regardless of input lapply() stands for list apply. It takes a list or vector and applies a function to every single element in it, always returning the result as a list regardless of what the output looks like. It is best used when you want to perform the same operation on multiple items and you expect the results to have different lengths or types, making a list the safest container to hold them all.
# basic syntax
#lapply(X, FUN)
#apply to a vector
numbers <- list(1, 4, 9, 16, 25)
result <- lapply(numbers, sqrt)
print(result)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2
##
## [[3]]
## [1] 3
##
## [[4]]
## [1] 4
##
## [[5]]
## [1] 5
# returns a list of square roots
# X = vector, list or dataframe
# FUN = function to apply to each element
result <- lapply(mtcars, mean)
print(result)
## $mpg
## [1] 20.09062
##
## $cyl
## [1] 6.1875
##
## $disp
## [1] 230.7219
##
## $hp
## [1] 146.6875
##
## $drat
## [1] 3.596563
##
## $wt
## [1] 3.21725
##
## $qsec
## [1] 17.84875
##
## $vs
## [1] 0.4375
##
## $am
## [1] 0.40625
##
## $gear
## [1] 3.6875
##
## $carb
## [1] 2.8125
# returns a list of means for each colum
numbers <- list(10, 20, 30, 40)
result <- lapply(numbers, function(x) {
squared <- x^2
return(squared)
})
print(result)
## [[1]]
## [1] 100
##
## [[2]]
## [1] 400
##
## [[3]]
## [1] 900
##
## [[4]]
## [1] 1600
Same as lapply but tries to SIMPLIFY the result into a vector or matrix sapply() stands for simplified apply. It works exactly like lapply() under the hood but goes one step further by trying to simplify the result into a vector or matrix instead of a list. If every element of the result is a single number, it returns a numeric vector. If every element has the same length, it returns a matrix. It is best used when you want a clean and simple output that is easy to read and work with directly.
#returns a vector instead of list
numbers <- c(1, 4, 9, 16, 25)
result <- sapply(numbers, sqrt)
print(result)
## [1] 1 2 3 4 5
# returns a named vector not a list
class(result) # "numeric"
## [1] "numeric"
result <- sapply(mtcars, mean)
print(result)
## mpg cyl disp hp drat wt qsec
## 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250 17.848750
## vs am gear carb
## 0.437500 0.406250 3.687500 2.812500
# returns a named numeric vector
result <- sapply(mtcars, function(x) {
c(mean = mean(x),
sd = sd(x),
min = min(x),
max = max(x))
})
print(result)
## mpg cyl disp hp drat wt qsec
## mean 20.090625 6.187500 230.7219 146.68750 3.5965625 3.2172500 17.848750
## sd 6.026948 1.785922 123.9387 68.56287 0.5346787 0.9784574 1.786943
## min 10.400000 4.000000 71.1000 52.00000 2.7600000 1.5130000 14.500000
## max 33.900000 8.000000 472.0000 335.00000 4.9300000 5.4240000 22.900000
## vs am gear carb
## mean 0.4375000 0.4062500 3.6875000 2.8125
## sd 0.5040161 0.4989909 0.7378041 1.6152
## min 0.0000000 0.0000000 3.0000000 1.0000
## max 1.0000000 1.0000000 5.0000000 8.0000
# returns a 4 x 11 matrix
result <- sapply(zee, function(x) sum(is.na(x)))
print(result)
## Date Symbol Series Prev.Close
## 0 0 0 0
## Open High Low Last
## 0 0 0 0
## Close VWAP Volume Turnover
## 0 0 0 0
## Trades Deliverable.Volume X.Deliverble
## 2850 519 519
# returns named numeric vector of NA counts
Apply a function to MULTIPLE vectors/lists in parallel — element by element mapply() stands for multivariate apply. Unlike lapply() and sapply() which work on one input at a time, mapply() takes two or more vectors and applies a function to their elements in parallel, meaning it pairs up the first element of each vector, then the second, then the third, and so on. It is best used when your function needs more than one input at the same time, such as computing the percentage change between a previous price vector and a current price vector element by element.
# Example 1 — add two vectors element by element
x <- c(1, 2, 3, 4, 5)
y <- c(10, 20, 30, 40, 50)
result <- mapply(function(a, b) a + b, x, y)
print(result)
## [1] 11 22 33 44 55
# returns c(11, 22, 33, 44, 55)
names <- c("Alice", "Bob", "Carol")
scores <- c(85, 92, 78)
result <- mapply(function(name, score) {
paste(name, "scored", score, "marks")
}, names, scores)
print(result)
## Alice Bob Carol
## "Alice scored 85 marks" "Bob scored 92 marks" "Carol scored 78 marks"
prev_close <- zee$Prev.Close[1:10]
close <- zee$Close[1:10]
pct_change <- mapply(function(prev, curr) {
(curr - prev) / prev * 100
}, prev_close, close)
print(round(pct_change, 2))
## [1] 8.00 6.84 -6.67 -5.19 -8.00 -5.80 5.51 4.30 -3.83 -2.08
map() is the tidyverse equivalent of lapply() from the purrr package. It applies a function to every element of a list or vector and always returns a list. What makes it more powerful than lapply() is that it comes with a family of variants such as map_dbl() for numeric output, map_chr() for character output, map2() for two inputs, and pmap() for multiple inputs, giving you precise control over what type of output you get. It is best used when you are already working in a tidyverse pipeline with %>% and want consistent and readable code.
#basic map returns list
numbers <- list(1, 4, 9, 16, 25)
result <- map(numbers, sqrt)
print(result)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2
##
## [[3]]
## [1] 3
##
## [[4]]
## [1] 4
##
## [[5]]
## [1] 5
###9.4.1 map_chr returns character vector
result <- map_chr(mtcars, function(x) {
ifelse(mean(x) > 100, "high", "low")
})
print(result)
## mpg cyl disp hp drat wt qsec vs am gear carb
## "low" "low" "high" "high" "low" "low" "low" "low" "low" "low" "low"
names <- list("Alice", "Bob", "Carol")
scores <- list(85, 92, 78)
result <- map2(names, scores, function(name, score) {
paste(name, "scored", score)
})
print(result)
## [[1]]
## [1] "Alice scored 85"
##
## [[2]]
## [1] "Bob scored 92"
##
## [[3]]
## [1] "Carol scored 78"
#pmap for multiple inputs
params <- list(
mean = c(0, 5, 10),
sd = c(1, 2, 3),
n = c(10, 20, 30)
)
result <- pmap(params, function(mean, sd, n) {
rnorm(n, mean = mean, sd = sd)
})
print(length(result)) # 3 samples
## [1] 3
print(length(result[[1]])) # 10 values
## [1] 10
Used to display counts or values as bars. Two modes — counting automatically or using your own values.
# counts how many cars have each number of cylinders
ggplot(mtcars, aes(x = factor(cyl))) +
geom_bar(fill = "steelblue", color = "white") +
labs(title = "Number of cars by cylinder count",
x = "Cylinders", y = "Count") +
theme_minimal()
### 10.1.1 compute average mpg per cylinder first
avg_mpg <- mtcars %>%
group_by(cyl) %>%
summarise(avg = mean(mpg))
# then plot the values
ggplot(avg_mpg, aes(x = factor(cyl), y = avg)) +
geom_bar(stat = "identity", fill = "green", color = "white") +
labs(title = "Average MPG by cylinder count",
x = "Cylinders", y = "Average MPG") +
theme_minimal()
### 10.2 geom_boxplot() Used to display the distribution of a numeric
variable showing median, quartiles and outliers. ┌─────────────────────┐
│ whisker top │ ← Q3 + 1.5 × IQR ├─────────────────────┤ ← Q3 (75th
percentile) │ │ │ THE BOX │ │ │ ├─────────────────────┤ ← median (50th
percentile) │ │ ├─────────────────────┤ ← Q1 (25th percentile) │ whisker
bottom │ ← Q1 - 1.5 × IQR
● = outliers (points beyond the whiskers)
ggplot(mtcars, aes(y = mpg)) +
geom_boxplot(fill = "cyan", color = "navy") +
labs(title = "Distribution of MPG",
y = "Miles per Gallon") +
theme_minimal()
### 10.2.1 Boxplot by group
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
geom_boxplot(fill = "yellow", color = "navy") +
labs(title = "MPG distribution by cylinder count",
x = "Cylinders", y = "MPG") +
theme_minimal()
### 10.2.2 Boxplot with notch — shows confidence interval around
median
ggplot(mtcars, aes(x = factor(cyl), y = mpg, fill = factor(cyl))) +
geom_boxplot(notch = FALSE) +
labs(title = "MPG by cylinders with notched median CI",
x = "Cylinders", y = "MPG") +
theme_minimal()
### 10.3 geom_smooth() Used to add a trend line or smoothed curve to a
scatter plot to show the overall pattern in the data.
#Basic smooth line
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point(color = "red") +
geom_smooth() +
labs(title = "MPG vs weight with smooth trend",
x = "Weight", y = "MPG") +
theme_minimal()
### 10.3.1 Linear regression line — method lm
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", color = "red",
se = TRUE) +
labs(title = "MPG vs weight — linear regression line",
x = "Weight", y = "MPG") +
theme_minimal()
### 10.3.2 LOESS smooth — default, captures non-linear patterns
ggplot(mtcars, aes(x = hp, y = mpg)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "loess", color = "darkgreen",
fill = "lightgreen", alpha = 0.3) +
labs(title = "MPG vs horsepower — LOESS smooth",
x = "Horsepower", y = "MPG") +
theme_minimal()