Overview

  1. Text files (TXT, CSV)
  2. Excel files (XLSX)
  3. Statistical packages: SPSS (.sav), Stata (.dta)
  4. Database management systems (SQLite, PostgreSQL, MySQL)
  5. Application of the merge() Function in R
  6. Application of group_by() and %>% in R
  7. How to use trace() and recover()
  8. Summary Statistics
  9. lapply(),sapply(),mapply()
  10. geom_bar, geom_boxplot,geom_smooth

1. Importing from Text Files

1.1 Read a TAB-separated TXT file using base R

# read.table reads any delimiter-separated file
class_data <- read.table("data/class_data.txt", header = TRUE, sep = "\t")
head(class_data)
##   studentID gender math_score it_score height weight
## 1         1      M         12       13    165     60
## 2         2      F         17       18    158     52
## 3         3      M         10       11    170     75
## 4         4      M          9       10    168     70
## 5         5      F         NA       19    155     48
## 6         6      F         15       14    162     55
str(class_data)
## 'data.frame':    10 obs. of  6 variables:
##  $ studentID : int  1 2 3 4 5 6 7 8 9 10
##  $ gender    : chr  "M" "F" "M" "M" ...
##  $ math_score: int  12 17 10 9 NA 15 20 13 11 18
##  $ it_score  : int  13 18 11 10 19 14 20 16 12 17
##  $ height    : int  165 158 170 168 155 162 175 160 172 157
##  $ weight    : int  60 52 75 70 48 55 80 54 78 50

1.2 Read a CSV file

# read.csv is a shortcut for read.table with sep=","
student_data <- read.csv("data/student_data.csv", header = TRUE)
head(student_data)
##   studentID gender math_score it_score height weight  bmi
## 1         1      M         12       13   1.65     60 22.0
## 2         2      F         17       18   1.58     52 20.8
## 3         3      M         10       11   1.70     75 26.0
## 4         4      M          9       10   1.68     70 24.8
## 5         5      F         NA       19   1.55     48 19.9
## 6         6      F         15       14   1.62     55 20.9
str(student_data)
## 'data.frame':    10 obs. of  7 variables:
##  $ studentID : int  1 2 3 4 5 6 7 8 9 10
##  $ gender    : chr  "M" "F" "M" "M" ...
##  $ math_score: int  12 17 10 9 NA 15 20 13 11 18
##  $ it_score  : int  13 18 11 10 19 14 20 16 12 17
##  $ height    : num  1.65 1.58 1.7 1.68 1.55 1.62 1.75 1.6 1.72 1.57
##  $ weight    : int  60 52 75 70 48 55 80 54 78 50
##  $ bmi       : num  22 20.8 26 24.8 19.9 20.9 26.1 21.1 26.4 20.3

2. Importing from Excel Files

install.packages("readxl")
library(readxl)

# datasample.xlsx — student sample data
excel_data <- read_excel("data/datasample.xlsx")
head(excel_data)
## # A tibble: 5 × 5
##   Account `First Name` `Last Name`      ID Mobile1
##     <dbl> <chr>        <chr>         <dbl>   <dbl>
## 1       1 Jean         John         111111 7888888
## 2       2 diane        Dina        5555555  788888
## 3       3 diane        Dina        5555555  788888
## 4       4 diane        Dina        5555555  788888
## 5       5 diane        Dina        5555555  788888
str(excel_data)
## tibble [5 × 5] (S3: tbl_df/tbl/data.frame)
##  $ Account   : num [1:5] 1 2 3 4 5
##  $ First Name: chr [1:5] "Jean" "diane" "diane" "diane" ...
##  $ Last Name : chr [1:5] "John" "Dina" "Dina" "Dina" ...
##  $ ID        : num [1:5] 111111 5555555 5555555 5555555 5555555
##  $ Mobile1   : num [1:5] 7888888 788888 788888 788888 788888
# OpeningDate.xlsx — opening dates dataset
opening_data <- read_excel("data/OpeningDate.xlsx")
head(opening_data)
## # A tibble: 6 × 12
##   `Open Date`         `Date Birth`        Gender Marital ...5  Status ...7 
##   <dttm>              <dttm>              <chr>  <chr>   <lgl> <chr>  <lgl>
## 1 2026-01-12 00:00:00 1999-09-09 00:00:00 Male   Single  NA    Active NA   
## 2 2026-02-03 00:00:00 1990-01-01 00:00:00 Male   Married NA    Active NA   
## 3 2026-02-03 00:00:00 1987-12-22 00:00:00 Female Married NA    Active NA   
## 4 2026-02-03 00:00:00 1990-12-28 00:00:00 Male   Single  NA    Active NA   
## 5 2026-02-03 00:00:00 1987-06-03 00:00:00 Female Married NA    Active NA   
## 6 2026-02-03 00:00:00 2000-01-01 00:00:00 Female Married NA    Active NA   
## # ℹ 5 more variables: Education <chr>, Post <chr>, Nature <chr>, Salary <dbl>,
## #   `Valid Date` <dttm>
str(opening_data)
## tibble [2,492 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Open Date : POSIXct[1:2492], format: "2026-01-12" "2026-02-03" ...
##  $ Date Birth: POSIXct[1:2492], format: "1999-09-09" "1990-01-01" ...
##  $ Gender    : chr [1:2492] "Male" "Male" "Female" "Male" ...
##  $ Marital   : chr [1:2492] "Single" "Married" "Married" "Single" ...
##  $ ...5      : logi [1:2492] NA NA NA NA NA NA ...
##  $ Status    : chr [1:2492] "Active" "Active" "Active" "Active" ...
##  $ ...7      : logi [1:2492] NA NA NA NA NA NA ...
##  $ Education : chr [1:2492] "Bachelors Degree" "Diploma (A2  level)" "Bachelors Degree" "Diploma (A2  level)" ...
##  $ Post      : chr [1:2492] "Service and sales workers/Education" "Service and sales workers/Education" "Service and sales workers/Education" "Service and sales workers/Education" ...
##  $ Nature    : chr [1:2492] "Retail or Individuals Teacher" "Retail or Individuals Teacher" "Retail or Individuals Teacher" "Retail or Individuals Teacher" ...
##  $ Salary    : num [1:2492] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Valid Date: POSIXct[1:2492], format: "2026-01-20" "2026-02-23" ...

3. Importing from Statistical Packages (SPSS, Stata, SAS)

install.packages("haven")
library(haven)

3.1 Import SPSS (.sav) file

# datasample.sav — SPSS format
spss_data <- read_sav("data/datasample.sav")
head(spss_data)
## # A tibble: 6 × 4
##      ID Name    Age Salary
##   <dbl> <chr> <dbl>  <dbl>
## 1     1 Jean     25    500
## 2     2 Alice    30    600
## 3     3 Bob      28    550
## 4     4 David    35    700
## 5     5 Emma     22    450
## 6     6 Frank    40    800
str(spss_data)
## tibble [10 × 4] (S3: tbl_df/tbl/data.frame)
##  $ ID    : num [1:10] 1 2 3 4 5 6 7 8 9 10
##   ..- attr(*, "format.spss")= chr "F8.0"
##  $ Name  : chr [1:10] "Jean" "Alice" "Bob" "David" ...
##   ..- attr(*, "format.spss")= chr "A5"
##  $ Age   : num [1:10] 25 30 28 35 22 40 27 33 29 31
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ Salary: num [1:10] 500 600 550 700 450 800 520 680 590 610
##   ..- attr(*, "format.spss")= chr "F8.2"

3.2 Import Stata (.dta) file

# loan_datasample.dta — loan data in Stata format (100 loan records)
stata_data <- read_dta("data/loan_datasample.dta")
head(stata_data)
## # A tibble: 6 × 13
##   LoanID ClientID ClientName Gender   Age LoanAmount InterestRate LoanTermMonths
##    <dbl>    <dbl> <chr>      <chr>  <dbl>      <dbl>        <dbl>          <dbl>
## 1   1001     2415 Client 1   Female    23   10543900         12.4             48
## 2   1002     2463 Client 2   Female    49    3679871         10.6             48
## 3   1003     2179 Client 3   Male      56   12627095         12.4             36
## 4   1004     2526 Client 4   Male      64   19726106         18.9             48
## 5   1005     2195 Client 5   Female    42   13541045         18.1             24
## 6   1006     2938 Client 6   Female    62    8668860         17.5             36
## # ℹ 5 more variables: LoanType <chr>, DisbursementDate <date>, Status <chr>,
## #   CollateralValue <dbl>, MonthlyInstallment <dbl>
str(stata_data)
## tibble [100 × 13] (S3: tbl_df/tbl/data.frame)
##  $ LoanID            : num [1:100] 1001 1002 1003 1004 1005 ...
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ ClientID          : num [1:100] 2415 2463 2179 2526 2195 ...
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ ClientName        : chr [1:100] "Client 1" "Client 2" "Client 3" "Client 4" ...
##   ..- attr(*, "format.stata")= chr "%-9s"
##  $ Gender            : chr [1:100] "Female" "Female" "Male" "Male" ...
##   ..- attr(*, "format.stata")= chr "%-9s"
##  $ Age               : num [1:100] 23 49 56 64 42 62 40 31 28 41 ...
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ LoanAmount        : num [1:100] 10543900 3679871 12627095 19726106 13541045 ...
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ InterestRate      : num [1:100] 12.4 10.6 12.4 18.9 18.1 ...
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ LoanTermMonths    : num [1:100] 48 48 36 48 24 36 48 24 12 36 ...
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ LoanType          : chr [1:100] "Agriculture" "Personal" "Agriculture" "Business" ...
##   ..- attr(*, "format.stata")= chr "%-9s"
##  $ DisbursementDate  : Date[1:100], format: "2023-02-15" "2023-03-05" ...
##  $ Status            : chr [1:100] "Closed" "Active" "Defaulted" "Pending" ...
##   ..- attr(*, "format.stata")= chr "%-9s"
##  $ CollateralValue   : num [1:100] 22780071 8156706 3467474 880098 24205659 ...
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ MonthlyInstallment: num [1:100] 246925 84775 394141 488632 666445 ...
##   ..- attr(*, "format.stata")= chr "%10.0g"

4. Importing from Database Management Systems

4.1 SQLite (portable, no server needed — school.db is included in this project)

install.packages("DBI")
install.packages("RSQLite")
library(DBI)
library(RSQLite)

# Connect to the existing SQLite database
con_sqlite <- dbConnect(RSQLite::SQLite(), "data/school.db")

# List available tables
dbListTables(con_sqlite)
## [1] "loans"    "students"
# Import the students table
students_db <- dbGetQuery(con_sqlite, "SELECT * FROM students")
head(students_db)
##   studentID gender math_score it_score height weight  bmi
## 1         1      M         12       13   1.65     60 22.0
## 2         2      F         17       18   1.58     52 20.8
## 3         3      M         10       11   1.70     75 26.0
## 4         4      M          9       10   1.68     70 24.8
## 5         5      F         NA       19   1.55     48 19.9
## 6         6      F         15       14   1.62     55 20.9
# Import the loans table
loans_db <- dbGetQuery(con_sqlite, "SELECT * FROM loans")
head(loans_db)
##   LoanID ClientName Gender Age LoanAmount InterestRate LoanTermMonths    Status
## 1   1001   Client 1   Male  28    5000000         15.0             24    Active
## 2   1002   Client 2 Female  34    3000000         12.5             12    Closed
## 3   1003   Client 3   Male  45    8000000         18.0             36    Active
## 4   1004   Client 4 Female  23    1500000         11.0             12   Pending
## 5   1005   Client 5   Male  38    6000000         16.5             24 Defaulted
## 6   1006   Client 6 Female  30    2500000         13.0             12    Active
# Query with a filter — students with math_score >= 15
high_scores <- dbGetQuery(con_sqlite,
  "SELECT studentID, gender, math_score, it_score
   FROM students
   WHERE math_score >= 15")
print(high_scores)
##   studentID gender math_score it_score
## 1         2      F         17       18
## 2         6      F         15       14
## 3         7      M         20       20
## 4        10      F         18       17
dbDisconnect(con_sqlite)

4.2 PostgreSQL (requires a running PostgreSQL server)

install.packages("RPostgres")
library(RPostgres)

con_pg <- dbConnect(
  RPostgres::Postgres(),
  dbname   = "pozo_db",
  host     = "localhost",
  user     = "root",
  password = "hexakomb",
  port     = 5432
)

# Customers
customers <- dbGetQuery(con_pg, "SELECT * FROM customer LIMIT 10")
head(customers)

dbDisconnect(con_pg)

5. Application of the merge() Function in R

The merge() function in R is a fundamental data manipulation tool used to combine two datasets based on one or more common columns, commonly referred to as a key. Its primary application lies in situations where related information is stored in separate tables and needs to be brought together for analysis. This is a very common scenario in real-world data science, where data is rarely collected and stored in a single place. For example, in a hospital setting, patient demographic information such as name and age may be stored in one table while laboratory test results are stored in another. By merging these two tables on a common patient ID column, an analyst can create one complete record that enables a full picture of each patient’s health status. ### 5.1. Inner join (default)

# split mtcars into two separate tables
car_engine <- data.frame(
  car_name = rownames(mtcars)[1:10],
  hp       = mtcars$hp[1:10],
  cyl      = mtcars$cyl[1:10]
)

car_performance <- data.frame(
  car_name = rownames(mtcars)[6:15],
  mpg      = mtcars$mpg[6:15],
  wt       = mtcars$wt[6:15]
)

# inner join: only cars that appear in Both tables
inner_result <- merge(car_engine, car_performance, by = "car_name")
print(inner_result)
##     car_name  hp cyl  mpg   wt
## 1 Duster 360 245   8 14.3 3.57
## 2   Merc 230  95   4 22.8 3.15
## 3  Merc 240D  62   4 24.4 3.19
## 4   Merc 280 123   6 19.2 3.44
## 5    Valiant 105   6 18.1 3.46
cat("Rows returned:", nrow(inner_result), "\n")
## Rows returned: 5
#Only cars 6 to 10 appear because those are the only ones present in both tables. Cars 1 to 5 are dropped because they only exist in car_engine.

5.2. Left join

# split mtcars into two separate tables
# left join — keep ALL rows from car_engine
# fill NA where no match found in car_performance
left_result <- merge(car_engine, car_performance, 
                     by  = "car_name", 
                     all.x = TRUE)
print(left_result)
##             car_name  hp cyl  mpg   wt
## 1         Datsun 710  93   4   NA   NA
## 2         Duster 360 245   8 14.3 3.57
## 3     Hornet 4 Drive 110   6   NA   NA
## 4  Hornet Sportabout 175   8   NA   NA
## 5          Mazda RX4 110   6   NA   NA
## 6      Mazda RX4 Wag 110   6   NA   NA
## 7           Merc 230  95   4 22.8 3.15
## 8          Merc 240D  62   4 24.4 3.19
## 9           Merc 280 123   6 19.2 3.44
## 10           Valiant 105   6 18.1 3.46
cat("Rows returned:", nrow(left_result), "\n")
## Rows returned: 10
#All 10 cars from car_engine are kept. Cars 1 to 5 have NA in the mpg and wt columns because they have no match in car_performance.

5.3. Right join

# right join — keep ALL rows from car_performance
right_result <- merge(car_engine, car_performance, 
                      by    = "car_name", 
                      all.y = TRUE)
print(right_result)
##              car_name  hp cyl  mpg   wt
## 1  Cadillac Fleetwood  NA  NA 10.4 5.25
## 2          Duster 360 245   8 14.3 3.57
## 3            Merc 230  95   4 22.8 3.15
## 4           Merc 240D  62   4 24.4 3.19
## 5            Merc 280 123   6 19.2 3.44
## 6           Merc 280C  NA  NA 17.8 3.44
## 7          Merc 450SE  NA  NA 16.4 4.07
## 8          Merc 450SL  NA  NA 17.3 3.73
## 9         Merc 450SLC  NA  NA 15.2 3.78
## 10            Valiant 105   6 18.1 3.46
cat("Rows returned:", nrow(right_result), "\n")
## Rows returned: 10
#All 10 cars from car_performance are kept. Cars 11 to 15 have NA in hp and cyl because they have no match in car_engine.

5.4. Full join

# full join — keep ALL rows from Both tables
full_result <- merge(car_engine, car_performance, 
                     by  = "car_name", 
                     all = TRUE)
print(full_result)
##              car_name  hp cyl  mpg   wt
## 1  Cadillac Fleetwood  NA  NA 10.4 5.25
## 2          Datsun 710  93   4   NA   NA
## 3          Duster 360 245   8 14.3 3.57
## 4      Hornet 4 Drive 110   6   NA   NA
## 5   Hornet Sportabout 175   8   NA   NA
## 6           Mazda RX4 110   6   NA   NA
## 7       Mazda RX4 Wag 110   6   NA   NA
## 8            Merc 230  95   4 22.8 3.15
## 9           Merc 240D  62   4 24.4 3.19
## 10           Merc 280 123   6 19.2 3.44
## 11          Merc 280C  NA  NA 17.8 3.44
## 12         Merc 450SE  NA  NA 16.4 4.07
## 13         Merc 450SL  NA  NA 17.3 3.73
## 14        Merc 450SLC  NA  NA 15.2 3.78
## 15            Valiant 105   6 18.1 3.46
cat("Rows returned:", nrow(full_result), "\n")
## Rows returned: 15
#All 15 unique cars appear. Cars only in car_engine have NA for performance columns. Cars only in car_performance have NA for engine columns.

5.5. Merge on different column names

# two tables where the key has different names
students <- data.frame(
  student_id = c(1, 2, 3, 4, 5),
  name       = c("Alice", "Bob", "Carol", "David", "Eve")
)

grades <- data.frame(
  id    = c(1, 2, 3, 4, 5),
  grade = c("A", "B", "A", "C", "B"),
  score = c(85, 72, 91, 65, 78)
)

# by.x = key name in first table
# by.y = key name in second table
result <- merge(students, grades, 
                by.x = "student_id", 
                by.y = "id")
print(result)
##   student_id  name grade score
## 1          1 Alice     A    85
## 2          2   Bob     B    72
## 3          3 Carol     A    91
## 4          4 David     C    65
## 5          5   Eve     B    78

6. Application of group_by() and %>% in R

group_by() is a function from the dplyr package that splits a dataset into groups based on one or more columns, allowing you to perform calculations separately for each group. It is almost always used together with summarise(), mutate(), or other dplyr functions. The %>% operator, called the pipe operator, is used to chain multiple operations together in a clean and readable sequence. Instead of nesting functions inside each other, the pipe passes the result of one function directly into the next, making the code read like a series of steps from top to bottom. Together, group_by() and %>% form the backbone of data summarisation and aggregation in R. ## 6.1 Basic syntax

#dataset %>%
#  group_by(column) %>%
#  summarise(result = function(column))

# result = the name you give the new column
# function = replace this with a real function like mean, sum, max etc
library(tidyverse)

iris %>%
  group_by(Species) %>%
  summarise(
    count      = n(),
    average    = mean(Petal.Length),
    total      = sum(Petal.Length),
    highest    = max(Petal.Length),
    lowest     = min(Petal.Length),
    std_dev    = sd(Petal.Length),
    middle     = median(Petal.Length),
    unique_val = n_distinct(Petal.Length)
  )
## # A tibble: 3 × 9
##   Species    count average total highest lowest std_dev middle unique_val
##   <fct>      <int>   <dbl> <dbl>   <dbl>  <dbl>   <dbl>  <dbl>      <int>
## 1 setosa        50    1.46  73.1     1.9    1     0.174   1.5           9
## 2 versicolor    50    4.26 213       5.1    3     0.470   4.35         19
## 3 virginica     50    5.55 278.      6.9    4.5   0.552   5.55         20
#This produces one row per species with all eight statistics computed at once , every one of them replacing what was written as function in the template.

7. How to use trace() and recover()

trace() and recover() in R trace() is used to insert debugging code into a function temporarily without editing the function itself. It lets you monitor when a function is called, what arguments it receives, and what happens inside it.

# add tracing to a function
#trace(what = function_name, tracer = code_to_run)

# remove tracing when done
#untrace(function_name)

7.1 trace()

# trace mean() to see every time it is called
trace(mean)

# now call mean
mean(c(1, 2, 3, 4, 5))
## trace: mean(c(1, 2, 3, 4, 5))
## [1] 3
mean(mtcars$mpg)
## trace: mean(mtcars$mpg)
## [1] 20.09062
# output will show
# trace: mean(c(1,2,3,4,5))
# trace: mean(mtcars$mpg)

# remove trace when done
untrace(mean)

7.2 recover() — interactive debugging

recover() pauses execution at the point of error and lets you inspect the environment of each function in the call stack interactively.

options(error = recover)

process_values <- function(values) {
  results <- c()
  for (i in seq_along(values)) {
    result    <- sqrt(values[[i]])
    results   <- c(results, result)
  }
  return(results)
}

# list with a bad value
my_values <- list(4, 9, -1, 16)

process_values(my_values)
## [1]   2   3 NaN   4

8. Summary Statistics

summary_stats <- function(x, var_name = "Variable") {
  
  # remove missing values
  x_clean <- x[!is.na(x)]
  n_missing <- length(x) - length(x_clean)
  
  # calculate all statistics
  n        <- length(x_clean)
  mean_val <- mean(x_clean)
  med_val  <- median(x_clean)
  sd_val   <- sd(x_clean)
  var_val  <- var(x_clean)
  se_val   <- sd_val / sqrt(n)
  min_val  <- min(x_clean)
  max_val  <- max(x_clean)
  range_val <- max_val - min_val
  q1       <- quantile(x_clean, 0.25)
  q3       <- quantile(x_clean, 0.75)
  iqr_val  <- IQR(x_clean)
  skew_val <- (3 * (mean_val - med_val)) / sd_val
  cv_val   <- (sd_val / mean_val) * 100
  
  # count outliers using IQR method
  lower_fence <- q1 - 1.5 * iqr_val
  upper_fence <- q3 + 1.5 * iqr_val
  n_outliers  <- sum(x_clean < lower_fence | x_clean > upper_fence)
  
  # print results
  cat("╔══════════════════════════════════════╗\n")
  cat("   Summary Statistics:", var_name,        "\n")
  cat("╚══════════════════════════════════════╝\n")
  cat("Sample size        :", n,                 "\n")
  cat("Missing values     :", n_missing,         "\n")
  cat("──────────────────────────────────────\n")
  cat("Mean               :", round(mean_val, 4),"\n")
  cat("Median             :", round(med_val,  4),"\n")
  cat("Std deviation      :", round(sd_val,   4),"\n")
  cat("Variance           :", round(var_val,  4),"\n")
  cat("Std error          :", round(se_val,   4),"\n")
  cat("──────────────────────────────────────\n")
  cat("Minimum            :", round(min_val,  4),"\n")
  cat("Maximum            :", round(max_val,  4),"\n")
  cat("Range              :", round(range_val,4),"\n")
  cat("──────────────────────────────────────\n")
  cat("Q1 (25th pct)      :", round(q1,       4),"\n")
  cat("Q3 (75th pct)      :", round(q3,       4),"\n")
  cat("IQR                :", round(iqr_val,  4),"\n")
  cat("──────────────────────────────────────\n")
  cat("Skewness           :", round(skew_val, 4),"\n")
  cat("Coeff of variation :", round(cv_val,   2), "%\n")
  cat("Outliers (IQR rule):", n_outliers,        "\n")
  cat("Lower fence        :", round(lower_fence, 4), "\n")
  cat("Upper fence        :", round(upper_fence, 4), "\n")
  cat("══════════════════════════════════════\n")
  
  # return as a list silently so values can be stored
  invisible(list(
    n           = n,
    mean        = mean_val,
    median      = med_val,
    sd          = sd_val,
    variance    = var_val,
    se          = se_val,
    min         = min_val,
    max         = max_val,
    range       = range_val,
    q1          = q1,
    q3          = q3,
    iqr         = iqr_val,
    skewness    = skew_val,
    cv          = cv_val,
    n_outliers  = n_outliers,
    lower_fence = lower_fence,
    upper_fence = upper_fence
  ))
}
#Apply it:
# on a simple vector
summary_stats(mtcars$mpg, "Miles per Gallon")
## ╔══════════════════════════════════════╗
##    Summary Statistics: Miles per Gallon 
## ╚══════════════════════════════════════╝
## Sample size        : 32 
## Missing values     : 0 
## ──────────────────────────────────────
## Mean               : 20.0906 
## Median             : 19.2 
## Std deviation      : 6.0269 
## Variance           : 36.3241 
## Std error          : 1.0654 
## ──────────────────────────────────────
## Minimum            : 10.4 
## Maximum            : 33.9 
## Range              : 23.5 
## ──────────────────────────────────────
## Q1 (25th pct)      : 15.425 
## Q3 (75th pct)      : 22.8 
## IQR                : 7.375 
## ──────────────────────────────────────
## Skewness           : 0.4433 
## Coeff of variation : 30 %
## Outliers (IQR rule): 1 
## Lower fence        : 4.3625 
## Upper fence        : 33.8625 
## ══════════════════════════════════════
# on your stock data
summary_stats(zee$Close, "Zee Entertainment Close Price")
## ╔══════════════════════════════════════╗
##    Summary Statistics: Zee Entertainment Close Price 
## ╚══════════════════════════════════════╝
## Sample size        : 5306 
## Missing values     : 0 
## ──────────────────────────────────────
## Mean               : 273.2336 
## Median             : 238.125 
## Std deviation      : 176.0483 
## Variance           : 30993 
## Std error          : 2.4168 
## ──────────────────────────────────────
## Minimum            : 62.3 
## Maximum            : 1541.7 
## Range              : 1479.4 
## ──────────────────────────────────────
## Q1 (25th pct)      : 143.2 
## Q3 (75th pct)      : 345.5875 
## IQR                : 202.3875 
## ──────────────────────────────────────
## Skewness           : 0.5983 
## Coeff of variation : 64.43 %
## Outliers (IQR rule): 84 
## Lower fence        : -160.3813 
## Upper fence        : 649.1688 
## ══════════════════════════════════════
# store the results
results <- summary_stats(iris$Sepal.Length, "Sepal Length")
## ╔══════════════════════════════════════╗
##    Summary Statistics: Sepal Length 
## ╚══════════════════════════════════════╝
## Sample size        : 150 
## Missing values     : 0 
## ──────────────────────────────────────
## Mean               : 5.8433 
## Median             : 5.8 
## Std deviation      : 0.8281 
## Variance           : 0.6857 
## Std error          : 0.0676 
## ──────────────────────────────────────
## Minimum            : 4.3 
## Maximum            : 7.9 
## Range              : 3.6 
## ──────────────────────────────────────
## Q1 (25th pct)      : 5.1 
## Q3 (75th pct)      : 6.4 
## IQR                : 1.3 
## ──────────────────────────────────────
## Skewness           : 0.157 
## Coeff of variation : 14.17 %
## Outliers (IQR rule): 0 
## Lower fence        : 3.15 
## Upper fence        : 8.35 
## ══════════════════════════════════════
results$mean       # access individual values
## [1] 5.843333
results$n_outliers
## [1] 0

###8.1 Two Sample of t-Test

two_sample_ttest <- function(x, y, 
                              group1_name = "Group 1",
                              group2_name = "Group 2",
                              alpha       = 0.05,
                              alternative = "two.sided",
                              var.equal   = FALSE) {
  
  # ── Step 1: clean data ──────────────────────────────
  x_clean <- x[!is.na(x)]
  y_clean <- y[!is.na(y)]
  
  # ── Step 2: descriptive statistics ──────────────────
  n1    <- length(x_clean)
  n2    <- length(y_clean)
  mean1 <- mean(x_clean)
  mean2 <- mean(y_clean)
  sd1   <- sd(x_clean)
  sd2   <- sd(y_clean)
  se1   <- sd1 / sqrt(n1)
  se2   <- sd2 / sqrt(n2)
  
  # ── Step 3: check normality (Shapiro-Wilk) ──────────
  # shapiro test only works for n <= 5000
  if (n1 <= 5000) {
    norm1 <- shapiro.test(x_clean)
  } else {
    norm1 <- list(statistic = NA, p.value = NA)
  }
  if (n2 <= 5000) {
    norm2 <- shapiro.test(y_clean)
  } else {
    norm2 <- list(statistic = NA, p.value = NA)
  }
  
  # ── Step 4: check equality of variances (Levene) ────
  var_test <- var.test(x_clean, y_clean)
  
  # ── Step 5: decide which test to use ────────────────
  if (var_test$p.value < alpha) {
    var.equal <- FALSE   # variances significantly different
    test_type <- "Welch two sample t-test (unequal variances)"
  } else {
    var.equal <- TRUE    # variances not significantly different
    test_type <- "Student two sample t-test (equal variances)"
  }
  
  # ── Step 6: run the t-test ───────────────────────────
  test_result <- t.test(x_clean, y_clean,
                        alternative = alternative,
                        var.equal   = var.equal,
                        conf.level  = 1 - alpha)
  
  # ── Step 7: compute effect size (Cohen's d) ─────────
  pooled_sd <- sqrt(((n1 - 1) * sd1^2 + (n2 - 1) * sd2^2) /
                      (n1 + n2 - 2))
  cohens_d  <- (mean1 - mean2) / pooled_sd
  
  # interpret effect size
  effect_label <- ifelse(abs(cohens_d) < 0.2, "negligible",
                  ifelse(abs(cohens_d) < 0.5, "small",
                  ifelse(abs(cohens_d) < 0.8, "medium", "large")))
  
  # ── Step 8: interpret the result ────────────────────
  decision <- ifelse(test_result$p.value < alpha,
                     "REJECT the null hypothesis",
                     "FAIL TO REJECT the null hypothesis")
  
  conclusion <- ifelse(test_result$p.value < alpha,
    paste("There IS a statistically significant difference between",
          group1_name, "and", group2_name),
    paste("There is NO statistically significant difference between",
          group1_name, "and", group2_name))
  
  # ── Step 9: print full report ───────────────────────
  cat("╔══════════════════════════════════════════╗\n")
  cat("        TWO SAMPLE T-TEST REPORT            \n")
  cat("╚══════════════════════════════════════════╝\n")
  
  cat("\n── STEP 1: Hypotheses ─────────────────────\n")
  if (alternative == "two.sided") {
    cat("H0: mean(", group1_name, ") = mean(", group2_name, ")\n")
    cat("H1: mean(", group1_name, ") ≠ mean(", group2_name, ")\n")
  } else if (alternative == "greater") {
    cat("H0: mean(", group1_name, ") <= mean(", group2_name, ")\n")
    cat("H1: mean(", group1_name, ") >  mean(", group2_name, ")\n")
  } else {
    cat("H0: mean(", group1_name, ") >= mean(", group2_name, ")\n")
    cat("H1: mean(", group1_name, ") <  mean(", group2_name, ")\n")
  }
  cat("Significance level (alpha):", alpha, "\n")
  
  cat("\n── STEP 2: Descriptive Statistics ─────────\n")
  cat(sprintf("%-25s %10s %10s\n", "Statistic", group1_name, group2_name))
  cat(rep("─", 47), "\n", sep = "")
  cat(sprintf("%-25s %10.4f %10.4f\n", "Sample size",    n1,    n2))
  cat(sprintf("%-25s %10.4f %10.4f\n", "Mean",           mean1, mean2))
  cat(sprintf("%-25s %10.4f %10.4f\n", "Std deviation",  sd1,   sd2))
  cat(sprintf("%-25s %10.4f %10.4f\n", "Std error",      se1,   se2))
  
  cat("\n── STEP 3: Normality Test (Shapiro-Wilk) ──\n")
  cat(group1_name, "— W =", round(norm1$statistic, 4),
      ", p =", round(norm1$p.value, 4),
      ifelse(norm1$p.value > alpha, "→ normal", "→ not normal"), "\n")
  cat(group2_name, "— W =", round(norm2$statistic, 4),
      ", p =", round(norm2$p.value, 4),
      ifelse(norm2$p.value > alpha, "→ normal", "→ not normal"), "\n")
  
  cat("\n── STEP 4: Equality of Variances (F-test) ─\n")
  cat("F =", round(var_test$statistic, 4),
      ", p =", round(var_test$p.value, 4),
      ifelse(var_test$p.value > alpha,
             "→ variances are equal",
             "→ variances are NOT equal"), "\n")
  
  cat("\n── STEP 5: Test Selected ───────────────────\n")
  cat(test_type, "\n")
  
  cat("\n── STEP 6: Test Results ────────────────────\n")
  cat("t statistic  :", round(test_result$statistic, 4), "\n")
  cat("Degrees of freedom:", round(test_result$parameter, 4), "\n")
  cat("p-value      :", round(test_result$p.value, 6),   "\n")
  cat("95% CI       : [",
      round(test_result$conf.int[1], 4), ",",
      round(test_result$conf.int[2], 4), "]\n")
  
  cat("\n── STEP 7: Effect Size (Cohen's d) ─────────\n")
  cat("Cohen's d    :", round(cohens_d, 4), "\n")
  cat("Interpretation:", effect_label, "effect\n")
  
  cat("\n── STEP 8: Decision ────────────────────────\n")
  cat(decision, "\n")
  cat(conclusion, "\n")
  cat("══════════════════════════════════════════\n")
  
  # return results invisibly
  invisible(list(
    t_statistic = test_result$statistic,
    df          = test_result$parameter,
    p_value     = test_result$p.value,
    conf_int    = test_result$conf.int,
    cohens_d    = cohens_d,
    effect      = effect_label,
    decision    = decision
  ))
}
# Apply both functions
# compare mpg between 4 and 6 cylinder cars
four_cyl <- mtcars$mpg[mtcars$cyl == 4]
six_cyl  <- mtcars$mpg[mtcars$cyl == 6]

two_sample_ttest(four_cyl, six_cyl,
                 group1_name = "4 cylinder",
                 group2_name = "6 cylinder",
                 alpha       = 0.05)
## ╔══════════════════════════════════════════╗
##         TWO SAMPLE T-TEST REPORT            
## ╚══════════════════════════════════════════╝
## 
## ── STEP 1: Hypotheses ─────────────────────
## H0: mean( 4 cylinder ) = mean( 6 cylinder )
## H1: mean( 4 cylinder ) ≠ mean( 6 cylinder )
## Significance level (alpha): 0.05 
## 
## ── STEP 2: Descriptive Statistics ─────────
## Statistic                 4 cylinder 6 cylinder
## ───────────────────────────────────────────────
## Sample size                  11.0000     7.0000
## Mean                         26.6636    19.7429
## Std deviation                 4.5098     1.4536
## Std error                     1.3598     0.5494
## 
## ── STEP 3: Normality Test (Shapiro-Wilk) ──
## 4 cylinder — W = 0.9124 , p = 0.2606 → normal 
## 6 cylinder — W = 0.899 , p = 0.3252 → normal 
## 
## ── STEP 4: Equality of Variances (F-test) ─
## F = 9.6261 , p = 0.0118 → variances are NOT equal 
## 
## ── STEP 5: Test Selected ───────────────────
## Welch two sample t-test (unequal variances) 
## 
## ── STEP 6: Test Results ────────────────────
## t statistic  : 4.7191 
## Degrees of freedom: 12.956 
## p-value      : 0.000405 
## 95% CI       : [ 3.7514 , 10.0902 ]
## 
## ── STEP 7: Effect Size (Cohen's d) ─────────
## Cohen's d    : 1.8833 
## Interpretation: large effect
## 
## ── STEP 8: Decision ────────────────────────
## REJECT the null hypothesis 
## There IS a statistically significant difference between 4 cylinder and 6 cylinder 
## ══════════════════════════════════════════
# compare sepal length between two iris species
setosa     <- iris$Sepal.Length[iris$Species == "setosa"]
versicolor <- iris$Sepal.Length[iris$Species == "versicolor"]

two_sample_ttest(setosa, versicolor,
                 group1_name = "Setosa",
                 group2_name = "Versicolor",
                 alpha       = 0.05)
## ╔══════════════════════════════════════════╗
##         TWO SAMPLE T-TEST REPORT            
## ╚══════════════════════════════════════════╝
## 
## ── STEP 1: Hypotheses ─────────────────────
## H0: mean( Setosa ) = mean( Versicolor )
## H1: mean( Setosa ) ≠ mean( Versicolor )
## Significance level (alpha): 0.05 
## 
## ── STEP 2: Descriptive Statistics ─────────
## Statistic                     Setosa Versicolor
## ───────────────────────────────────────────────
## Sample size                  50.0000    50.0000
## Mean                          5.0060     5.9360
## Std deviation                 0.3525     0.5162
## Std error                     0.0498     0.0730
## 
## ── STEP 3: Normality Test (Shapiro-Wilk) ──
## Setosa — W = 0.9777 , p = 0.4595 → normal 
## Versicolor — W = 0.9778 , p = 0.4647 → normal 
## 
## ── STEP 4: Equality of Variances (F-test) ─
## F = 0.4663 , p = 0.0087 → variances are NOT equal 
## 
## ── STEP 5: Test Selected ───────────────────
## Welch two sample t-test (unequal variances) 
## 
## ── STEP 6: Test Results ────────────────────
## t statistic  : -10.521 
## Degrees of freedom: 86.538 
## p-value      : 0 
## 95% CI       : [ -1.1057 , -0.7543 ]
## 
## ── STEP 7: Effect Size (Cohen's d) ─────────
## Cohen's d    : -2.1042 
## Interpretation: large effect
## 
## ── STEP 8: Decision ────────────────────────
## REJECT the null hypothesis 
## There IS a statistically significant difference between Setosa and Versicolor 
## ══════════════════════════════════════════
# apply summary stats first then test
summary_stats(four_cyl, "4 Cylinder MPG")
## ╔══════════════════════════════════════╗
##    Summary Statistics: 4 Cylinder MPG 
## ╚══════════════════════════════════════╝
## Sample size        : 11 
## Missing values     : 0 
## ──────────────────────────────────────
## Mean               : 26.6636 
## Median             : 26 
## Std deviation      : 4.5098 
## Variance           : 20.3385 
## Std error          : 1.3598 
## ──────────────────────────────────────
## Minimum            : 21.4 
## Maximum            : 33.9 
## Range              : 12.5 
## ──────────────────────────────────────
## Q1 (25th pct)      : 22.8 
## Q3 (75th pct)      : 30.4 
## IQR                : 7.6 
## ──────────────────────────────────────
## Skewness           : 0.4415 
## Coeff of variation : 16.91 %
## Outliers (IQR rule): 0 
## Lower fence        : 11.4 
## Upper fence        : 41.8 
## ══════════════════════════════════════
summary_stats(six_cyl,  "6 Cylinder MPG")
## ╔══════════════════════════════════════╗
##    Summary Statistics: 6 Cylinder MPG 
## ╚══════════════════════════════════════╝
## Sample size        : 7 
## Missing values     : 0 
## ──────────────────────────────────────
## Mean               : 19.7429 
## Median             : 19.7 
## Std deviation      : 1.4536 
## Variance           : 2.1129 
## Std error          : 0.5494 
## ──────────────────────────────────────
## Minimum            : 17.8 
## Maximum            : 21.4 
## Range              : 3.6 
## ──────────────────────────────────────
## Q1 (25th pct)      : 18.65 
## Q3 (75th pct)      : 21 
## IQR                : 2.35 
## ──────────────────────────────────────
## Skewness           : 0.0885 
## Coeff of variation : 7.36 %
## Outliers (IQR rule): 0 
## Lower fence        : 15.125 
## Upper fence        : 24.525 
## ══════════════════════════════════════
two_sample_ttest(four_cyl, six_cyl,
                 group1_name = "4 Cylinder",
                 group2_name = "6 Cylinder")
## ╔══════════════════════════════════════════╗
##         TWO SAMPLE T-TEST REPORT            
## ╚══════════════════════════════════════════╝
## 
## ── STEP 1: Hypotheses ─────────────────────
## H0: mean( 4 Cylinder ) = mean( 6 Cylinder )
## H1: mean( 4 Cylinder ) ≠ mean( 6 Cylinder )
## Significance level (alpha): 0.05 
## 
## ── STEP 2: Descriptive Statistics ─────────
## Statistic                 4 Cylinder 6 Cylinder
## ───────────────────────────────────────────────
## Sample size                  11.0000     7.0000
## Mean                         26.6636    19.7429
## Std deviation                 4.5098     1.4536
## Std error                     1.3598     0.5494
## 
## ── STEP 3: Normality Test (Shapiro-Wilk) ──
## 4 Cylinder — W = 0.9124 , p = 0.2606 → normal 
## 6 Cylinder — W = 0.899 , p = 0.3252 → normal 
## 
## ── STEP 4: Equality of Variances (F-test) ─
## F = 9.6261 , p = 0.0118 → variances are NOT equal 
## 
## ── STEP 5: Test Selected ───────────────────
## Welch two sample t-test (unequal variances) 
## 
## ── STEP 6: Test Results ────────────────────
## t statistic  : 4.7191 
## Degrees of freedom: 12.956 
## p-value      : 0.000405 
## 95% CI       : [ 3.7514 , 10.0902 ]
## 
## ── STEP 7: Effect Size (Cohen's d) ─────────
## Cohen's d    : 1.8833 
## Interpretation: large effect
## 
## ── STEP 8: Decision ────────────────────────
## REJECT the null hypothesis 
## There IS a statistically significant difference between 4 Cylinder and 6 Cylinder 
## ══════════════════════════════════════════

9. lapply(),sapply(),mapply()

9.1 . lapply() — List Apply

Returns a LIST always regardless of input lapply() stands for list apply. It takes a list or vector and applies a function to every single element in it, always returning the result as a list regardless of what the output looks like. It is best used when you want to perform the same operation on multiple items and you expect the results to have different lengths or types, making a list the safest container to hold them all.

# basic syntax
#lapply(X, FUN)
#apply to a vector
numbers <- list(1, 4, 9, 16, 25)

result <- lapply(numbers, sqrt)
print(result)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 3
## 
## [[4]]
## [1] 4
## 
## [[5]]
## [1] 5
# returns a list of square roots
# X   = vector, list or dataframe
# FUN = function to apply to each element

9.1.2 apply to each column of a dataframe

result <- lapply(mtcars, mean)

print(result)
## $mpg
## [1] 20.09062
## 
## $cyl
## [1] 6.1875
## 
## $disp
## [1] 230.7219
## 
## $hp
## [1] 146.6875
## 
## $drat
## [1] 3.596563
## 
## $wt
## [1] 3.21725
## 
## $qsec
## [1] 17.84875
## 
## $vs
## [1] 0.4375
## 
## $am
## [1] 0.40625
## 
## $gear
## [1] 3.6875
## 
## $carb
## [1] 2.8125
# returns a list of means for each colum

9.1.3. custom function

numbers <- list(10, 20, 30, 40)

result <- lapply(numbers, function(x) {
  squared <- x^2
  return(squared)
})
print(result)
## [[1]]
## [1] 100
## 
## [[2]]
## [1] 400
## 
## [[3]]
## [1] 900
## 
## [[4]]
## [1] 1600

9.2 sapply() — Simplified Apply

Same as lapply but tries to SIMPLIFY the result into a vector or matrix sapply() stands for simplified apply. It works exactly like lapply() under the hood but goes one step further by trying to simplify the result into a vector or matrix instead of a list. If every element of the result is a single number, it returns a numeric vector. If every element has the same length, it returns a matrix. It is best used when you want a clean and simple output that is easy to read and work with directly.

#returns a vector instead of list
numbers <- c(1, 4, 9, 16, 25)

result <- sapply(numbers, sqrt)
print(result)
## [1] 1 2 3 4 5
# returns a named vector not a list
class(result)   # "numeric"
## [1] "numeric"

9.2.1 mean of each column

result <- sapply(mtcars, mean)
print(result)
##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500
# returns a named numeric vector

9.2.3 multiple return values becomes a matrix

result <- sapply(mtcars, function(x) {
  c(mean   = mean(x),
    sd     = sd(x),
    min    = min(x),
    max    = max(x))
})
print(result)
##            mpg      cyl     disp        hp      drat        wt      qsec
## mean 20.090625 6.187500 230.7219 146.68750 3.5965625 3.2172500 17.848750
## sd    6.026948 1.785922 123.9387  68.56287 0.5346787 0.9784574  1.786943
## min  10.400000 4.000000  71.1000  52.00000 2.7600000 1.5130000 14.500000
## max  33.900000 8.000000 472.0000 335.00000 4.9300000 5.4240000 22.900000
##             vs        am      gear   carb
## mean 0.4375000 0.4062500 3.6875000 2.8125
## sd   0.5040161 0.4989909 0.7378041 1.6152
## min  0.0000000 0.0000000 3.0000000 1.0000
## max  1.0000000 1.0000000 5.0000000 8.0000
# returns a 4 x 11 matrix

9.2.4 count missing values per column

result <- sapply(zee, function(x) sum(is.na(x)))
print(result)
##               Date             Symbol             Series         Prev.Close 
##                  0                  0                  0                  0 
##               Open               High                Low               Last 
##                  0                  0                  0                  0 
##              Close               VWAP             Volume           Turnover 
##                  0                  0                  0                  0 
##             Trades Deliverable.Volume       X.Deliverble 
##               2850                519                519
# returns named numeric vector of NA counts

9.3 mapply() — Multivariate Apply

Apply a function to MULTIPLE vectors/lists in parallel — element by element mapply() stands for multivariate apply. Unlike lapply() and sapply() which work on one input at a time, mapply() takes two or more vectors and applies a function to their elements in parallel, meaning it pairs up the first element of each vector, then the second, then the third, and so on. It is best used when your function needs more than one input at the same time, such as computing the percentage change between a previous price vector and a current price vector element by element.

# Example 1 — add two vectors element by element
x <- c(1, 2, 3, 4, 5)
y <- c(10, 20, 30, 40, 50)

result <- mapply(function(a, b) a + b, x, y)
print(result)
## [1] 11 22 33 44 55
# returns c(11, 22, 33, 44, 55)

9.3.1 create personalised messages

names  <- c("Alice", "Bob", "Carol")
scores <- c(85, 92, 78)

result <- mapply(function(name, score) {
  paste(name, "scored", score, "marks")
}, names, scores)

print(result)
##                   Alice                     Bob                   Carol 
## "Alice scored 85 marks"   "Bob scored 92 marks" "Carol scored 78 marks"

9.3.2 compute percentage change between two price vectors

prev_close <- zee$Prev.Close[1:10]
close      <- zee$Close[1:10]

pct_change <- mapply(function(prev, curr) {
  (curr - prev) / prev * 100
}, prev_close, close)

print(round(pct_change, 2))
##  [1]  8.00  6.84 -6.67 -5.19 -8.00 -5.80  5.51  4.30 -3.83 -2.08

9.4 map()

map() is the tidyverse equivalent of lapply() from the purrr package. It applies a function to every element of a list or vector and always returns a list. What makes it more powerful than lapply() is that it comes with a family of variants such as map_dbl() for numeric output, map_chr() for character output, map2() for two inputs, and pmap() for multiple inputs, giving you precise control over what type of output you get. It is best used when you are already working in a tidyverse pipeline with %>% and want consistent and readable code.

#basic map returns list
numbers <- list(1, 4, 9, 16, 25)

result <- map(numbers, sqrt)
print(result)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 3
## 
## [[4]]
## [1] 4
## 
## [[5]]
## [1] 5

###9.4.1 map_chr returns character vector

result <- map_chr(mtcars, function(x) {
  ifelse(mean(x) > 100, "high", "low")
})
print(result)
##    mpg    cyl   disp     hp   drat     wt   qsec     vs     am   gear   carb 
##  "low"  "low" "high" "high"  "low"  "low"  "low"  "low"  "low"  "low"  "low"

9.4.2 map2 for two inputs

names  <- list("Alice", "Bob", "Carol")
scores <- list(85, 92, 78)

result <- map2(names, scores, function(name, score) {
  paste(name, "scored", score)
})
print(result)
## [[1]]
## [1] "Alice scored 85"
## 
## [[2]]
## [1] "Bob scored 92"
## 
## [[3]]
## [1] "Carol scored 78"

9.4.3 generate random samples with different parameters

#pmap for multiple inputs
params <- list(
  mean = c(0,  5,  10),
  sd   = c(1,  2,  3),
  n    = c(10, 20, 30)
)
result <- pmap(params, function(mean, sd, n) {
  rnorm(n, mean = mean, sd = sd)
})
print(length(result))    # 3 samples
## [1] 3
print(length(result[[1]]))  # 10 values
## [1] 10

10. geom_bar, geom_boxplot,geom_smooth

10.1 geom_bar()

Used to display counts or values as bars. Two modes — counting automatically or using your own values.

# counts how many cars have each number of cylinders
ggplot(mtcars, aes(x = factor(cyl))) +
  geom_bar(fill = "steelblue", color = "white") +
  labs(title = "Number of cars by cylinder count",
       x = "Cylinders", y = "Count") +
  theme_minimal()

### 10.1.1 compute average mpg per cylinder first

avg_mpg <- mtcars %>%
  group_by(cyl) %>%
  summarise(avg = mean(mpg))

# then plot the values
ggplot(avg_mpg, aes(x = factor(cyl), y = avg)) +
  geom_bar(stat = "identity", fill = "green", color = "white") +
  labs(title = "Average MPG by cylinder count",
       x = "Cylinders", y = "Average MPG") +
  theme_minimal()

### 10.2 geom_boxplot() Used to display the distribution of a numeric variable showing median, quartiles and outliers. ┌─────────────────────┐ │ whisker top │ ← Q3 + 1.5 × IQR ├─────────────────────┤ ← Q3 (75th percentile) │ │ │ THE BOX │ │ │ ├─────────────────────┤ ← median (50th percentile) │ │ ├─────────────────────┤ ← Q1 (25th percentile) │ whisker bottom │ ← Q1 - 1.5 × IQR

    ●  = outliers (points beyond the whiskers)
ggplot(mtcars, aes(y = mpg)) +
  geom_boxplot(fill = "cyan", color = "navy") +
  labs(title = "Distribution of MPG",
       y = "Miles per Gallon") +
  theme_minimal()

### 10.2.1 Boxplot by group

ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_boxplot(fill = "yellow", color = "navy") +
  labs(title = "MPG distribution by cylinder count",
       x = "Cylinders", y = "MPG") +
  theme_minimal()

### 10.2.2 Boxplot with notch — shows confidence interval around median

ggplot(mtcars, aes(x = factor(cyl), y = mpg, fill = factor(cyl))) +
  geom_boxplot(notch = FALSE) +
  labs(title = "MPG by cylinders with notched median CI",
       x = "Cylinders", y = "MPG") +
  theme_minimal()

### 10.3 geom_smooth() Used to add a trend line or smoothed curve to a scatter plot to show the overall pattern in the data.

#Basic smooth line
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point(color = "red") +
  geom_smooth() +
  labs(title = "MPG vs weight with smooth trend",
       x = "Weight", y = "MPG") +
  theme_minimal()

### 10.3.1 Linear regression line — method lm

ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point(color = "steelblue") +
  geom_smooth(method = "lm", color = "red",
              se = TRUE) +
  labs(title = "MPG vs weight — linear regression line",
       x = "Weight", y = "MPG") +
  theme_minimal()

### 10.3.2 LOESS smooth — default, captures non-linear patterns

ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "loess", color = "darkgreen",
              fill  = "lightgreen", alpha = 0.3) +
  labs(title = "MPG vs horsepower — LOESS smooth",
       x = "Horsepower", y = "MPG") +
  theme_minimal()