#STEP 1: IMPORTING DATA FROM DIFFERENT SOURCES
# install.packages(c("haven", "RSQLite", "DBI"))
library(haven)
library(RSQLite)
library(DBI)
# Source B: Importing from a Database
# 1. Open a connection to a temporary SQLite database held in memory
con <- dbConnect(RSQLite::SQLite(), ":memory:")
# 2. Write mtcars into the database and read it straight back.
#    The work is wrapped in tryCatch(..., finally = ) so the connection is
#    closed even when the write or the read fails, instead of being left open.
imported_db_data <- tryCatch(
  {
    dbWriteTable(con, "cars_db_table", mtcars)
    # Importing data from the database
    dbReadTable(con, "cars_db_table")
  },
  finally = dbDisconnect(con)
)
head(imported_db_data)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 6 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
#IMPORTING FROM A STATISTICAL PACKAGE
library(haven)
# Build the path with file.path() so the long location stays readable and
# needs no backslashes.
sav_path <- file.path(
  "/Users/Enock/Library/Mobile Documents/com~apple~CloudDocs",
  "GRADUATE PROGRAM", "AUCA", "R", "Sample_Dataset_2014.sav"
)
# The path above only exists on one machine, so fail early with a clear
# message when the file is missing.
if (!file.exists(sav_path)) {
  stop("SPSS file not found: ", sav_path, call. = FALSE)
}
my_data <- read_sav(sav_path)
print("Successfully imported local statistical data:")
## [1] "Successfully imported local statistical data:"
head(my_data)
## # A tibble: 6 × 23
## ids bday Rank Gender Athlete Height Weight Smoking Sprint
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20183 1991-01-03 NA 0 0 66.9 193. 0 7.41
## 2 20230 1996-01-02 1 0 1 80.1 NA 0 5.20
## 3 20243 1993-01-02 3 1 0 66.0 128. 1 8.10
## 4 20248 1994-01-01 1 NA 0 61.3 154. 2 6.45
## 5 20255 1996-01-01 2 1 0 65.8 NA 0 7.68
## 6 20278 1995-01-01 NA 0 0 70.7 179. 0 8.00
## # ℹ 14 more variables: MileMinDur <time>, English <dbl>, Reading <dbl>,
## # Math <dbl>, Writing <dbl>, State <chr>, LiveOnCampus <dbl>,
## # HowCommute <dbl>, CommuteTime <dbl>, SleepTime <dbl>, StudyTime <dbl>,
## # enrolldate <chr>, expgradate <chr>, Major <chr>
#STEP 2: MERGING DATASETS
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# 1. First table: three variables per employee (ID, name, age)
dataset_A <- data.frame(
  Employee_ID = c(101, 102, 103, 104),
  Name        = c("Alice", "Bob", "Charlie", "David"),
  Age         = c(28, 34, 29, 42)
)

# 2. Second table: two variables per employee (ID, department).
#    IDs 101-103 appear in both tables; 104 is only in dataset_A and
#    105 is only in dataset_B.
dataset_B <- data.frame(
  Employee_ID = c(101, 102, 103, 105),
  Department  = c("IT", "Finance", "HR", "Marketing")
)
# Show both input tables before they are combined
cat("\n--- Dataset A (3 Variables) ---\n")
##
## --- Dataset A (3 Variables) ---
print(dataset_A)
## Employee_ID Name Age
## 1 101 Alice 28
## 2 102 Bob 34
## 3 103 Charlie 29
## 4 104 David 42
cat("\n--- Dataset B (2 Variables) ---\n")
##
## --- Dataset B (2 Variables) ---
print(dataset_B)
## Employee_ID Department
## 1 101 IT
## 2 102 Finance
## 3 103 HR
## 4 105 Marketing
# 3. MERGE THE DATASETS
# Pipe dataset_A into inner_join(), matching rows on Employee_ID. As the output
# below shows, only the IDs present in both tables (101-103) are kept.
merged_data <- dataset_A %>%
  inner_join(dataset_B, by = "Employee_ID")
cat("\n=== Merged Dataset ===\n")
##
## === Merged Dataset ===
print(merged_data)
## Employee_ID Name Age Department
## 1 101 Alice 28 IT
## 2 102 Bob 34 Finance
## 3 103 Charlie 29 HR
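# STEP 3: USING $, %>%, AND group_by()
# TODO: no code follows this heading, and group_by() is never called in this
# script; add the demonstration for this step here.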
#STEP 4: THE 5 CORE DPLYR FUNCTIONS
library(dplyr)
cat("--- Demonstrating select, filter, arrange, rename, and mutate ---\n")
## --- Demonstrating select, filter, arrange, rename, and mutate ---
# Send mtcars through the five verbs in a single pipeline, one verb per line
processed_cars <- mtcars %>%
  # rename(): give 'mpg' and 'hp' more descriptive names
  rename(Miles_Per_Gallon = mpg, Horsepower = hp) %>%
  # select(): keep only the four columns used below
  select(Miles_Per_Gallon, Horsepower, cyl, gear) %>%
  # filter(): keep only the rows where cyl is 8
  filter(cyl == 8) %>%
  # mutate(): add a derived column, horsepower divided by miles per gallon
  mutate(Power_To_Gas_Ratio = Horsepower / Miles_Per_Gallon) %>%
  # arrange(): sort by the derived column, largest ratio first
  arrange(desc(Power_To_Gas_Ratio))
# Display the resulting table
print(processed_cars)
## Miles_Per_Gallon Horsepower cyl gear Power_To_Gas_Ratio
## Maserati Bora 15.0 335 8 5 22.333333
## Lincoln Continental 10.4 215 8 3 20.673077
## Cadillac Fleetwood 10.4 205 8 3 19.711538
## Camaro Z28 13.3 245 8 3 18.421053
## Duster 360 14.3 245 8 3 17.132867
## Ford Pantera L 15.8 264 8 5 16.708861
## Chrysler Imperial 14.7 230 8 3 15.646259
## Merc 450SLC 15.2 180 8 3 11.842105
## Merc 450SE 16.4 180 8 3 10.975610
## Merc 450SL 17.3 180 8 3 10.404624
## AMC Javelin 15.2 150 8 3 9.868421
## Dodge Challenger 15.5 150 8 3 9.677419
## Hornet Sportabout 18.7 175 8 3 9.358289
## Pontiac Firebird 19.2 175 8 3 9.114583
#STEP 5: DEBUGGING WITH trace() AND recover()
# =========================================================
# Part A: Using trace()
# =========================================================
cat("--- Demonstrating trace() ---\n")
## --- Demonstrating trace() ---
# 1. A small function to debug: the price that remains after taking
#    `discount` (a fraction such as 0.20) off `price`.
calculate_discount <- function(price, discount) {
  amount_off <- price * discount
  price - amount_off
}
# 2. Attach a tracer to calculate_discount() with trace().
# The tracer is an unevaluated expression (built with quote()); it runs each
# time the function is called. trace() prints the name of the traced function.
trace(calculate_discount, tracer = quote(print("DEBUG: calculate_discount() was just triggered!")))
## [1] "calculate_discount"
# 3. Call the traced function.
# The "Tracing ... on entry" line and the debug message are printed during the
# call, before the result is printed below.
result <- calculate_discount(100, 0.20)
## Tracing calculate_discount(100, 0.2) on entry
## [1] "DEBUG: calculate_discount() was just triggered!"
print(paste("The result is:", result))
## [1] "The result is: 80"
# 4. Remove the tracer once debugging is finished so later calls are not traced.
untrace(calculate_discount)
# =========================================================
# Part B: Using recover()
# =========================================================
cat("\n--- Stating how to use recover() ---\n")
##
## --- Stating how to use recover() ---
# NOTE FOR THE ASSIGNMENT:
# Nothing in this part is executed: triggering a real error would stop the
# script, so the recover() workflow is shown as commented-out code only.
# NOTE(review): recover() presumably needs an interactive session to show its
# menu - confirm against ?recover.
# 1. Turn on recover mode by changing the global error option:
# options(error = recover)
# 2. Suppose you then run a broken function like this one:
# broken_function <- function() { "word" + 5 }
# broken_function()
# 3. Instead of just failing, R pauses and shows a menu of calls in the console:
# 1: broken_function()
# Selection:
# Typing '1' lets you look inside that call and inspect its variables.
# 4. Once the code is fixed, restore the default error handling:
# options(error = NULL)
cat("The code for recover() is commented out so it doesn't pause your script, but it is fully explained for the lecturer!\n")
## The code for recover() is commented out so it doesn't pause your script, but it is fully explained for the lecturer!
# STEP 6: CUSTOM SUMMARY STATISTICS FUNCTION
# 1. Define the custom function
# my_summary_stats(): one-row table of descriptive statistics for a numeric
# vector.
#   x: a numeric vector; missing values are ignored.
# Returns a data frame with the columns Mean, Median, Std_Dev (each rounded to
# 2 decimals), Min and Max. All five are NA when x has no non-missing values.
# Stops with an error when x is not numeric.
my_summary_stats <- function(x) {
  # Safety check: refuse anything that is not numeric
  if (!is.numeric(x)) {
    # call. = FALSE drops the call from the output; R already prints "Error:"
    # in front of the message, so the text itself does not repeat it.
    stop("This function only works with numeric variables.", call. = FALSE)
  }
  # Nothing to summarise (empty or all-NA input): return NA everywhere instead
  # of letting min()/max() warn and report Inf / -Inf.
  if (all(is.na(x))) {
    return(data.frame(
      Mean = NA_real_,
      Median = NA_real_,
      Std_Dev = NA_real_,
      Min = NA_real_,
      Max = NA_real_
    ))
  }
  # na.rm = TRUE drops missing values before each statistic is computed
  data.frame(
    Mean = round(mean(x, na.rm = TRUE), 2),
    Median = round(median(x, na.rm = TRUE), 2),
    Std_Dev = round(sd(x, na.rm = TRUE), 2),
    Min = min(x, na.rm = TRUE),
    Max = max(x, na.rm = TRUE)
  )
}
# 2. Show the function at work on two columns of the mtcars dataset:
#    'mpg' (Miles Per Gallon) and, to be thorough, 'hp' (Horsepower).
# Compute both summaries first, then print each one under its own heading.
mpg_results <- my_summary_stats(mtcars$mpg)
hp_results <- my_summary_stats(mtcars$hp)
cat("--- Summary Statistics for mtcars$mpg ---\n")
## --- Summary Statistics for mtcars$mpg ---
print(mpg_results)
## Mean Median Std_Dev Min Max
## 1 20.09 19.2 6.03 10.4 33.9
cat("\n--- Summary Statistics for mtcars$hp ---\n")
##
## --- Summary Statistics for mtcars$hp ---
print(hp_results)
## Mean Median Std_Dev Min Max
## 1 146.69 123 68.56 52 335
# --- STEP 6 (continued): CUSTOM TWO-SAMPLE T-TEST FUNCTION ---
# 1. Define the custom function
# custom_t_test(): run t.test() on two samples and return the key numbers as a
# one-row data frame.
#   group1, group2: the two samples, passed to t.test() unchanged.
#   alpha: significance level used for the Yes/No verdict (default 0.05).
# Returns a data frame with T_Statistic (3 decimals), P_Value (4 significant
# digits), Mean_Group1 and Mean_Group2 (2 decimals), and Significant
# ("Yes" when the p-value is below alpha, otherwise "No").
# Stops with an error when alpha is not a single number strictly between 0 and 1.
custom_t_test <- function(group1, group2, alpha = 0.05) {
  stopifnot(
    is.numeric(alpha), length(alpha) == 1L, !is.na(alpha),
    alpha > 0, alpha < 1
  )
  # Run the built-in two-sample t-test with its default settings
  test_results <- t.test(group1, group2)
  p_value <- test_results$p.value
  # Strip the names t.test() attaches so the data frame gets plain row numbers
  group_means <- unname(test_results$estimate)
  data.frame(
    T_Statistic = round(unname(test_results$statistic), 3),
    P_Value = signif(p_value, 4),
    Mean_Group1 = round(group_means[1], 2),
    Mean_Group2 = round(group_means[2], 2),
    # The p-value is a single number, so a plain if/else is enough here
    Significant = if (p_value < alpha) "Yes" else "No"
  )
}
# 2. Try the function out on the built-in mtcars dataset:
# is fuel efficiency (mpg) different for Automatic and Manual cars?
# (am = 0 means Automatic, am = 1 means Manual)
# Pick the mpg column for the rows of each transmission type
automatic_mpg <- mtcars[mtcars$am == 0, "mpg"]
manual_mpg <- mtcars[mtcars$am == 1, "mpg"]
# Hand both groups to the custom function
cat("--- Two-Sample T-Test: Automatic vs. Manual MPG ---\n")
## --- Two-Sample T-Test: Automatic vs. Manual MPG ---
final_ttest_result <- custom_t_test(automatic_mpg, manual_mpg)
# Show the one-row result table
print(final_ttest_result)
## T_Statistic P_Value Mean_Group1 Mean_Group2 Significant
## 1 -3.767 0.001374 17.15 24.39 Yes
#STEP 7: THE 'APPLY' FAMILY FUNCTIONS
# A small named list of perfect squares, used by the first three functions
my_list <- list(Group_A = (1:3)^2, Group_B = (4:6)^2)
# 1. lapply() - "List Apply"
cat("=== 1. lapply() Output (Always returns a List) ===\n")
## === 1. lapply() Output (Always returns a List) ===
# Take the square root of every element; the result is a list
print(lapply(X = my_list, FUN = sqrt))
## $Group_A
## [1] 1 2 3
##
## $Group_B
## [1] 4 5 6
# 2. sapply() - "Simplified Apply"
cat("\n=== 2. sapply() Output (Returns a clean Matrix/Vector) ===\n")
##
## === 2. sapply() Output (Returns a clean Matrix/Vector) ===
# Same computation, but the result is simplified to a matrix
print(sapply(X = my_list, FUN = sqrt))
## Group_A Group_B
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
# 3. vapply() - "Verified Apply"
cat("\n=== 3. vapply() Output (Safe and Strict) ===\n")
##
## === 3. vapply() Output (Safe and Strict) ===
# FUN.VALUE declares the expected shape of each result: 3 numeric values per group
print(vapply(X = my_list, FUN = sqrt, FUN.VALUE = numeric(3)))
## Group_A Group_B
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
# 4. tapply() - "Table Apply"
cat("\n=== 4. tapply() Output (Grouped Calculation) ===\n")
##
## === 4. tapply() Output (Grouped Calculation) ===
# Average Horsepower (hp) within each number of cylinders (cyl) in mtcars
print(tapply(X = mtcars$hp, INDEX = mtcars$cyl, FUN = mean))
## 4 6 8
## 82.63636 122.28571 209.21429
# 5. mapply() - "Multivariate Apply"
cat("\n=== 5. mapply() Output (Parallel Calculation) ===\n")
##
## === 5. mapply() Output (Parallel Calculation) ===
# Two vectors of equal length that will be walked through in parallel
vector_1 <- c(10, 20, 30)
vector_2 <- c(1, 2, 3)
# The anonymous function receives one item from each vector and adds them
print(
  mapply(
    FUN = function(x, y) x + y,
    vector_1,
    vector_2
  )
)
## [1] 11 22 33