#STEP 1: IMPORTING DATA FROM DIFFERENT SOURCES

# install.packages(c("haven", "RSQLite", "DBI", "dplyr"))

library(haven)
library(RSQLite)
library(DBI)

# Source B: Importing from a Database
# Open a throwaway SQLite database held in memory (":memory:").
con <- DBI::dbConnect(drv = RSQLite::SQLite(), dbname = ":memory:")

# Store the built-in mtcars data frame as a table named "cars_db_table".
DBI::dbWriteTable(conn = con, name = "cars_db_table", value = mtcars)

# Read the whole table back out of the database into a data frame.
imported_db_data <- DBI::dbReadTable(conn = con, name = "cars_db_table")

# Release the connection now that the data has been copied into R.
DBI::dbDisconnect(conn = con)

# Preview the first rows of what came back from the database.
head(imported_db_data)
##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 6 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
#IMPORTING FROM A STATISTICAL PACKAGE
library(haven)

# The location is written with forward slashes, so no backslash escaping is
# needed. NOTE(review): this is an absolute path on one specific machine.
my_data <- haven::read_sav(
  file = "/Users/Enock/Library/Mobile Documents/com~apple~CloudDocs/GRADUATE PROGRAM/AUCA/R/Sample_Dataset_2014.sav"
)

# Only reached when read_sav() above did not raise an error.
print("Successfully imported local statistical data:")
## [1] "Successfully imported local statistical data:"
# Preview the first rows of the imported SPSS data.
head(my_data)
## # A tibble: 6 × 23
##     ids bday        Rank Gender Athlete Height Weight Smoking Sprint
##   <dbl> <date>     <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl>  <dbl>
## 1 20183 1991-01-03    NA      0       0   66.9   193.       0   7.41
## 2 20230 1996-01-02     1      0       1   80.1    NA        0   5.20
## 3 20243 1993-01-02     3      1       0   66.0   128.       1   8.10
## 4 20248 1994-01-01     1     NA       0   61.3   154.       2   6.45
## 5 20255 1996-01-01     2      1       0   65.8    NA        0   7.68
## 6 20278 1995-01-01    NA      0       0   70.7   179.       0   8.00
## # ℹ 14 more variables: MileMinDur <time>, English <dbl>, Reading <dbl>,
## #   Math <dbl>, Writing <dbl>, State <chr>, LiveOnCampus <dbl>,
## #   HowCommute <dbl>, CommuteTime <dbl>, SleepTime <dbl>, StudyTime <dbl>,
## #   enrolldate <chr>, expgradate <chr>, Major <chr>
#STEP 2: MERGING DATASETS

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Table A: one row per employee with three variables (ID, name, age).
# The columns are built inside local() so the helper vectors do not leak
# into the global environment.
dataset_A <- local({
  ids <- c(101, 102, 103, 104)
  first_names <- c("Alice", "Bob", "Charlie", "David")
  ages <- c(28, 34, 29, 42)
  data.frame(Employee_ID = ids, Name = first_names, Age = ages)
})

# Table B: two variables (ID, department). ID 105 appears only here and
# ID 104 only in table A, so the two tables do not match on every key.
dataset_B <- local({
  ids <- c(101, 102, 103, 105)
  departments <- c("IT", "Finance", "HR", "Marketing")
  data.frame(Employee_ID = ids, Department = departments)
})

# Show both inputs first so the effect of the join is easy to see.
cat("\n--- Dataset A (3 Variables) ---\n")
## 
## --- Dataset A (3 Variables) ---
print(dataset_A)
##   Employee_ID    Name Age
## 1         101   Alice  28
## 2         102     Bob  34
## 3         103 Charlie  29
## 4         104   David  42
cat("\n--- Dataset B (2 Variables) ---\n")
## 
## --- Dataset B (2 Variables) ---
print(dataset_B)
##   Employee_ID Department
## 1         101         IT
## 2         102    Finance
## 3         103         HR
## 4         105  Marketing
# 3. MERGE THE DATASETS
# An inner join keeps only the Employee_ID values present in BOTH tables,
# which is why 104 and 105 are absent from the result below.
merged_data <- dataset_A %>%
  inner_join(dataset_B, by = "Employee_ID")

cat("\n=== Merged Dataset ===\n")
## 
## === Merged Dataset ===
print(merged_data)
##   Employee_ID    Name Age Department
## 1         101   Alice  28         IT
## 2         102     Bob  34    Finance
## 3         103 Charlie  29         HR
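# STEP 3: USING $, %>%, AND group_by()
# NOTE(review): no code is given for this step. In the script as shown, `%>%`
# is exercised in Step 4 and `$` in Steps 6 and 7, but group_by() is never
# demonstrated -- add an example here.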


#STEP 4: THE 5 CORE DPLYR FUNCTIONS
library(dplyr)
cat("--- Demonstrating select, filter, arrange, rename, and mutate ---\n")
## --- Demonstrating select, filter, arrange, rename, and mutate ---
# Send mtcars through the five core dplyr verbs in one pipeline.
processed_cars <- mtcars %>%
  # rename(): give 'mpg' and 'hp' more descriptive column names
  dplyr::rename(Miles_Per_Gallon = mpg, Horsepower = hp) %>%
  # select(): keep just the four columns used below
  dplyr::select(Miles_Per_Gallon, Horsepower, cyl, gear) %>%
  # filter(): retain only the 8-cylinder cars
  dplyr::filter(cyl == 8) %>%
  # mutate(): derive horsepower per mile-per-gallon as a new column
  dplyr::mutate(Power_To_Gas_Ratio = Horsepower / Miles_Per_Gallon) %>%
  # arrange(): order rows by the derived ratio, largest first
  dplyr::arrange(dplyr::desc(Power_To_Gas_Ratio))

# Display the transformed table.
print(processed_cars)
##                     Miles_Per_Gallon Horsepower cyl gear Power_To_Gas_Ratio
## Maserati Bora                   15.0        335   8    5          22.333333
## Lincoln Continental             10.4        215   8    3          20.673077
## Cadillac Fleetwood              10.4        205   8    3          19.711538
## Camaro Z28                      13.3        245   8    3          18.421053
## Duster 360                      14.3        245   8    3          17.132867
## Ford Pantera L                  15.8        264   8    5          16.708861
## Chrysler Imperial               14.7        230   8    3          15.646259
## Merc 450SLC                     15.2        180   8    3          11.842105
## Merc 450SE                      16.4        180   8    3          10.975610
## Merc 450SL                      17.3        180   8    3          10.404624
## AMC Javelin                     15.2        150   8    3           9.868421
## Dodge Challenger                15.5        150   8    3           9.677419
## Hornet Sportabout               18.7        175   8    3           9.358289
## Pontiac Firebird                19.2        175   8    3           9.114583
#STEP 5: DEBUGGING WITH trace() AND recover()

# =========================================================
# Part A: Using trace()
# =========================================================
cat("--- Demonstrating trace() ---\n")
## --- Demonstrating trace() ---
# calculate_discount(): price remaining after a fractional discount.
#   price    - the original price.
#   discount - fraction taken off (the demo below passes 0.20 for 20%).
# Returns price minus (price * discount).
calculate_discount <- function(price, discount) {
  reduction <- price * discount
  price - reduction
}

# 2. Attach a tracer to the function: the quoted expression is evaluated
# every time calculate_discount() is entered.
trace(
  what = calculate_discount,
  tracer = quote(print("DEBUG: calculate_discount() was just triggered!"))
)
## [1] "calculate_discount"
# 3. Call the traced function; the tracer's message is printed on entry,
# before the result is assigned.
result <- calculate_discount(100, 0.20)
## Tracing calculate_discount(100, 0.2) on entry 
## [1] "DEBUG: calculate_discount() was just triggered!"
print(paste("The result is:", result))
## [1] "The result is: 80"
# 4. Remove the tracer again so later calls run without the debug output.
untrace(what = calculate_discount)


# =========================================================
# Part B: Using recover()
# =========================================================
cat("\n--- Stating how to use recover() ---\n")
## 
## --- Stating how to use recover() ---
# NOTE FOR THE ASSIGNMENT:
# recover() is interactive, so the example below is left commented out to keep
# this script running from top to bottom without pausing on an error.

# 1. Turn on recover mode by setting the global error handler:
# options(error = recover)

# 2. Then run something that fails, for example:
# broken_function <- function() { "word" + 5 }
# broken_function()

# 3. Instead of just stopping, R pauses and lists the active calls as a menu:
# Enter a frame number, or 0 to exit
#
# 1: broken_function()
#
# Selection:

# Typing '1' opens a browser inside that call so its variables can be
# inspected; typing '0' leaves the menu.

# 4. When debugging is finished, restore the default error handling:
# options(error = NULL)

cat("The code for recover() is commented out so it doesn't pause your script, but it is fully explained for the lecturer!\n")
## The code for recover() is commented out so it doesn't pause your script, but it is fully explained for the lecturer!
# STEP 6: CUSTOM SUMMARY STATISTICS FUNCTION

# 1. Define the custom function
#
# my_summary_stats(): one-row table of descriptive statistics for a numeric
# vector, ignoring missing values.
#
# Arguments:
#   x - a numeric vector (integer or double); NA values are dropped.
#
# Returns a data frame with one row and the columns Mean, Median and Std_Dev
# (each rounded to 2 decimals) plus Min and Max (unrounded). When x has no
# non-missing values, every column is NA.
#
# Stops with an error when x is not numeric.
my_summary_stats <- function(x) {

  # Reject non-numeric input up front with a clear message.
  if (!is.numeric(x)) {
    stop("Error: This function only works with numeric variables.")
  }

  # With nothing to summarise (empty or all-NA input), min()/max() would warn
  # and return Inf/-Inf. Return an explicit all-NA row instead.
  # all() of an empty logical vector is TRUE, so this also covers length 0.
  if (all(is.na(x))) {
    return(data.frame(
      Mean = NA_real_,
      Median = NA_real_,
      Std_Dev = NA_real_,
      Min = NA_real_,
      Max = NA_real_
    ))
  }

  # na.rm = TRUE drops missing values before each statistic is computed.
  # Std_Dev is NA when only one non-missing value is present.
  data.frame(
    Mean = round(mean(x, na.rm = TRUE), 2),
    Median = round(median(x, na.rm = TRUE), 2),
    Std_Dev = round(sd(x, na.rm = TRUE), 2),
    Min = min(x, na.rm = TRUE),
    Max = max(x, na.rm = TRUE)
  )
}

# 2. Demonstrate the helper on the 'mpg' (miles per gallon) column of mtcars.
cat("--- Summary Statistics for mtcars$mpg ---\n")
## --- Summary Statistics for mtcars$mpg ---
mpg_results <- my_summary_stats(mtcars[["mpg"]])
print(mpg_results)
##    Mean Median Std_Dev  Min  Max
## 1 20.09   19.2    6.03 10.4 33.9
# Second demonstration: the 'hp' (horsepower) column.
cat("\n--- Summary Statistics for mtcars$hp ---\n")
## 
## --- Summary Statistics for mtcars$hp ---
hp_results <- my_summary_stats(x = mtcars[["hp"]])
print(hp_results)
##     Mean Median Std_Dev Min Max
## 1 146.69    123   68.56  52 335
# --- STEP 6: CUSTOM TWO-SAMPLE T-TEST FUNCTION ---

# 1. Define the custom function
#
# custom_t_test(): run a two-sample t-test and report it as a one-row table.
#
# Arguments:
#   group1, group2 - the two samples to compare; passed straight to t.test().
#   alpha          - significance threshold for the "Significant" column.
#                    Defaults to 0.05, the value that used to be hard-coded,
#                    so existing two-argument calls behave as before.
#
# Returns a data frame with one row and the columns T_Statistic (3 decimals),
# P_Value (4 significant digits), Mean_Group1, Mean_Group2 (2 decimals) and
# Significant ("Yes" when the unrounded p-value is below alpha, else "No").
#
# Stops with an error when alpha is not a single number strictly between 0
# and 1; errors raised by t.test() itself are passed through unchanged.
custom_t_test <- function(group1, group2, alpha = 0.05) {

  stopifnot(
    is.numeric(alpha),
    length(alpha) == 1,
    !is.na(alpha),
    alpha > 0,
    alpha < 1
  )

  # t.test() uses var.equal = FALSE by default, i.e. the Welch test.
  test_results <- t.test(group1, group2)
  p_value <- test_results$p.value

  # p_value is a single number, so plain if/else is used instead of the
  # vectorised ifelse(). An NA p-value is reported as NA, not as an error.
  significant <- if (is.na(p_value)) {
    NA_character_
  } else if (p_value < alpha) {
    "Yes"
  } else {
    "No"
  }

  # statistic and estimate are named vectors; unname() keeps those names from
  # becoming row names, so the result has the default row name.
  data.frame(
    T_Statistic = round(unname(test_results$statistic), 3),
    P_Value = signif(p_value, 4),
    Mean_Group1 = round(unname(test_results$estimate[1]), 2),
    Mean_Group2 = round(unname(test_results$estimate[2]), 2),
    Significant = significant
  )
}

# 2. Apply it to show that it works!
# Let's compare the fuel efficiency (mpg) of Automatic vs. Manual cars 
# using the built-in mtcars dataset. (am = 0 means Automatic, am = 1 means Manual)

# Pull out the mpg values for each value of the 'am' indicator column.
automatic_mpg <- with(mtcars, mpg[am == 0])
manual_mpg <- with(mtcars, mpg[am == 1])

# Announce the comparison, then run the custom test on the two groups.
cat("--- Two-Sample T-Test: Automatic vs. Manual MPG ---\n")
## --- Two-Sample T-Test: Automatic vs. Manual MPG ---
final_ttest_result <- custom_t_test(
  group1 = automatic_mpg,
  group2 = manual_mpg
)

# Show the one-row result table.
print(final_ttest_result)
##   T_Statistic  P_Value Mean_Group1 Mean_Group2 Significant
## 1      -3.767 0.001374       17.15       24.39         Yes
#STEP 7: THE 'APPLY' FAMILY FUNCTIONS

# Two named groups of perfect squares, used as input for the first three
# apply-family demonstrations.
my_list <- list(
  Group_A = c(1, 4, 9),
  Group_B = c(16, 25, 36)
)

# 1. lapply() - "List Apply"
cat("=== 1. lapply() Output (Always returns a List) ===\n")
## === 1. lapply() Output (Always returns a List) ===
# Take the square root of every group; the result keeps the list structure.
print(lapply(X = my_list, FUN = sqrt))
## $Group_A
## [1] 1 2 3
## 
## $Group_B
## [1] 4 5 6
# 2. sapply() - "Simplified Apply"
cat("\n=== 2. sapply() Output (Returns a clean Matrix/Vector) ===\n")
## 
## === 2. sapply() Output (Returns a clean Matrix/Vector) ===
# Same square-root call as before, but the equal-length results are
# simplified into a matrix with one column per group.
print(sapply(X = my_list, FUN = sqrt))
##      Group_A Group_B
## [1,]       1       4
## [2,]       2       5
## [3,]       3       6
# 3. vapply() - "Verified Apply"
cat("\n=== 3. vapply() Output (Safe and Strict) ===\n")
## 
## === 3. vapply() Output (Safe and Strict) ===
# FUN.VALUE is required: numeric(3) declares that each group must yield
# exactly three numeric values.
print(vapply(X = my_list, FUN = sqrt, FUN.VALUE = numeric(3)))
##      Group_A Group_B
## [1,]       1       4
## [2,]       2       5
## [3,]       3       6
# 4. tapply() - "Table Apply"
cat("\n=== 4. tapply() Output (Grouped Calculation) ===\n")
## 
## === 4. tapply() Output (Grouped Calculation) ===
# Mean horsepower (hp) within each cylinder count (cyl) of mtcars.
print(tapply(X = mtcars$hp, INDEX = mtcars$cyl, FUN = mean))
##         4         6         8 
##  82.63636 122.28571 209.21429
# 5. mapply() - "Multivariate Apply"
cat("\n=== 5. mapply() Output (Parallel Calculation) ===\n")
## 
## === 5. mapply() Output (Parallel Calculation) ===
# Two equal-length vectors that will be walked through in parallel.
vector_1 <- c(10, 20, 30)
vector_2 <- c(1, 2, 3)

# The anonymous function receives one element from each vector per call
# and returns their sum.
print(mapply(FUN = function(a, b) a + b, vector_1, vector_2))
## [1] 11 22 33