# --- STEP 1: IMPORTING DATA FROM DIFFERENT SOURCES ---

# install.packages(c("haven", "RSQLite", "DBI"))

library(haven)
library(RSQLite)
library(DBI)

# --- Source B: Importing from a Database ---
# 1. First, we create a connection to a temporary SQLite database right on your machine
con <- dbConnect(RSQLite::SQLite(), ":memory:")

dbWriteTable(con, "cars_db_table", mtcars)

#Importing data from the database
imported_db_data <- dbReadTable(con, "cars_db_table")

dbDisconnect(con)

head(imported_db_data)
##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 6 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
#IMPORTING FROM A STATISTICAL PACKAGE
library(haven)

# Notice how clean the path is now without the backslashes!
my_data <- read_sav("/Users/Enock/Library/Mobile Documents/com~apple~CloudDocs/GRADUATE PROGRAM/AUCA/R/Sample_Dataset_2014.sav")

print("Successfully imported local statistical data:")
## [1] "Successfully imported local statistical data:"
head(my_data)
## # A tibble: 6 × 23
##     ids bday        Rank Gender Athlete Height Weight Smoking Sprint
##   <dbl> <date>     <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl>  <dbl>
## 1 20183 1991-01-03    NA      0       0   66.9   193.       0   7.41
## 2 20230 1996-01-02     1      0       1   80.1    NA        0   5.20
## 3 20243 1993-01-02     3      1       0   66.0   128.       1   8.10
## 4 20248 1994-01-01     1     NA       0   61.3   154.       2   6.45
## 5 20255 1996-01-01     2      1       0   65.8    NA        0   7.68
## 6 20278 1995-01-01    NA      0       0   70.7   179.       0   8.00
## # ℹ 14 more variables: MileMinDur <time>, English <dbl>, Reading <dbl>,
## #   Math <dbl>, Writing <dbl>, State <chr>, LiveOnCampus <dbl>,
## #   HowCommute <dbl>, CommuteTime <dbl>, SleepTime <dbl>, StudyTime <dbl>,
## #   enrolldate <chr>, expgradate <chr>, Major <chr>
# --- STEP 2: MERGING DATASETS ---

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# 1. Let's create the first dataset (3 variables)
dataset_A <- data.frame(
  Employee_ID = c(101, 102, 103, 104),
  Name = c("Alice", "Bob", "Charlie", "David"),
  Age = c(28, 34, 29, 42)
)

# 2. Let's create the second dataset (2 variables)
dataset_B <- data.frame(
  Employee_ID = c(101, 102, 103, 105),
  Department = c("IT", "Finance", "HR", "Marketing")
)

# Let's print them out to see what they look like before the merge
cat("\n--- Dataset A (3 Variables) ---\n")
## 
## --- Dataset A (3 Variables) ---
print(dataset_A)
##   Employee_ID    Name Age
## 1         101   Alice  28
## 2         102     Bob  34
## 3         103 Charlie  29
## 4         104   David  42
cat("\n--- Dataset B (2 Variables) ---\n")
## 
## --- Dataset B (2 Variables) ---
print(dataset_B)
##   Employee_ID Department
## 1         101         IT
## 2         102    Finance
## 3         103         HR
## 4         105  Marketing
# 3. MERGE THE DATASETS
merged_data <- inner_join(dataset_A, dataset_B, by = "Employee_ID")

cat("\n--- Merged Dataset ---\n")
## 
## --- Merged Dataset ---
print(merged_data)
##   Employee_ID    Name Age Department
## 1         101   Alice  28         IT
## 2         102     Bob  34    Finance
## 3         103 Charlie  29         HR
# --- STEP 3: USING $, %>%, AND group_by() ---

library(dplyr)
cat("--- 1. Using the '$' Operator ---\n")
## --- 1. Using the '$' Operator ---
# We use '$' to extract just the 'mpg' (miles per gallon) column from mtcars
# and calculate the overall average for all cars.
overall_avg_mpg <- mean(mtcars$mpg)
cat("The overall average MPG for all cars is:", round(overall_avg_mpg, 2), "\n\n")
## The overall average MPG for all cars is: 20.09
cat("--- 2. Using '%>%' and 'group_by()' together ---\n")
## --- 2. Using '%>%' and 'group_by()' together ---
# Now, instead of the overall average, let's find the average MPG 
# based on how many cylinders (cyl) the engine has.

grouped_car_data <- mtcars %>%
  group_by(cyl) %>%                  # Group the data by 4, 6, or 8 cylinders
  summarise(
    Average_MPG = mean(mpg),         # Calculate the mean MPG for each group
    Total_Cars = n()                 # Count how many cars fall into each group
  )

# Print the final grouped table
print(grouped_car_data)
## # A tibble: 3 × 3
##     cyl Average_MPG Total_Cars
##   <dbl>       <dbl>      <int>
## 1     4        26.7         11
## 2     6        19.7          7
## 3     8        15.1         14
# --- EXTRA STEP: THE 5 CORE DPLYR VERBS ---

library(dplyr)

cat("--- Demonstrating select, filter, arrange, rename, and mutate ---\n")
## --- Demonstrating select, filter, arrange, rename, and mutate ---
# We are going to pipe the mtcars dataset through all 5 functions step-by-step
processed_cars <- mtcars %>%
  
  # 1. rename(): Change the names of 'mpg' and 'hp' to be more descriptive
  rename(Miles_Per_Gallon = mpg, Horsepower = hp) %>%
  
  # 2. select(): Isolate only the columns we care about, dropping the rest
  select(Miles_Per_Gallon, Horsepower, cyl, gear) %>%
  
  # 3. filter(): Keep ONLY the cars that have exactly 8 cylinders
  filter(cyl == 8) %>%
  
  # 4. mutate(): Create a brand new column doing some math (Power per MPG)
  mutate(Power_To_Gas_Ratio = Horsepower / Miles_Per_Gallon) %>%
  
  # 5. arrange(): Sort the final table by our new column, highest to lowest (desc)
  arrange(desc(Power_To_Gas_Ratio))

# Print the final, heavily manipulated dataset
print(processed_cars)
##                     Miles_Per_Gallon Horsepower cyl gear Power_To_Gas_Ratio
## Maserati Bora                   15.0        335   8    5          22.333333
## Lincoln Continental             10.4        215   8    3          20.673077
## Cadillac Fleetwood              10.4        205   8    3          19.711538
## Camaro Z28                      13.3        245   8    3          18.421053
## Duster 360                      14.3        245   8    3          17.132867
## Ford Pantera L                  15.8        264   8    5          16.708861
## Chrysler Imperial               14.7        230   8    3          15.646259
## Merc 450SLC                     15.2        180   8    3          11.842105
## Merc 450SE                      16.4        180   8    3          10.975610
## Merc 450SL                      17.3        180   8    3          10.404624
## AMC Javelin                     15.2        150   8    3           9.868421
## Dodge Challenger                15.5        150   8    3           9.677419
## Hornet Sportabout               18.7        175   8    3           9.358289
## Pontiac Firebird                19.2        175   8    3           9.114583