# --- STEP 1: IMPORTING DATA FROM DIFFERENT SOURCES ---
# install.packages(c("haven", "RSQLite", "DBI"))
library(haven)
library(RSQLite)
library(DBI)
# --- Source B: Importing from a Database ---
# 1. First, we create a connection to a temporary SQLite database right on your machine
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "cars_db_table", mtcars)
#Importing data from the database
imported_db_data <- dbReadTable(con, "cars_db_table")
dbDisconnect(con)
head(imported_db_data)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 6 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
#IMPORTING FROM A STATISTICAL PACKAGE
library(haven)
# Notice how clean the path is now without the backslashes!
my_data <- read_sav("/Users/Enock/Library/Mobile Documents/com~apple~CloudDocs/GRADUATE PROGRAM/AUCA/R/Sample_Dataset_2014.sav")
print("Successfully imported local statistical data:")
## [1] "Successfully imported local statistical data:"
head(my_data)
## # A tibble: 6 × 23
## ids bday Rank Gender Athlete Height Weight Smoking Sprint
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20183 1991-01-03 NA 0 0 66.9 193. 0 7.41
## 2 20230 1996-01-02 1 0 1 80.1 NA 0 5.20
## 3 20243 1993-01-02 3 1 0 66.0 128. 1 8.10
## 4 20248 1994-01-01 1 NA 0 61.3 154. 2 6.45
## 5 20255 1996-01-01 2 1 0 65.8 NA 0 7.68
## 6 20278 1995-01-01 NA 0 0 70.7 179. 0 8.00
## # ℹ 14 more variables: MileMinDur <time>, English <dbl>, Reading <dbl>,
## # Math <dbl>, Writing <dbl>, State <chr>, LiveOnCampus <dbl>,
## # HowCommute <dbl>, CommuteTime <dbl>, SleepTime <dbl>, StudyTime <dbl>,
## # enrolldate <chr>, expgradate <chr>, Major <chr>
# --- STEP 2: MERGING DATASETS ---
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# 1. Let's create the first dataset (3 variables)
dataset_A <- data.frame(
Employee_ID = c(101, 102, 103, 104),
Name = c("Alice", "Bob", "Charlie", "David"),
Age = c(28, 34, 29, 42)
)
# 2. Let's create the second dataset (2 variables)
dataset_B <- data.frame(
Employee_ID = c(101, 102, 103, 105),
Department = c("IT", "Finance", "HR", "Marketing")
)
# Let's print them out to see what they look like before the merge
cat("\n--- Dataset A (3 Variables) ---\n")
##
## --- Dataset A (3 Variables) ---
print(dataset_A)
## Employee_ID Name Age
## 1 101 Alice 28
## 2 102 Bob 34
## 3 103 Charlie 29
## 4 104 David 42
cat("\n--- Dataset B (2 Variables) ---\n")
##
## --- Dataset B (2 Variables) ---
print(dataset_B)
## Employee_ID Department
## 1 101 IT
## 2 102 Finance
## 3 103 HR
## 4 105 Marketing
# 3. MERGE THE DATASETS
merged_data <- inner_join(dataset_A, dataset_B, by = "Employee_ID")
cat("\n--- Merged Dataset ---\n")
##
## --- Merged Dataset ---
print(merged_data)
## Employee_ID Name Age Department
## 1 101 Alice 28 IT
## 2 102 Bob 34 Finance
## 3 103 Charlie 29 HR
# --- STEP 3: USING $, %>%, AND group_by() ---
library(dplyr)
cat("--- 1. Using the '$' Operator ---\n")
## --- 1. Using the '$' Operator ---
# We use '$' to extract just the 'mpg' (miles per gallon) column from mtcars
# and calculate the overall average for all cars.
overall_avg_mpg <- mean(mtcars$mpg)
cat("The overall average MPG for all cars is:", round(overall_avg_mpg, 2), "\n\n")
## The overall average MPG for all cars is: 20.09
cat("--- 2. Using '%>%' and 'group_by()' together ---\n")
## --- 2. Using '%>%' and 'group_by()' together ---
# Now, instead of the overall average, let's find the average MPG
# based on how many cylinders (cyl) the engine has.
grouped_car_data <- mtcars %>%
group_by(cyl) %>% # Group the data by 4, 6, or 8 cylinders
summarise(
Average_MPG = mean(mpg), # Calculate the mean MPG for each group
Total_Cars = n() # Count how many cars fall into each group
)
# Print the final grouped table
print(grouped_car_data)
## # A tibble: 3 × 3
## cyl Average_MPG Total_Cars
## <dbl> <dbl> <int>
## 1 4 26.7 11
## 2 6 19.7 7
## 3 8 15.1 14
# --- EXTRA STEP: THE 5 CORE DPLYR VERBS ---
library(dplyr)
cat("--- Demonstrating select, filter, arrange, rename, and mutate ---\n")
## --- Demonstrating select, filter, arrange, rename, and mutate ---
# We are going to pipe the mtcars dataset through all 5 functions step-by-step
processed_cars <- mtcars %>%
# 1. rename(): Change the names of 'mpg' and 'hp' to be more descriptive
rename(Miles_Per_Gallon = mpg, Horsepower = hp) %>%
# 2. select(): Isolate only the columns we care about, dropping the rest
select(Miles_Per_Gallon, Horsepower, cyl, gear) %>%
# 3. filter(): Keep ONLY the cars that have exactly 8 cylinders
filter(cyl == 8) %>%
# 4. mutate(): Create a brand new column doing some math (Power per MPG)
mutate(Power_To_Gas_Ratio = Horsepower / Miles_Per_Gallon) %>%
# 5. arrange(): Sort the final table by our new column, highest to lowest (desc)
arrange(desc(Power_To_Gas_Ratio))
# Print the final, heavily manipulated dataset
print(processed_cars)
## Miles_Per_Gallon Horsepower cyl gear Power_To_Gas_Ratio
## Maserati Bora 15.0 335 8 5 22.333333
## Lincoln Continental 10.4 215 8 3 20.673077
## Cadillac Fleetwood 10.4 205 8 3 19.711538
## Camaro Z28 13.3 245 8 3 18.421053
## Duster 360 14.3 245 8 3 17.132867
## Ford Pantera L 15.8 264 8 5 16.708861
## Chrysler Imperial 14.7 230 8 3 15.646259
## Merc 450SLC 15.2 180 8 3 11.842105
## Merc 450SE 16.4 180 8 3 10.975610
## Merc 450SL 17.3 180 8 3 10.404624
## AMC Javelin 15.2 150 8 3 9.868421
## Dodge Challenger 15.5 150 8 3 9.677419
## Hornet Sportabout 18.7 175 8 3 9.358289
## Pontiac Firebird 19.2 175 8 3 9.114583