This analysis demonstrates how to load, inspect, clean, transform,
and summarize data using R and the tidyverse collection of packages.
The mpg dataset from the ggplot2 package is used to explore
fuel efficiency across different vehicle brands and types.
# Load tidyverse, which includes ggplot2 and dplyr
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Bring the mpg dataset into the workspace
data("mpg")
# Show the first 10 rows of the dataset
head(mpg, n = 10)
## # A tibble: 10 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 quattro 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 quattro 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp…
# Compact overview of the dataset: dimensions, column types, first values
str(object = mpg)
## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
## $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
## $ model : chr [1:234] "a4" "a4" "a4" "a4" ...
## $ displ : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr [1:234] "f" "f" "f" "f" ...
## $ cty : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr [1:234] "p" "p" "p" "p" ...
## $ class : chr [1:234] "compact" "compact" "compact" "compact" ...
# List every column name in the dataset
colnames(mpg)
## [1] "manufacturer" "model" "displ" "year" "cyl"
## [6] "trans" "drv" "cty" "hwy" "fl"
## [11] "class"
# Build a summary of fuel efficiency per brand and vehicle type.
#   1. Keep the four columns of interest, renaming manufacturer -> brand
#      and class -> vehicle_type as they are selected.
#   2. Add avg_mpg, the mean of the city and highway MPG figures.
#   3. Keep rows with an avg_mpg of at least 25 and a non-missing vehicle type.
#   4. Drop the 2nd and 5th of the remaining rows (by position).
#   5. Average avg_mpg and count vehicles within each brand/type pair.
summary_table <- mpg %>%
  select(brand = manufacturer, vehicle_type = class, cty, hwy) %>%
  # `/` always yields a double, so no explicit conversion is required
  mutate(avg_mpg = (cty + hwy) / 2) %>%
  filter(!is.na(vehicle_type), avg_mpg >= 25) %>%
  slice(-c(2, 5)) %>%
  group_by(brand, vehicle_type) %>%
  summarise(
    avg_mpg_mean = mean(avg_mpg),
    vehicle_count = n(),
    .groups = "drop"
  )
# Print the resulting summary table
summary_table
## # A tibble: 11 × 4
## brand vehicle_type avg_mpg_mean vehicle_count
## <chr> <chr> <dbl> <int>
## 1 audi compact 25.2 2
## 2 chevrolet midsize 26 1
## 3 honda subcompact 28.2 8
## 4 hyundai midsize 25.8 2
## 5 nissan compact 25 1
## 6 nissan midsize 27.2 2
## 7 toyota compact 28.3 8
## 8 toyota midsize 25.7 3
## 9 volkswagen compact 26.6 9
## 10 volkswagen midsize 25 2
## 11 volkswagen subcompact 33.2 3