# Load tidyverse
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the mpg dataset into the workspace
data("mpg")
# Show the first 10 rows to get a feel for the data
mpg %>% head(n = 10)
## # A tibble: 10 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 quattro 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 quattro 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp…
# The column types (<chr>, <dbl>, <int>) are shown in the tibble header
# printed by the preview above.
# List the names of all variables (columns)
colnames(mpg)
## [1] "manufacturer" "model" "displ" "year" "cyl"
## [6] "trans" "drv" "cty" "hwy" "fl"
## [11] "class"
# Keep only the four columns used in the rest of the analysis
mpg <- select(mpg, manufacturer, class, cty, hwy)
mpg %>% head()
## # A tibble: 6 × 4
## manufacturer class cty hwy
## <chr> <chr> <int> <int>
## 1 audi compact 18 29
## 2 audi compact 21 29
## 3 audi compact 20 31
## 4 audi compact 21 30
## 5 audi compact 16 26
## 6 audi compact 18 26
# select() narrows the table down to the specified variables (manufacturer, class, cty, hwy).
#Create a New Column
# Add avg_mpg: the mean of the city and highway mileage for each car.
# Division in R always returns a double, so the column is already numeric
# and no separate as.numeric() conversion step is needed.
mpg <- mpg %>%
  mutate(avg_mpg = (cty + hwy) / 2)
head(mpg)
## # A tibble: 6 × 5
## manufacturer class cty hwy avg_mpg
## <chr> <chr> <int> <int> <dbl>
## 1 audi compact 18 29 23.5
## 2 audi compact 21 29 25
## 3 audi compact 20 31 25.5
## 4 audi compact 21 30 25.5
## 5 audi compact 16 26 21
## 6 audi compact 18 26 22
# mutate() creates a new column from the given formula. The result of the division is already a numeric (double) value, as the <dbl> column type above shows.
#Filter the Data
# Keep cars averaging at least 25 mpg whose class is not missing, then drop
# the 2nd and 5th of the remaining rows.
# avg_mpg was already created in the previous step, so it is not recomputed.
# NOTE(review): slice(-c(2, 5)) removes rows by position, so which cars are
# dropped depends on the current row order -- confirm this is intended.
mpg <- mpg %>%
  filter(avg_mpg >= 25, !is.na(class)) %>%
  slice(-c(2, 5))
head(mpg)
## # A tibble: 6 × 5
## manufacturer class cty hwy avg_mpg
## <chr> <chr> <int> <int> <dbl>
## 1 audi compact 21 29 25
## 2 audi compact 21 30 25.5
## 3 chevrolet midsize 22 30 26
## 4 honda subcompact 24 32 28
## 5 honda subcompact 25 32 28.5
## 6 honda subcompact 23 29 26
# filter() subsets the data frame to cars with an average of at least 25 mpg and a non-missing class; both conditions must hold for a row to be kept. After that, slice(-c(2, 5)) removes rows 2 and 5 of the filtered data.
#Rename Columns
# Give two columns more descriptive names (new_name = old_name)
mpg <- rename(mpg, brand = manufacturer, vehicle_type = class)
mpg %>% head()
## # A tibble: 6 × 5
## brand vehicle_type cty hwy avg_mpg
## <chr> <chr> <int> <int> <dbl>
## 1 audi compact 21 29 25
## 2 audi compact 21 30 25.5
## 3 chevrolet midsize 22 30 26
## 4 honda subcompact 24 32 28
## 5 honda subcompact 25 32 28.5
## 6 honda subcompact 23 29 26
# rename() assigns new names to two columns: manufacturer becomes brand and class becomes vehicle_type.
# Create a Summary Table
names(mpg)
# Overall mean of avg_mpg across the cars left in the data frame
mean_table <- mpg %>%
  summarise(mean_avg_mpg = mean(avg_mpg))
# Number of cars for each brand / vehicle type combination.
# .groups = "drop" returns an ungrouped tibble, so later operations on
# count_table are not silently applied per brand, and summarise() no longer
# emits its "grouped output" message.
count_table <- mpg %>%
  group_by(brand, vehicle_type) %>%
  summarise(count = n(), .groups = "drop")
head(mean_table)
## # A tibble: 1 × 1
## mean_avg_mpg
## <dbl>
## 1 27.5
head(count_table)
## # A tibble: 6 × 3
## brand vehicle_type count
## <chr> <chr> <int>
## 1 audi compact 2
## 2 chevrolet midsize 1
## 3 honda subcompact 8
## 4 hyundai midsize 2
## 5 nissan compact 1
## 6 nissan midsize 2
# The first table uses summarise() to compute the mean of avg_mpg over all cars left in the data frame. The second table groups the cars by brand and vehicle type and uses summarise() with n() to count the cars in each group; the grouping is dropped afterwards so the result is a plain tibble.