Data Homework
Install Packages and Load Data
# Install tidyverse only when it is not already available, so that re-running
# or knitting this document does not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the built-in `mpg` dataset and preview its first 10 rows.
# The package is named explicitly so the lookup does not depend on which
# packages happen to be attached.
mpg <- read_builtin("mpg", package = "ggplot2")
head(mpg, 10)
## # A tibble: 10 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 quattro 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 quattro 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp…
Names of All Variables
# Show the column (variable) names of the dataset.
colnames(mpg)
## [1] "manufacturer" "model" "displ" "year" "cyl"
## [6] "trans" "drv" "cty" "hwy" "fl"
## [11] "class"
Organizing Data
# Build the analysis subset of `mpg`:
#   1. keep only manufacturer, class, cty and hwy;
#   2. add avg_mpg, the mean of the city and highway mileage;
#   3. keep rows whose avg_mpg is at least 25 and whose class is not missing;
#   4. drop the 2nd and 5th of the remaining rows (by position);
#   5. rename manufacturer -> brand and class -> vehicle_type.
mpg_filterd <- mpg %>%
  select(manufacturer, class, cty, hwy) %>%
  # Dividing by 2 already yields a double, so no explicit conversion is needed.
  mutate(avg_mpg = (cty + hwy) / 2) %>%
  # `>=` keeps cars whose average is exactly 25 as well.
  filter(avg_mpg >= 25, !is.na(class)) %>%
  # Negative positions remove rows; these refer to the filtered rows above.
  slice(-c(2, 5)) %>%
  # Renaming inside select(): new name on the left of `=`, old name on the right.
  select(brand = manufacturer, vehicle_type = class, cty, hwy, avg_mpg)
head(mpg_filterd)
## # A tibble: 6 × 5
## brand vehicle_type cty hwy avg_mpg
## <chr> <chr> <int> <int> <dbl>
## 1 audi compact 21 29 25
## 2 audi compact 21 30 25.5
## 3 chevrolet midsize 22 30 26
## 4 honda subcompact 24 32 28
## 5 honda subcompact 25 32 28.5
## 6 honda subcompact 23 29 26
Creating a Summary Table
# Summarise the filtered data per brand and vehicle type: the mean of avg_mpg
# and the number of vehicles in each group. `.groups = "drop"` returns an
# ungrouped tibble.
summary_table <- mpg_filterd %>%
  group_by(brand, vehicle_type) %>%
  summarise(
    mean_avg_mpg = mean(avg_mpg, na.rm = TRUE),
    vehicle_count = n(),
    .groups = "drop"
  )
# Preview up to the first 11 summary rows.
head(summary_table, 11)
## # A tibble: 11 × 4
## brand vehicle_type mean_avg_mpg vehicle_count
## <chr> <chr> <dbl> <int>
## 1 audi compact 25.2 2
## 2 chevrolet midsize 26 1
## 3 honda subcompact 28.2 8
## 4 hyundai midsize 25.8 2
## 5 nissan compact 25 1
## 6 nissan midsize 27.2 2
## 7 toyota compact 28.3 8
## 8 toyota midsize 25.7 3
## 9 volkswagen compact 26.6 9
## 10 volkswagen midsize 25 2
## 11 volkswagen subcompact 33.2 3