# Load tidyverse

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the mpg dataset into the workspace
data("mpg")

# Preview the first 10 rows of the data
mpg %>% head(n = 10)
## # A tibble: 10 × 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto… f        18    29 p     comp…
##  2 audi         a4           1.8  1999     4 manu… f        21    29 p     comp…
##  3 audi         a4           2    2008     4 manu… f        20    31 p     comp…
##  4 audi         a4           2    2008     4 auto… f        21    30 p     comp…
##  5 audi         a4           2.8  1999     6 auto… f        16    26 p     comp…
##  6 audi         a4           2.8  1999     6 manu… f        18    26 p     comp…
##  7 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
##  8 audi         a4 quattro   1.8  1999     4 manu… 4        18    26 p     comp…
##  9 audi         a4 quattro   1.8  1999     4 auto… 4        16    25 p     comp…
## 10 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
# Look at the structure of the data to see what types you have
# (str() lists every column together with its type and first few values)
str(mpg)

# Display the names of all your variables (columns)
mpg %>% colnames()
##  [1] "manufacturer" "model"        "displ"        "year"         "cyl"         
##  [6] "trans"        "drv"          "cty"          "hwy"          "fl"          
## [11] "class"

# Select Key Columns

# Keep only the manufacturer, class, cty, and hwy columns.
# Note: the result is assigned back to `mpg`, replacing the full dataset
# loaded earlier in the workspace.
mpg <- mpg %>%
  select(manufacturer, class, cty, hwy)
head(mpg)
## # A tibble: 6 × 4
##   manufacturer class     cty   hwy
##   <chr>        <chr>   <int> <int>
## 1 audi         compact    18    29
## 2 audi         compact    21    29
## 3 audi         compact    20    31
## 4 audi         compact    21    30
## 5 audi         compact    16    26
## 6 audi         compact    18    26
# select() narrows the table down to only the listed variables (columns).

# Create a New Column

# Add avg_mpg as the mean of cty and hwy, then pass it through as.numeric().
# Expressions inside a single mutate() are evaluated in order, so the second
# one sees the column created by the first.
mpg <- mpg %>%
  mutate(
    avg_mpg = (cty + hwy) / 2,
    avg_mpg = as.numeric(avg_mpg)
  )
mpg %>% head()
## # A tibble: 6 × 5
##   manufacturer class     cty   hwy avg_mpg
##   <chr>        <chr>   <int> <int>   <dbl>
## 1 audi         compact    18    29    23.5
## 2 audi         compact    21    29    25  
## 3 audi         compact    20    31    25.5
## 4 audi         compact    21    30    25.5
## 5 audi         compact    16    26    21  
## 6 audi         compact    18    26    22
# mutate() creates a new variable (column) from the specified formula;
# as.numeric() then ensures the new column is stored as a numeric value.

# Filter the Data

# One pipeline: recompute avg_mpg, keep rows with avg_mpg of at least 25,
# then keep rows whose class is not missing.
mpg <- mpg %>%
  mutate(avg_mpg = (cty + hwy) / 2) %>%
  filter(avg_mpg >= 25) %>%
  filter(!is.na(class))

# Drop the 2nd and 5th rows of the filtered table by position
mpg <- mpg[-c(2, 5), ]
mpg %>% head()
## # A tibble: 6 × 5
##   manufacturer class        cty   hwy avg_mpg
##   <chr>        <chr>      <int> <int>   <dbl>
## 1 audi         compact       21    29    25  
## 2 audi         compact       21    30    25.5
## 3 chevrolet    midsize       22    30    26  
## 4 honda        subcompact    24    32    28  
## 5 honda        subcompact    25    32    28.5
## 6 honda        subcompact    23    29    26
# avg_mpg is recomputed so that the column is available for this step.
# The first filter() keeps only cars whose average mpg is at least 25.
# The second filter() keeps only rows where the class variable is not missing.
# Finally, the negative index -c(2, 5) removes rows 2 and 5.

# Rename Columns

# rename() takes new_name = old_name pairs
mpg <- rename(mpg, brand = manufacturer, vehicle_type = class)
mpg %>% head()
## # A tibble: 6 × 5
##   brand     vehicle_type   cty   hwy avg_mpg
##   <chr>     <chr>        <int> <int>   <dbl>
## 1 audi      compact         21    29    25  
## 2 audi      compact         21    30    25.5
## 3 chevrolet midsize         22    30    26  
## 4 honda     subcompact      24    32    28  
## 5 honda     subcompact      25    32    28.5
## 6 honda     subcompact      23    29    26
# rename() assigns new names to two variables: manufacturer becomes brand
# and class becomes vehicle_type.

# Create a Summary Table

# Show the current column names that the summaries below rely on
names(mpg)

# One-row table: mean of avg_mpg over every row left in the data
mean_table <- mpg %>%
  summarise(mean_avg_mpg = mean(avg_mpg))

# Number of rows for each brand / vehicle_type combination.
# .groups = "drop_last" states summarise()'s default regrouping explicitly
# (the result stays grouped by brand), so no message about grouped output
# is emitted.
count_table <- mpg %>%
  group_by(brand, vehicle_type) %>%
  summarise(count = n(), .groups = "drop_last")
## `summarise()` has grouped output by 'brand'. You can override using the
## `.groups` argument.
# Print the table holding the overall mean of avg_mpg
mean_table %>% head()
## # A tibble: 1 × 1
##   mean_avg_mpg
##          <dbl>
## 1         27.5
# Print the first rows of the per-brand, per-vehicle-type counts
count_table %>% head()
## # A tibble: 6 × 3
## # Groups:   brand [5]
##   brand     vehicle_type count
##   <chr>     <chr>        <int>
## 1 audi      compact          2
## 2 chevrolet midsize          1
## 3 honda     subcompact       8
## 4 hyundai   midsize          2
## 5 nissan    compact          1
## 6 nissan    midsize          2
# First, summarise() builds a one-row table holding the mean of avg_mpg
# across all cars left in the data frame. A second table groups the cars by
# brand and vehicle type, and summarise() with n() counts how many cars fall
# into each group.