By completing this lab, you will be able to:
# Load and inspect the mpg dataset
# `mpg` ships with ggplot2, so the package must be attached before data().
library(ggplot2)
data(mpg)
# Show each column's storage type and its first few values.
str(mpg)
## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
## $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
## $ model : chr [1:234] "a4" "a4" "a4" "a4" ...
## $ displ : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr [1:234] "f" "f" "f" "f" ...
## $ cty : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr [1:234] "p" "p" "p" "p" ...
## $ class : chr [1:234] "compact" "compact" "compact" "compact" ...
# Per-column overview: min/quartiles/mean/max for the numeric columns and
# length/class/mode for the character columns.
summary(mpg)
## manufacturer model displ year
## Length:234 Length:234 Min. :1.600 Min. :1999
## Class :character Class :character 1st Qu.:2.400 1st Qu.:1999
## Mode :character Mode :character Median :3.300 Median :2004
## Mean :3.472 Mean :2004
## 3rd Qu.:4.600 3rd Qu.:2008
## Max. :7.000 Max. :2008
## cyl trans drv cty
## Min. :4.000 Length:234 Length:234 Min. : 9.00
## 1st Qu.:4.000 Class :character Class :character 1st Qu.:14.00
## Median :6.000 Mode :character Mode :character Median :17.00
## Mean :5.889 Mean :16.86
## 3rd Qu.:8.000 3rd Qu.:19.00
## Max. :8.000 Max. :35.00
## hwy fl class
## Min. :12.00 Length:234 Length:234
## 1st Qu.:18.00 Class :character Class :character
## Median :24.00 Mode :character Mode :character
## Mean :23.44
## 3rd Qu.:27.00
## Max. :44.00
# Build a compact table of descriptive statistics for the numeric columns.
# vapply() is used instead of sapply() so every statistic is guaranteed to be
# a single double per column (integer results are promoted to double).
num_vars <- mpg[vapply(mpg, is.numeric, logical(1))]
summary_table <- data.frame(
  Variable = names(num_vars),
  # Min is the first named component, so it supplies the row names.
  Min = vapply(num_vars, min, numeric(1)),
  # names = FALSE drops the "25%"/"75%" labels quantile() would attach.
  Q1 = vapply(num_vars, quantile, numeric(1), probs = 0.25, names = FALSE),
  Median = vapply(num_vars, median, numeric(1)),
  Mean = vapply(num_vars, mean, numeric(1)),
  Q3 = vapply(num_vars, quantile, numeric(1), probs = 0.75, names = FALSE),
  Max = vapply(num_vars, max, numeric(1))
)
summary_table
## Variable Min Q1 Median Mean Q3 Max
## displ displ 1.6 2.4 3.3 3.471795 4.6 7
## year year 1999.0 1999.0 2003.5 2003.500000 2008.0 2008
## cyl cyl 4.0 4.0 6.0 5.888889 8.0 8
## cty cty 9.0 14.0 17.0 16.858974 19.0 35
## hwy hwy 12.0 18.0 24.0 23.440171 27.0 44
# Question (beginning truncated in the source): ... cty. What does this
# suggest about the distribution?

# Frequency tables for the categorical columns.
# The categorical columns of mpg are stored as character vectors, not factors,
# so selecting with is.factor() alone yields no columns. Accept both types.
is_categorical <- function(col) is.character(col) || is.factor(col)
cat_vars <- mpg[vapply(mpg, is_categorical, logical(1))]
lapply(cat_vars, table)
## (One frequency table is printed per categorical column. The earlier output,
## `named list()`, was the result of selecting factor columns only.)
# Classify every column of mpg by type and level of measurement.
# The levels are keyed by column name rather than listed positionally, so a
# label cannot silently drift onto the wrong variable.
# NOTE(review): `year` is labelled "Ratio" to match the lab's follow-up
# question; calendar year is often classed as interval instead -- confirm.
measurement_levels <- c(
  manufacturer = "Nominal",
  model = "Nominal",
  displ = "Ratio",
  year = "Ratio",
  cyl = "Ratio",
  trans = "Nominal",
  drv = "Nominal",
  cty = "Ratio",
  hwy = "Ratio",
  fl = "Nominal",
  class = "Nominal"
)
# Fail loudly if a column has no assigned level instead of producing NA.
stopifnot(all(names(mpg) %in% names(measurement_levels)))
variable_info <- data.frame(
  Variable = names(mpg),
  # ifelse() keeps the column names, which become the row names below.
  Type = ifelse(
    vapply(mpg, is.numeric, logical(1)), "Quantitative", "Qualitative"
  ),
  Measurement_Level = unname(measurement_levels[names(mpg)])
)
variable_info
## Variable Type Measurement_Level
## manufacturer manufacturer Qualitative Nominal
## model model Qualitative Nominal
## displ displ Quantitative Ratio
## year year Quantitative Ratio
## cyl cyl Quantitative Ratio
## trans trans Qualitative Nominal
## drv drv Qualitative Nominal
## cty cty Quantitative Ratio
## hwy hwy Quantitative Ratio
## fl fl Qualitative Nominal
## class class Qualitative Nominal
Why is `year` considered a ratio-level variable in this
dataset?

[...] .Rmd file on Canvas.