# Run this command before loading the packages; you only need to do this once.
# install.packages("tidyverse")
library(tidyverse)
library(knitr)
library(DT)
# Work on a copy of the `mpg` example dataset that ships with ggplot2.
dat_mpg <- ggplot2::mpg
To read the data from an external CSV file instead, use:
# Load the same data from a CSV file in the working directory; readr guesses
# the column types and prints the column specification it chose.
dat_mpg <- readr::read_csv(file = "mpg.csv")
## Rows: 234 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): manufacturer, model, trans, drv, fl, class
## dbl (5): displ, year, cyl, cty, hwy
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(knitr)
# Display the data as an interactive HTML table widget.
dat_mpg %>% DT::datatable()
# Type '?mpg' in the RStudio console - what do you see?
# Per-column overview: quantiles and mean for numeric columns, length and
# class for character columns.
summary(dat_mpg)
## manufacturer model displ year
## Length:234 Length:234 Min. :1.600 Min. :1999
## Class :character Class :character 1st Qu.:2.400 1st Qu.:1999
## Mode :character Mode :character Median :3.300 Median :2004
## Mean :3.472 Mean :2004
## 3rd Qu.:4.600 3rd Qu.:2008
## Max. :7.000 Max. :2008
## cyl trans drv cty
## Min. :4.000 Length:234 Length:234 Min. : 9.00
## 1st Qu.:4.000 Class :character Class :character 1st Qu.:14.00
## Median :6.000 Mode :character Mode :character Median :17.00
## Mean :5.889 Mean :16.86
## 3rd Qu.:8.000 3rd Qu.:19.00
## Max. :8.000 Max. :35.00
## hwy fl class
## Min. :12.00 Length:234 Length:234
## 1st Qu.:18.00 Class :character Class :character
## Median :24.00 Mode :character Mode :character
## Mean :23.44
## 3rd Qu.:27.00
## Max. :44.00
# Number of missing values in the highway-mileage column.
sum(is.na(dat_mpg$hwy))
## [1] 0
# Number of missing values in every column, returned as a one-row tibble.
# `summarise_all()` is superseded in dplyr; `across(everything(), ...)` is the
# current equivalent and yields the same result.
dat_mpg %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
# Bar chart of the number of cars per manufacturer. The legend is hidden
# because the x-axis already names every manufacturer, and the tick labels are
# rotated 90 degrees so the names do not overlap.
# Note: no trailing comma after the last `theme()` argument -- an empty
# argument is not accepted by every ggplot2 release.
ggplot() +
  geom_bar(
    data = dat_mpg,
    mapping = aes(x = manufacturer, fill = manufacturer)
  ) +
  theme(
    legend.position = "none",
    text = element_text(size = 12),
    axis.text.x = element_text(angle = 90)
  )

# Histogram of highway mileage: 10 bins, bars outlined in black, enlarged text.
ggplot(data = dat_mpg, mapping = aes(x = hwy)) +
  geom_histogram(bins = 10, color = "black") +
  theme(text = element_text(size = 20))

# Summary statistics for the highway-mileage column only.
summary(select(dat_mpg, hwy))
## hwy
## Min. :12.00
## 1st Qu.:18.00
## Median :24.00
## Mean :23.44
## 3rd Qu.:27.00
## Max. :44.00
# Number of cars for each transmission code, rendered as a markdown table.
dat_mpg %>%
  dplyr::count(trans) %>%
  kable()
| trans | n |
|---|---|
| auto(av) | 5 |
| auto(l3) | 2 |
| auto(l4) | 83 |
| auto(l5) | 39 |
| auto(l6) | 6 |
| auto(s4) | 3 |
| auto(s5) | 3 |
| auto(s6) | 16 |
| manual(m5) | 58 |
| manual(m6) | 19 |
# Bar chart of the number of cars per transmission code; legend hidden and
# x-axis labels rotated 90 degrees.
ggplot(data = dat_mpg, mapping = aes(x = trans, fill = trans)) +
  geom_bar() +
  theme(
    axis.text.x = element_text(angle = 90),
    text = element_text(size = 20),
    legend.position = "none"
  )
# Collapse the detailed transmission codes into two categories: any value of
# `trans` that starts with "auto" becomes "auto", everything else "manual".
dat_mpg <- mutate(
  dat_mpg,
  trans_binary = ifelse(startsWith(trans, "auto"), "auto", "manual")
)
# Number of cars in each of the two transmission categories.
dat_mpg %>%
  dplyr::count(trans_binary)
# Bar chart of the number of cars per transmission category; legend hidden and
# x-axis labels rotated 90 degrees.
ggplot(data = dat_mpg, mapping = aes(x = trans_binary, fill = trans_binary)) +
  geom_bar() +
  theme(
    axis.text.x = element_text(angle = 90),
    text = element_text(size = 20),
    legend.position = "none"
  )
# Histogram of highway mileage with bars filled by transmission category,
# drawn semi-transparent with black outlines and the legend on top.
# No `bins`/`binwidth` is supplied, so ggplot2 uses its default number of bins
# and prints a message saying so.
ggplot(data = dat_mpg, mapping = aes(x = hwy, fill = trans_binary)) +
  geom_histogram(alpha = 0.5, color = "black") +
  theme(
    legend.position = "top",
    text = element_text(size = 12)
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#### Boxplot
# Side-by-side boxplots of highway mileage, one box per transmission category,
# with the legend placed above the plot.
ggplot(
  data = dat_mpg,
  mapping = aes(x = trans_binary, y = hwy, fill = trans_binary)
) +
  geom_boxplot() +
  theme(
    legend.position = "top",
    text = element_text(size = 12)
  )
# Mean, standard deviation, and group size of highway mileage for each
# transmission category.
dat_mpg %>%
  group_by(trans_binary) %>%
  summarise(
    mean = mean(hwy),
    sd = sd(hwy),
    n = n()
  )