Import data
# Read the tuition data set from the Excel workbook.
# The preview printed for this chunk shows room_and_board as <chr> with a bare
# `NA`, i.e. the workbook holds the literal text "NA" rather than empty cells.
# Declaring that text as the missing-value marker lets read_excel() treat those
# cells as real NAs, so the column can be guessed as numeric (presumably the
# remaining cells are stored as numbers -- confirm after re-knitting).
# NOTE: the preview shown after this chunk predates this change.
myData <- read_excel("data/myData.xlsx", na = "NA")
myData
## # A tibble: 2,973 × 10
## name state state_code type degree_length room_and_board in_state_tuition
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Aaniiih… Mont… MT Publ… 2 Year NA 2380
## 2 Abilene… Texas TX Priv… 4 Year 10350 34850
## 3 Abraham… Geor… GA Publ… 2 Year 8474 4128
## 4 Academy… Minn… MN For … 2 Year NA 17661
## 5 Academy… Cali… CA For … 4 Year 16648 27810
## 6 Adams S… Colo… CO Publ… 4 Year 8782 9440
## 7 Adelphi… New … NY Priv… 4 Year 16030 38660
## 8 Adirond… New … NY Publ… 2 Year 11660 5375
## 9 Adrian … Mich… MI Priv… 4 Year 11318 37087
## 10 Advance… Virg… VA For … 2 Year NA 13680
## # ℹ 2,963 more rows
## # ℹ 3 more variables: in_state_total <dbl>, out_of_state_tuition <dbl>,
## # out_of_state_total <dbl>
Variation
# Bar chart of in-state tuition for New Hampshire schools.
# ggplot() has no `state` argument: `state = "New Hampshire"` was swallowed by
# `...` and ignored, so every state was plotted. Filter the rows first.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_bar(mapping = aes(x = in_state_tuition))

# Tally how many schools share each distinct in-state tuition value.
count(myData, in_state_tuition)
## # A tibble: 2,373 × 2
## in_state_tuition n
## <dbl> <int>
## 1 480 1
## 2 958 1
## 3 962 1
## 4 987 1
## 5 1154 1
## 6 1296 1
## 7 1376 1
## 8 1380 5
## 9 1382 1
## 10 1384 1
## # ℹ 2,363 more rows
Visualizing distributions
# Histogram of in-state tuition for New Hampshire schools.
# `state = "New Hampshire"` is not a ggplot() argument and was silently
# ignored; the subset has to be taken with filter() before plotting.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_histogram(mapping = aes(x = in_state_tuition))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygon of in-state tuition for New Hampshire schools.
# `state = "New Hampshire"` is not a ggplot() argument and was silently
# ignored; the subset has to be taken with filter() before plotting.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_freqpoly(mapping = aes(x = in_state_tuition))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical Values
# Histogram of in-state tuition for New Hampshire schools, with the x mapping
# declared once at the plot level.
# `state = "New Hampshire"` is not a ggplot() argument and was silently
# ignored; the subset has to be taken with filter() before plotting.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot(mapping = aes(x = in_state_tuition)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual Values
# Histogram used to look for unusual in-state tuition values among New
# Hampshire schools.
# - `state = "New Hampshire"` is not a ggplot() argument and was silently
#   ignored, so the rows are filtered explicitly.
# - Tuition is on a dollar scale spanning thousands, so a bin width of 0.5
#   produced an unreadable (and very slow) plot of needle-thin bars. 500 matches
#   the bin width used for the same variable in the covariation section.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_histogram(mapping = aes(x = in_state_tuition), binwidth = 500)

Missing Values
# Mean of each tuition / total-cost column, ignoring missing values.
# The earlier version took mean() of the character column `name`, which returns
# NA with a warning for every group, and grouped by four continuous columns,
# which yields almost one group per row. Averaging the numeric columns directly
# is what `na.rm = TRUE` is meant for.
# NOTE: the output shown after this chunk predates this change; re-knit to
# refresh it.
myData %>%
  summarise(
    across(
      c(
        in_state_tuition, out_of_state_tuition,
        in_state_total, out_of_state_total
      ),
      ~ mean(.x, na.rm = TRUE)
    )
  )
## Warning: There were 2816 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `mean = mean(name, na.rm = TRUE)`.
## ℹ In group 1: `in_state_tuition = 480`, `out_of_state_tuition = 480`,
## `in_state_total = 1430`, `out_of_state_total = 1430`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2815 remaining warnings.
## `summarise()` has grouped output by 'in_state_tuition', 'out_of_state_tuition',
## 'in_state_total'. You can override using the `.groups` argument.
## # A tibble: 2,816 × 5
## # Groups: in_state_tuition, out_of_state_tuition, in_state_total [2,816]
## in_state_tuition out_of_state_tuition in_state_total out_of_state_total mean
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 480 480 1430 1430 NA
## 2 958 958 7208 7208 NA
## 3 962 2426 962 2426 NA
## 4 987 987 8609 8609 NA
## 5 1154 7034 1154 7034 NA
## 6 1296 2904 1296 2904 NA
## 7 1376 1376 1376 1376 NA
## 8 1380 9120 1380 9120 NA
## 9 1380 9120 6322 14062 NA
## 10 1380 9480 1380 9480 NA
## # ℹ 2,806 more rows
Covariation
A categorical and continuous variable
# Distribution of in-state tuition for New Hampshire schools, one line per
# institution type (categorical) so the groups can be compared.
# - `state = "New Hampshire"` is not a ggplot() argument and was silently
#   ignored, so the rows are filtered explicitly.
# - colour was mapped to the continuous out_of_state_tuition column, which
#   stat_bin() cannot group by; ggplot2 dropped the aesthetic (see the warning
#   rendered after this chunk, which predates this change). `type` is a
#   categorical column of the same data set.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot(mapping = aes(x = in_state_tuition)) +
  geom_freqpoly(mapping = aes(colour = type), binwidth = 500)
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?

Two categorical variables
# Count plot of institution type against degree length for New Hampshire
# schools; point size shows how many schools fall in each combination.
# - `state = "New Hampshire"` is not a ggplot() argument and was silently
#   ignored, so the rows are filtered explicitly.
# - geom_count() was given two continuous tuition columns, where nearly every
#   pair is unique; this section is about two categorical variables, so the
#   categorical columns `type` and `degree_length` are used instead.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_count(mapping = aes(x = type, y = degree_length))

Two continuous variables
# Scatterplot of in-state against out-of-state tuition for New Hampshire
# schools.
# `state = "New Hampshire"` is not a ggplot() argument and was silently
# ignored; the subset has to be taken with filter() before plotting.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_point(mapping = aes(x = in_state_tuition, y = out_of_state_tuition))

Patterns and models
# Scatterplot used to look for a pattern between in-state and out-of-state
# tuition among New Hampshire schools.
# `state = "New Hampshire"` is not a ggplot() argument and was silently
# ignored; the subset has to be taken with filter() before plotting.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_point(mapping = aes(x = in_state_tuition, y = out_of_state_tuition))
