Import data

# Excel file.
# Treat the literal text "NA" (as well as blank cells) as missing. Without
# this, `room_and_board` is read as a character column because its missing
# entries are stored as the string "NA" rather than as empty cells, which
# blocks any numeric summary or plot of that column.
myData <- read_excel("data/myData.xlsx", na = c("", "NA"))
myData
## # A tibble: 2,973 × 10
##    name     state state_code type  degree_length room_and_board in_state_tuition
##    <chr>    <chr> <chr>      <chr> <chr>         <chr>                     <dbl>
##  1 Aaniiih… Mont… MT         Publ… 2 Year        NA                         2380
##  2 Abilene… Texas TX         Priv… 4 Year        10350                     34850
##  3 Abraham… Geor… GA         Publ… 2 Year        8474                       4128
##  4 Academy… Minn… MN         For … 2 Year        NA                        17661
##  5 Academy… Cali… CA         For … 4 Year        16648                     27810
##  6 Adams S… Colo… CO         Publ… 4 Year        8782                       9440
##  7 Adelphi… New … NY         Priv… 4 Year        16030                     38660
##  8 Adirond… New … NY         Publ… 2 Year        11660                      5375
##  9 Adrian … Mich… MI         Priv… 4 Year        11318                     37087
## 10 Advance… Virg… VA         For … 2 Year        NA                        13680
## # ℹ 2,963 more rows
## # ℹ 3 more variables: in_state_total <dbl>, out_of_state_tuition <dbl>,
## #   out_of_state_total <dbl>

Variation

# Bar chart of in-state tuition for New Hampshire schools only.
# `ggplot()` has no `state` argument: passing `state = "New Hampshire"` is
# swallowed by `...` and ignored, so the rows must be filtered explicitly.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_bar(mapping = aes(x = in_state_tuition))

# Tabulate how many schools share each distinct in-state tuition value.
count(myData, in_state_tuition)
## # A tibble: 2,373 × 2
##    in_state_tuition     n
##               <dbl> <int>
##  1              480     1
##  2              958     1
##  3              962     1
##  4              987     1
##  5             1154     1
##  6             1296     1
##  7             1376     1
##  8             1380     5
##  9             1382     1
## 10             1384     1
## # ℹ 2,363 more rows

Visualizing distributions

# Histogram of in-state tuition for New Hampshire schools only.
# `ggplot()` silently ignores an unknown `state =` argument, so the subset
# has to be taken with `filter()` before the data reaches the plot.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_histogram(mapping = aes(x = in_state_tuition))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygon of in-state tuition for New Hampshire schools only.
# `ggplot()` silently ignores an unknown `state =` argument, so the subset
# has to be taken with `filter()` before the data reaches the plot.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_freqpoly(mapping = aes(x = in_state_tuition))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical Values

# Histogram used to look for the most common in-state tuition values among
# New Hampshire schools. The state subset is taken with `filter()` because
# `ggplot()` ignores an unknown `state =` argument.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot(mapping = aes(x = in_state_tuition)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual Values

# Histogram used to spot unusual in-state tuition values among New Hampshire
# schools. The state subset is taken with `filter()` because `ggplot()`
# ignores an unknown `state =` argument.
# Tuition is in whole dollars and spans hundreds to tens of thousands, so a
# bin width of 0.5 produces tens of thousands of near-empty bins; 500 matches
# the bin width used for the frequency polygon in the covariation section.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_histogram(mapping = aes(x = in_state_tuition), binwidth = 500)

Missing Values

# Groups the data by the four tuition/total columns and summarises each group
# with the mean of `name`.
# NOTE(review): `name` is a character column (<chr> in the printed tibble), so
# `mean()` cannot average it; every group gets NA and one warning is raised per
# group, as the output below shows.
# NOTE(review): grouping by four continuous columns yields almost one group
# per row (2,816 groups), so this does not aggregate anything meaningful.
# TODO confirm the intended analysis for this section, e.g. counting NAs per
# column, or averaging a numeric column within a categorical grouping.
myData %>% 
  group_by(in_state_tuition, out_of_state_tuition, in_state_total, out_of_state_total) %>% 
  summarise(mean = mean(name, na.rm = TRUE))
## Warning: There were 2816 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `mean = mean(name, na.rm = TRUE)`.
## ℹ In group 1: `in_state_tuition = 480`, `out_of_state_tuition = 480`,
##   `in_state_total = 1430`, `out_of_state_total = 1430`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2815 remaining warnings.
## `summarise()` has grouped output by 'in_state_tuition', 'out_of_state_tuition',
## 'in_state_total'. You can override using the `.groups` argument.
## # A tibble: 2,816 × 5
## # Groups:   in_state_tuition, out_of_state_tuition, in_state_total [2,816]
##    in_state_tuition out_of_state_tuition in_state_total out_of_state_total  mean
##               <dbl>                <dbl>          <dbl>              <dbl> <dbl>
##  1              480                  480           1430               1430    NA
##  2              958                  958           7208               7208    NA
##  3              962                 2426            962               2426    NA
##  4              987                  987           8609               8609    NA
##  5             1154                 7034           1154               7034    NA
##  6             1296                 2904           1296               2904    NA
##  7             1376                 1376           1376               1376    NA
##  8             1380                 9120           1380               9120    NA
##  9             1380                 9120           6322              14062    NA
## 10             1380                 9480           1380               9480    NA
## # ℹ 2,806 more rows

Covariation

A categorical and continuous variable

# Distribution of in-state tuition for New Hampshire schools, one frequency
# polygon per institution type.
# The state subset is taken with `filter()` because `ggplot()` ignores an
# unknown `state =` argument.
# Colour must be mapped to a categorical variable: mapping it to the
# continuous `out_of_state_tuition` gives ggplot no grouping, so the colour
# aesthetic is dropped with a warning. `type` is used here as the categorical
# variable -- swap in `degree_length` if that comparison is preferred.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot(mapping = aes(x = in_state_tuition)) +
  geom_freqpoly(mapping = aes(colour = type), binwidth = 500)

Two categorical variables

# Count of New Hampshire schools for each combination of institution type
# and degree length; point size encodes the number of schools.
# The state subset is taken with `filter()` because `ggplot()` ignores an
# unknown `state =` argument.
# `geom_count()` is meant for discrete combinations; with two continuous
# tuition columns nearly every point is a count of one, so the two
# categorical columns `type` and `degree_length` are plotted instead.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_count(mapping = aes(x = type, y = degree_length))

Two continuous variables

# Scatterplot of in-state versus out-of-state tuition for New Hampshire
# schools only. The state subset is taken with `filter()` because `ggplot()`
# ignores an unknown `state =` argument.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_point(mapping = aes(x = in_state_tuition, y = out_of_state_tuition))

Patterns and models

# Scatterplot used to look for a pattern between in-state and out-of-state
# tuition among New Hampshire schools. The state subset is taken with
# `filter()` because `ggplot()` ignores an unknown `state =` argument.
myData %>%
  filter(state == "New Hampshire") %>%
  ggplot() +
  geom_point(mapping = aes(x = in_state_tuition, y = out_of_state_tuition))