Import Data

# Load the NH public schools workbook (path is relative to this document)
# into the `schools` data frame used throughout the analysis.
schools <- read_excel(
  path = "../00_data/myData NH Public Schools.xlsx"
)

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of schools in each SCHOOL_LEVEL category.
schools %>%
  ggplot(aes(x = SCHOOL_LEVEL)) +
  geom_bar()

# Tabulate the number of schools at each SCHOOL_LEVEL.
count(schools, SCHOOL_LEVEL)
## # A tibble: 5 × 2
##   SCHOOL_LEVEL     n
##   <chr>        <int>
## 1 High            88
## 2 Middle          98
## 3 Other            2
## 4 Primary        290
## 5 Unknown         14
# Histogram of ENROLLMENT using bins that are 15 units wide.
schools %>%
  ggplot(aes(x = ENROLLMENT)) +
  geom_histogram(binwidth = 15)

# Frequency polygons of ENROLLMENT (bins 15 wide), one coloured line per
# SCHOOL_LEVEL, so the distributions can be overlaid and compared.
schools %>%
  ggplot(aes(x = ENROLLMENT, color = SCHOOL_LEVEL)) +
  geom_freqpoly(binwidth = 15)

Typical values

# Typical values: stacked histogram of ENROLLMENT (bins 25 wide), split by
# SCHOOL_LEVEL.
#
# * The two columns the plot needs are selected by name rather than with the
#   positional range AREA:ENROLLMENT, so the chunk keeps working if the
#   column order of the spreadsheet changes.
# * `fill` is used instead of `color`: for bars, `color` only draws the
#   outline, which leaves every group with the same grey interior.
schools %>%
  select(ENROLLMENT, SCHOOL_LEVEL) %>%
  ggplot(aes(x = ENROLLMENT, fill = SCHOOL_LEVEL)) +
  geom_histogram(binwidth = 25)

Unusual values

# Unusual values: zoom in on the count axis so that sparsely populated
# enrollment bins (possible outliers) become visible.
#
# ENROLLMENT is mapped to x so that the histogram counts sit on the y axis;
# coord_cartesian(ylim = ...) then limits the view to bins holding 0-15
# schools without dropping any data. The bin width matches the 15-unit bins
# used for the overall ENROLLMENT histogram; `fill` (not `color`) shades the
# bars by SCHOOL_LEVEL.
ggplot(schools, aes(x = ENROLLMENT, fill = SCHOOL_LEVEL)) +
  geom_histogram(binwidth = 15) +
  coord_cartesian(ylim = c(0, 15))

Missing Values

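# Had trouble with this: could not remove values of zero and lower.
# One option is to recode such values as missing instead of dropping rows,
# e.g. mutate(x = ifelse(x > 0, x, NA)) for a numeric column x.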

Covariation

# Covariation: frequency polygons of ENROLLMENT (bins 500 wide), with one
# coloured line per AREA.
ggplot(schools, aes(x = ENROLLMENT, color = AREA)) +
  geom_freqpoly(binwidth = 500)

A categorical and continuous variable

# Boxplots of ENROLLMENT, one box per SCHOOL_LEVEL.
schools %>%
  ggplot(aes(x = SCHOOL_LEVEL, y = ENROLLMENT)) +
  geom_boxplot()

Two categorical variables

# Point size shows how many schools share each AREA / SCHOOL_LEVEL pair.
ggplot(schools, aes(x = AREA, y = SCHOOL_LEVEL)) +
  geom_count()

# Heat map of the same cross-tabulation: count the schools in every
# AREA / SCHOOL_LEVEL combination, then shade each tile by that count (n).
count(schools, AREA, SCHOOL_LEVEL) %>%
  ggplot(aes(x = AREA, y = SCHOOL_LEVEL, fill = n)) +
  geom_tile()

Two continuous variables

# Scatterplot of LEVEL_AGE_POPULATION against ENROLLMENT; points are drawn
# at 10% opacity so that overplotted regions show up darker.
ggplot(schools, aes(x = ENROLLMENT, y = LEVEL_AGE_POPULATION)) +
  geom_point(alpha = 0.1)

Patterns and models

# Patterns and models: fit a log-log linear model of LEVEL_AGE_POPULATION on
# ENROLLMENT and inspect the residuals.

# log() is only defined for positive values, so the model is fitted on rows
# where both variables are strictly positive (filter() also drops NAs).
schools_filtered <- schools %>%
  filter(LEVEL_AGE_POPULATION > 0, ENROLLMENT > 0)

mod <- lm(log(LEVEL_AGE_POPULATION) ~ log(ENROLLMENT), data = schools_filtered)

# Attach the back-transformed residuals to `schools`, but only for the rows
# the model was actually fitted on; every other row gets NA.
# Computing residuals on the unfiltered data (as was done before) produced
# NaN / Inf values and a "NaNs produced" warning, and gave rows with a zero
# LEVEL_AGE_POPULATION a misleading residual of exp(-Inf) = 0.
# which() skips NA comparisons, so it selects exactly the rows kept by the
# filter() above, in the same order as residuals(mod).
schools <- schools %>%
  mutate(
    resid = replace(
      rep(NA_real_, n()),
      which(LEVEL_AGE_POPULATION > 0 & ENROLLMENT > 0),
      exp(unname(residuals(mod)))
    )
  )

# ENROLLMENT is continuous, so a boxplot needs an explicit grouping;
# otherwise ggplot2 collapses everything into a single box. Bin ENROLLMENT
# into intervals (250 wide here; adjust to the data's range) and draw one
# box per interval.
schools %>%
  filter(!is.na(resid)) %>%
  ggplot(aes(x = ENROLLMENT, y = resid)) +
  geom_boxplot(aes(group = cut_width(ENROLLMENT, 250)))