Import Data

# Read the towns workbook; the path is relative to the working directory.
myData <- readxl::read_xlsx(path = "../01_module4/data/myData.xlsx")
# Auto-print the tibble to preview its columns and first rows.
myData
## # A tibble: 1,104 × 31
##    town11cd  town11nm population_2011 size_flag rgn11nm coastal coastal_detailed
##    <chr>     <chr>              <dbl> <chr>     <chr>   <chr>   <chr>           
##  1 E34000007 Carlton…            5456 Small To… East M… Non-co… Smaller non-coa…
##  2 E34000016 Dorches…           19060 Small To… South … Non-co… Smaller non-coa…
##  3 E34000020 Ely BUA            19090 Small To… East o… Non-co… Smaller non-coa…
##  4 E34000026 Market …            6429 Small To… Yorksh… Non-co… Smaller non-coa…
##  5 E34000027 Downham…           10884 Small To… East o… Non-co… Smaller non-coa…
##  6 E34000039 Penrith…           15181 Small To… North … Non-co… Smaller non-coa…
##  7 E34000048 Bolsove…           11754 Small To… East M… Non-co… Smaller non-coa…
##  8 E34000055 March B…           21051 Medium T… East o… Non-co… Large non-coast…
##  9 E34000056 Southam…            6567 Small To… West M… Non-co… Smaller non-coa…
## 10 E34000067 Royston…           15781 Small To… East o… Non-co… Smaller non-coa…
## # ℹ 1,094 more rows
## # ℹ 24 more variables: ttwa11cd <chr>, ttwa11nm <chr>,
## #   ttwa_classification <chr>, job_density_flag <chr>, income_flag <chr>,
## #   university_flag <chr>, level4qual_residents35_64_2011 <chr>,
## #   ks4_2012_2013_counts <dbl>,
## #   key_stage_2_attainment_school_year_2007_to_2008 <dbl>,
## #   key_stage_4_attainment_school_year_2012_to_2013 <dbl>, …

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: one bar per size_flag category, bar height = number of rows.
ggplot(myData, aes(x = size_flag)) +
    geom_bar()

# Tabulate how many rows fall in each size_flag category.
count(myData, size_flag)
## # A tibble: 8 × 2
##   size_flag            n
##   <chr>            <int>
## 1 City                18
## 2 Inner London BUA     1
## 3 Large Towns         89
## 4 Medium Towns       331
## 5 Not BUA              1
## 6 Other Small BUAs     1
## 7 Outer london BUA     1
## 8 Small Towns        662
# Histogram of 2011 population using bins that are 100,000 wide.
ggplot(myData, aes(x = population_2011)) +
    geom_histogram(binwidth = 100000)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Keep only rows whose 2011 population is below 600,000. Rows with a missing
# population are dropped as well, because filter() keeps only rows where the
# condition is TRUE. The result is printed, not stored.
filter(myData, population_2011 < 600000)
## # A tibble: 1,099 × 31
##    town11cd  town11nm population_2011 size_flag rgn11nm coastal coastal_detailed
##    <chr>     <chr>              <dbl> <chr>     <chr>   <chr>   <chr>           
##  1 E34000007 Carlton…            5456 Small To… East M… Non-co… Smaller non-coa…
##  2 E34000016 Dorches…           19060 Small To… South … Non-co… Smaller non-coa…
##  3 E34000020 Ely BUA            19090 Small To… East o… Non-co… Smaller non-coa…
##  4 E34000026 Market …            6429 Small To… Yorksh… Non-co… Smaller non-coa…
##  5 E34000027 Downham…           10884 Small To… East o… Non-co… Smaller non-coa…
##  6 E34000039 Penrith…           15181 Small To… North … Non-co… Smaller non-coa…
##  7 E34000048 Bolsove…           11754 Small To… East M… Non-co… Smaller non-coa…
##  8 E34000055 March B…           21051 Medium T… East o… Non-co… Large non-coast…
##  9 E34000056 Southam…            6567 Small To… West M… Non-co… Smaller non-coa…
## 10 E34000067 Royston…           15781 Small To… East o… Non-co… Smaller non-coa…
## # ℹ 1,089 more rows
## # ℹ 24 more variables: ttwa11cd <chr>, ttwa11nm <chr>,
## #   ttwa_classification <chr>, job_density_flag <chr>, income_flag <chr>,
## #   university_flag <chr>, level4qual_residents35_64_2011 <chr>,
## #   ks4_2012_2013_counts <dbl>,
## #   key_stage_2_attainment_school_year_2007_to_2008 <dbl>,
## #   key_stage_4_attainment_school_year_2012_to_2013 <dbl>, …
# Histogram of 2011 population restricted to places with fewer than 600,000
# residents. The filter is applied inside the pipeline so the plot really
# uses the subset; plotting myData directly would only redraw the unfiltered
# histogram with the same 100,000-wide bins.
myData %>%
    filter(population_2011 < 600000) %>%
    ggplot(aes(x = population_2011)) +
    geom_histogram(binwidth = 100000)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

Typical values

# Same variable with much narrower bins (10,000 wide) to show where the
# typical population values cluster.
ggplot(myData) +
    geom_histogram(aes(x = population_2011), binwidth = 10000)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

Unusual values

# Full-range histogram (10,000-wide bins) used as the starting point for
# spotting unusually large populations.
ggplot(data = myData, mapping = aes(x = population_2011)) +
    geom_histogram(binwidth = 10000)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Same histogram, zoomed to populations between 0 and 300,000.
# coord_cartesian() only changes the visible window; the bins are still
# computed from all of the data.
ggplot(data = myData, mapping = aes(x = population_2011)) +
    geom_histogram(binwidth = 10000) +
    coord_cartesian(xlim = c(0, 300000))
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

Missing Values

# Option 1: drop every row whose 2011 population is outside [0, 300000]
# (rows with a missing population are dropped too).
myData2 <- filter(myData, between(population_2011, 0, 300000))

# Option 2: keep every row but replace out-of-range populations with NA.
# This assignment replaces the myData2 created by option 1.
myData2 <- mutate(
    myData,
    population_2011 = ifelse(
        between(population_2011, 0, 300000),
        population_2011,
        NA
    )
)

# Scatterplot of population against education score; na.rm = TRUE removes
# the missing values without emitting a warning.
ggplot(myData2) +
    geom_point(aes(x = education_score, y = population_2011), na.rm = TRUE)

Covariation

A categorical and continuous variable

# Frequency polygons of education_score (bins 1 unit wide), one coloured
# line per size_flag category.
ggplot(myData, aes(x = education_score, color = size_flag)) +
    geom_freqpoly(binwidth = 1)

# Horizontal boxplots of education_score, one box per size_flag category.
ggplot(myData) +
    geom_boxplot(aes(x = education_score, y = size_flag))

Two categorical variables

# geom_count() sizes each point by the number of rows sharing that (x, y)
# combination, so both axes need to be categorical for the plot to summarise
# anything. education_score is continuous (it is binned and modelled as a
# number elsewhere in this analysis), so it is replaced by the categorical
# `coastal` column.
# NOTE(review): `coastal` is assumed to have only a few distinct values
# (the printed preview shows "Non-co…") — confirm, or swap in another
# categorical column such as rgn11nm.
ggplot(data = myData) +
    geom_count(aes(x = coastal, y = size_flag))

# Heat map of how many rows fall in each combination of two categorical
# variables. Counting by population_2011 (a continuous value) produced almost
# one tile per row and dropped the rows with a missing population, so the
# second variable is the categorical `coastal` column instead.
# NOTE(review): `coastal` is assumed to have only a few distinct values —
# confirm, or substitute another categorical column.
myData %>%
    count(size_flag, coastal) %>%
    ggplot(aes(x = size_flag, y = coastal)) +
    geom_tile(aes(fill = n))
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_tile()`).

Two continuous variables

# Scatterplot of two continuous variables. size_flag is a character column,
# so education_score is used on the x axis instead.
# alpha is raised from 1/100 to 1/5: with roughly 1,100 rows the points would
# otherwise be nearly invisible. NOTE(review): tune alpha to taste.
ggplot(data = myData) +
    geom_point(aes(x = education_score, y = population_2011), alpha = 1 / 5)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

# 2-D rectangular binning needs a continuous variable on each axis;
# size_flag is a character column, so education_score is used for x.
# The number of bins is left at the ggplot2 default.
ggplot(data = myData) +
    geom_bin2d(aes(x = education_score, y = population_2011))
## `stat_bin2d()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin2d()`).

library(hexbin)

# Hexagonal binning needs a continuous variable on each axis; size_flag is a
# character column, so education_score is used for x.
ggplot(data = myData) +
    geom_hex(aes(x = education_score, y = population_2011))
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_binhex()`).

# Boxplots of population within bins of education_score (each bin 1 unit
# wide). The x aesthetic must be the same variable that is being binned;
# with x = size_flag the boxes were grouped by education_score bins but
# positioned by town size category.
ggplot(data = myData, aes(x = education_score, y = population_2011)) +
    geom_boxplot(aes(group = cut_width(education_score, 1)))
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Patterns and models

library(modelr)

# Linear fit of 2011 population on education score.
mod <- lm(formula = population_2011 ~ education_score, data = myData)

# Attach the model's residuals to the data as the `resid` column.
myData2 <- add_residuals(myData, mod)

# Residuals plotted against the predictor.
ggplot(myData2, aes(x = education_score, y = resid)) +
    geom_point()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Boxplots of the residuals within bins of education_score (each bin 1 unit
# wide). With a continuous x and no group aesthetic, ggplot2 warns about the
# orientation and the missing group and draws a single box; binning with
# cut_width() gives one box per bin.
ggplot(data = myData2) +
    geom_boxplot(aes(
        x = education_score,
        y = resid,
        group = cut_width(education_score, 1)
    ))
## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_boxplot()`).