Import Data
# Read the Excel workbook (path is relative to this document) into a tibble,
# then echo it so the column names and types can be checked.
myData <- readxl::read_xlsx(path = "../01_module4/data/myData.xlsx")
myData
## # A tibble: 1,104 × 31
## town11cd town11nm population_2011 size_flag rgn11nm coastal coastal_detailed
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 E34000007 Carlton… 5456 Small To… East M… Non-co… Smaller non-coa…
## 2 E34000016 Dorches… 19060 Small To… South … Non-co… Smaller non-coa…
## 3 E34000020 Ely BUA 19090 Small To… East o… Non-co… Smaller non-coa…
## 4 E34000026 Market … 6429 Small To… Yorksh… Non-co… Smaller non-coa…
## 5 E34000027 Downham… 10884 Small To… East o… Non-co… Smaller non-coa…
## 6 E34000039 Penrith… 15181 Small To… North … Non-co… Smaller non-coa…
## 7 E34000048 Bolsove… 11754 Small To… East M… Non-co… Smaller non-coa…
## 8 E34000055 March B… 21051 Medium T… East o… Non-co… Large non-coast…
## 9 E34000056 Southam… 6567 Small To… West M… Non-co… Smaller non-coa…
## 10 E34000067 Royston… 15781 Small To… East o… Non-co… Smaller non-coa…
## # ℹ 1,094 more rows
## # ℹ 24 more variables: ttwa11cd <chr>, ttwa11nm <chr>,
## # ttwa_classification <chr>, job_density_flag <chr>, income_flag <chr>,
## # university_flag <chr>, level4qual_residents35_64_2011 <chr>,
## # ks4_2012_2013_counts <dbl>,
## # key_stage_2_attainment_school_year_2007_to_2008 <dbl>,
## # key_stage_4_attainment_school_year_2012_to_2013 <dbl>, …
Introduction
Questions
Variation
Visualizing distributions
# Bar chart: number of rows in each size_flag category.
ggplot(myData, aes(x = size_flag)) +
  geom_bar()

# Tabulate the number of rows per size_flag category (count column is `n`).
count(myData, size_flag)
## # A tibble: 8 × 2
## size_flag n
## <chr> <int>
## 1 City 18
## 2 Inner London BUA 1
## 3 Large Towns 89
## 4 Medium Towns 331
## 5 Not BUA 1
## 6 Other Small BUAs 1
## 7 Outer london BUA 1
## 8 Small Towns 662
# Histogram of population_2011 using bins 100,000 wide.
ggplot(myData, aes(x = population_2011)) +
  geom_histogram(binwidth = 1e5)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Show only the rows whose population_2011 is below 600,000. The result is
# printed, not assigned, so myData itself is left unchanged.
filter(myData, population_2011 < 600000)
## # A tibble: 1,099 × 31
## town11cd town11nm population_2011 size_flag rgn11nm coastal coastal_detailed
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 E34000007 Carlton… 5456 Small To… East M… Non-co… Smaller non-coa…
## 2 E34000016 Dorches… 19060 Small To… South … Non-co… Smaller non-coa…
## 3 E34000020 Ely BUA 19090 Small To… East o… Non-co… Smaller non-coa…
## 4 E34000026 Market … 6429 Small To… Yorksh… Non-co… Smaller non-coa…
## 5 E34000027 Downham… 10884 Small To… East o… Non-co… Smaller non-coa…
## 6 E34000039 Penrith… 15181 Small To… North … Non-co… Smaller non-coa…
## 7 E34000048 Bolsove… 11754 Small To… East M… Non-co… Smaller non-coa…
## 8 E34000055 March B… 21051 Medium T… East o… Non-co… Large non-coast…
## 9 E34000056 Southam… 6567 Small To… West M… Non-co… Smaller non-coa…
## 10 E34000067 Royston… 15781 Small To… East o… Non-co… Smaller non-coa…
## # ℹ 1,089 more rows
## # ℹ 24 more variables: ttwa11cd <chr>, ttwa11nm <chr>,
## # ttwa_classification <chr>, job_density_flag <chr>, income_flag <chr>,
## # university_flag <chr>, level4qual_residents35_64_2011 <chr>,
## # ks4_2012_2013_counts <dbl>,
## # key_stage_2_attainment_school_year_2007_to_2008 <dbl>,
## # key_stage_4_attainment_school_year_2012_to_2013 <dbl>, …
# Histogram of population_2011 for all of myData, bins 100,000 wide.
ggplot(myData) +
  geom_histogram(aes(x = population_2011), binwidth = 1e5)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

Typical values
# Narrower bins (10,000 wide) to show where population_2011 values cluster.
ggplot(myData) +
  geom_histogram(aes(x = population_2011), binwidth = 1e4)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

Unusual values
# Full-range histogram of population_2011 with 10,000-wide bins; bars far from
# the bulk of the data point at unusual values.
ggplot(myData, aes(x = population_2011)) +
  geom_histogram(binwidth = 1e4)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Same histogram, zoomed to 0-300,000 on the x-axis. coord_cartesian() only
# changes the visible window, so the bins are still computed from every row.
ggplot(myData, aes(x = population_2011)) +
  geom_histogram(binwidth = 1e4) +
  coord_cartesian(xlim = c(0, 3e5))
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

Missing Values
# Recode population_2011 values outside 0-300,000 as missing instead of
# dropping those rows, so the remaining columns of the affected rows are kept.
# The row-dropping alternative, filter(between(population_2011, 0, 300000)),
# used to be assigned to myData2 first but was overwritten straight away, so
# that dead assignment has been removed.
# if_else() is used rather than ifelse() because it is type-strict: both
# branches must be double, hence NA_real_.
myData2 <- myData %>%
  mutate(
    population_2011 = if_else(
      population_2011 < 0 | population_2011 > 300000,
      NA_real_,
      population_2011
    )
  )

# na.rm = TRUE drops the rows with a missing population without a warning.
ggplot(myData2, aes(x = education_score, y = population_2011)) +
  geom_point(na.rm = TRUE)

Covariation
A categorical and continuous variable
# Frequency polygons of education_score (bins of width 1), one coloured line
# per size_flag category.
ggplot(myData, aes(x = education_score, color = size_flag)) +
  geom_freqpoly(binwidth = 1)

# Horizontal boxplots: distribution of education_score within each size_flag
# category (the categorical variable is on the y-axis).
ggplot(myData) +
  geom_boxplot(aes(x = education_score, y = size_flag))

Two categorical variables
# Draw one point per (education_score, size_flag) position, sized by how many
# observations share that position.
# NOTE(review): this section covers two categorical variables, but
# education_score is treated as continuous elsewhere in this document —
# confirm whether a second categorical column was intended here.
ggplot(myData, aes(x = education_score, y = size_flag)) +
  geom_count()

# Tally rows for every (size_flag, population_2011) pair and draw one tile per
# pair, filled by the tally `n`.
# NOTE(review): population_2011 is numeric, so most pairs are presumably
# unique and the tiles very thin — confirm this is the intended second
# variable for a two-categorical-variable plot.
ggplot(
  count(myData, size_flag, population_2011),
  aes(x = size_flag, y = population_2011, fill = n)
) +
  geom_tile()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_tile()`).

Two continuous variables
# Scatterplot of two continuous variables. The x aesthetic was size_flag, a
# character column, which produced vertical strips rather than a
# continuous-vs-continuous view; education_score is the continuous variable
# paired with population_2011 elsewhere in this document.
# alpha is raised from 1/100: with roughly 1,100 rows, points at 1% opacity
# would be practically invisible once they are spread along a continuous axis.
ggplot(data = myData) +
  geom_point(aes(x = education_score, y = population_2011), alpha = 1 / 5)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Rectangular 2-D bins coloured by the number of rows in each bin.
# Binning needs two continuous axes; x was the character column size_flag, so
# it is replaced by education_score to match the section's purpose.
ggplot(data = myData) +
  geom_bin2d(aes(x = education_score, y = population_2011))
## `stat_bin2d()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin2d()`).

# geom_hex() relies on the hexbin package.
library(hexbin)

# Hexagonal 2-D bins coloured by the number of rows in each bin.
# Hex binning needs two continuous axes; x was the character column size_flag,
# so it is replaced by education_score.
ggplot(data = myData) +
  geom_hex(aes(x = education_score, y = population_2011))
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_binhex()`).

# Bin the continuous x variable and draw one boxplot of population_2011 per
# bin. The boxes are grouped by cut_width(education_score, 1), so the x
# aesthetic must be education_score as well; with size_flag on x the boxes
# were not positioned by the variable that defines them.
ggplot(data = myData, aes(x = education_score, y = population_2011)) +
  geom_boxplot(aes(group = cut_width(education_score, 1)))
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Patterns and models
library(modelr)

# Linear model of population_2011 as a function of education_score.
mod <- lm(population_2011 ~ education_score, data = myData)

# add_residuals() appends the model residuals to the data as column `resid`.
myData2 <- add_residuals(myData, mod)

# Residuals against the predictor.
ggplot(myData2, aes(x = education_score, y = resid)) +
  geom_point()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Boxplots of the model residuals across education_score.
# Both aesthetics are continuous, so without an explicit group ggplot2 draws a
# single box and warns "Continuous x aesthetic ... did you forget
# aes(group = ...)?". Grouping by width-1 bins of education_score gives one
# box per bin and a uniquely determined orientation.
ggplot(data = myData2) +
  geom_boxplot(
    aes(
      x = education_score,
      y = resid,
      group = cut_width(education_score, 1)
    )
  )
## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
