B02, but I do the actual assignment with the dataset everyone else is
using instead of my own thing (oops)
loading libraries and csv
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Download the `education` data set (robustbase package, Rdatasets mirror).
edu_raw <- read_csv(
  file = "https://vincentarelbundock.github.io/Rdatasets/csv/robustbase/education.csv"
)
## New names:
## Rows: 50 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): State dbl (6): ...1, Region, X1, X2, X3, Y
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Show column names, types and leading values of the raw import.
edu_raw %>%
  glimpse()
## Rows: 50
## Columns: 7
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
## $ State <chr> "ME", "NH", "VT", "MA", "RI", "CT", "NY", "NJ", "PA", "OH", "IN…
## $ Region <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ X1 <dbl> 508, 564, 322, 846, 871, 774, 856, 889, 715, 753, 649, 830, 738…
## $ X2 <dbl> 3944, 4578, 4011, 5233, 4780, 5889, 5663, 5759, 4894, 5012, 490…
## $ X3 <dbl> 325, 323, 328, 305, 303, 307, 301, 310, 300, 324, 329, 320, 337…
## $ Y <dbl> 235, 231, 270, 261, 300, 317, 387, 285, 300, 221, 264, 308, 379…
renaming and factoring discrete variables
# Turn the numeric Region code into a factor and swap the terse
# X1/X2/X3/Y column names for descriptive ones.
edu <- edu_raw %>%
  mutate(Region = factor(Region)) %>%
  rename(
    res_density = X1,
    income = X2,
    minors = X3,
    spend_public = Y
  )
# Confirm the new column names and that Region is now a factor.
edu %>%
  glimpse()
## Rows: 50
## Columns: 7
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ State <chr> "ME", "NH", "VT", "MA", "RI", "CT", "NY", "NJ", "PA", "OH…
## $ Region <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ res_density <dbl> 508, 564, 322, 846, 871, 774, 856, 889, 715, 753, 649, 83…
## $ income <dbl> 3944, 4578, 4011, 5233, 4780, 5889, 5663, 5759, 4894, 501…
## $ minors <dbl> 325, 323, 328, 305, 303, 307, 301, 310, 300, 324, 329, 32…
## $ spend_public <dbl> 235, 231, 270, 261, 300, 317, 387, 285, 300, 221, 264, 30…
#1 Box plot of income
# One box plot summarising income across all rows, with income on the x axis.
ggplot(data = edu) +
  geom_boxplot(mapping = aes(x = income))

#2 Bar chart showing how many states are in each region.
#counting the number of states in each region
# Tally the rows per Region, then draw one bar per region whose height is
# that tally (geom_col() plots the precomputed `n` as-is).
edu %>%
  count(Region) %>%
  ggplot(mapping = aes(x = Region, y = n)) +
  geom_col() +
  labs(y = "Number of States")

#3 Scatter plot of minors vs spend_public (remember spend is per
capita so this isn’t exactly a meaningful graph)
# Scatter plot: minors on the x axis against spend_public on the y axis.
ggplot(
  data = edu,
  mapping = aes(x = minors, y = spend_public)
) +
  geom_point()

#4 Box plot of income separated by region… use
factor(Region) to treat numbers as categories
# One income box per region. factor() makes the region codes discrete; the
# explicit axis title replaces the default "factor(Region)" label.
ggplot(
  data = edu,
  mapping = aes(x = factor(Region), y = income)
) +
  geom_boxplot() +
  labs(x = "Region")

#5 Dotplot of density, colored by region
# Dot plot of res_density in fixed-width bins (25 units), filled by Region.
ggplot(edu, aes(x = res_density, fill = Region)) +
  # stackgroups = TRUE stacks dots across the fill groups; with the default
  # (FALSE) each region stacks from the baseline on its own, so dots that
  # share a bin are drawn on top of each other and some states are hidden.
  geom_dotplot(method = "histodot", binwidth = 25, stackgroups = TRUE) +
  # The y-axis numbers of an x-binned dot plot carry no meaning; hide them.
  scale_y_continuous(NULL, breaks = NULL)
