# Download the course data set (a CSV hosted on GitHub) into a data frame
FetchedData <- read.csv(
  file = "https://raw.githubusercontent.com/drkblake/Data/main/DataWrangling.csv"
)
# Keep a local copy in the working directory, without a row-name column
write.csv(
  x = FetchedData,
  file = "DataWrangling.csv",
  row.names = FALSE
)
# The downloaded copy is no longer needed in memory; drop it
rm(FetchedData)

# Install tidyverse only when it is missing, then attach it.
# requireNamespace() checks whether the package is installed without
# attaching it, so library() below is the single place where tidyverse is
# loaded -- and it stops with an error if the installation did not succeed.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
# Console output produced when tidyverse is attached:
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Read the locally saved data
mydata <- read.csv("DataWrangling.csv")

# Create a continuous "Density" variable measuring households per unit of
# land area, then a two-level and a three-level categorical version.
#
# cut_number() splits Density into n groups holding (approximately) equal
# numbers of rows. The group names are supplied directly through `labels`
# instead of being recoded afterwards from the auto-generated interval
# strings (e.g. "[7.35,28.6]"): those strings depend on the exact values in
# the data, so any change to the data would silently turn every row into
# "Error".
#
# as.character() keeps both columns as plain character vectors, and
# coalesce() keeps the previous convention of flagging rows that could not
# be classified (missing Density) as "Error".
mydata <- mydata %>%
  mutate(
    Density = Households / Land_area,
    Density_2 = cut_number(
      Density,
      n = 2,
      labels = c("Low density", "High density")
    ),
    Density_3 = cut_number(
      Density,
      n = 3,
      labels = c("Low density", "Intermediate density", "High density")
    ),
    Density_2 = coalesce(as.character(Density_2), "Error"),
    Density_3 = coalesce(as.character(Density_3), "Error")
  )

# Overwrite the local CSV with the current contents of mydata
# (row names are not written)
write.csv(
  x = mydata,
  file = "DataWrangling.csv",
  row.names = FALSE
)

# Histogram of Pct_College using ggplot2's default settings
mydata %>%
  ggplot(mapping = aes(x = Pct_College)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Styled histogram of Pct_College: gray bar outlines, dark blue bars,
# plus axis labels and a title
ggplot(mydata, aes(x = Pct_College)) +
  geom_histogram(
    color = "gray",
    fill = "darkblue"
  ) +
  labs(
    x = "Pct. College",
    y = "Number of counties",
    # Title spelling corrected (was "Perecent of College Gradutes ...")
    title = "Percent of College Graduates in Counties"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Scatter plot of Pct_College (y) against Density (x), default styling
mydata %>%
  ggplot(mapping = aes(x = Density, y = Pct_College)) +
  geom_point()

# Same scatter plot of Pct_College against Density, with the points
# drawn in red
mydata %>%
  ggplot(mapping = aes(x = Density, y = Pct_College)) +
  geom_point(
    color = "red",
    fill = "red"
  )