Methods 1, Week 5

Class 5 prep

Open part1 project:

Install the scales package

  • install.packages("scales"): for formatting text

Outline

  • County joins questions

  • Homework review

  • Visualization with ggplot

  • County, school district poverty analysis continues

  • Assignment 5

County data joins

library(tidyverse)
library(readxl)

# import ny health data (County Health Rankings workbook) and the lottery
# retailer list
raw_health_data <- read_excel(
  "data/raw/CountyHealthRankings/2021 County Health Rankings Data - v1.xlsx",
  sheet = "Ranked Measure Data", skip = 1
)
raw_lottery <- read_csv("data/raw/NYS_Lottery_Retailers.csv")


# import our processed county dataset
county_pov <- read_csv("data/processed/county_pov_rate_2022.csv")

# process lottery data - count of lottery retailers per county
# .groups = "drop" returns an ungrouped table; without it the result stays
# grouped by County and summarise() prints a grouping message
lottery_count <- raw_lottery |>
  group_by(County, GEOID) |>
  summarise(lottery_retailers = as.numeric(n()), .groups = "drop") |>
  mutate(GEOID = as.numeric(GEOID)) # numeric so it can be joined to conum

# process health data - keep the county code plus the indicators we explore
ny_health <- raw_health_data |>
  select(
    FIPS,
    `% Severe Housing Problems`,
    `Average Number of Physically Unhealthy Days`,
    `Average Number of Mentally Unhealthy Days`,
    `Food Environment Index`,
    `% Uninsured`,
    `Primary Care Physicians Rate`
  ) |>
  mutate(FIPS = as.numeric(FIPS)) # numeric so it can be joined to conum

# join everything onto the county poverty table (one left join per source)
# lottery_per10k_kids = lottery retailers per 10,000 children in the county;
# lottery_retailers is already numeric, so no extra conversion is needed
county_data <- county_pov |>
  left_join(lottery_count, by = c("conum" = "GEOID")) |>
  mutate(lottery_per10k_kids = lottery_retailers / county_child_count * 10000) |>
  left_join(ny_health, by = c("conum" = "FIPS"))

write_csv(county_data, "data/processed/ny_county_health_data_2022.csv")

Visualizations

# Base-R histogram of lottery retailers per 10,000 children (lottery_per10k_kids)
hist(county_data$lottery_per10k_kids)

Explore data based on histogram

This is your opportunity to learn more about New York and your data, and learn about flaws or biases. When you see an outlier, always take a look:

  • look at your dataframe and find the outlier
  • think about why
  • google to learn about that area to determine if this is a true outlier or a flaw

View the table

Name conum county_child_count county_child_poverty_count county_child_poverty_rate County lottery_retailers lottery_per10k_kids % Severe Housing Problems Average Number of Physically Unhealthy Days Average Number of Mentally Unhealthy Days Food Environment Index % Uninsured Primary Care Physicians Rate
Hamilton County 36041 496.063 63 0.127 Hamilton 11 221.7460 13.69863 4.123383 4.654123 7.4 6.264349 22.55300
Essex County 36031 3946.667 592 0.150 Essex 47 119.0878 12.55326 4.134879 3.998942 8.6 4.495877 37.53351
Delaware County 36025 4989.899 988 0.198 Delaware 59 118.2389 17.06806 4.386655 4.817896 8.3 5.842152 33.68743
Greene County 36039 5230.769 816 0.156 Greene 59 112.7941 17.74335 4.328537 4.503879 8.1 5.599767 37.90192
Otsego County 36077 6224.138 1083 0.174 Otsego 68 109.2521 16.19048 4.117031 4.524647 7.9 5.533152 115.48310

% Severe Housing Problems

# Base-R histogram of the `% Severe Housing Problems` column of county_data
hist(county_data$`% Severe Housing Problems`)

Average Number of Physically Unhealthy Days

# Base-R histogram of the `Average Number of Physically Unhealthy Days` column
hist(county_data$`Average Number of Physically Unhealthy Days`)

Average Number of Mentally Unhealthy Days

# Base-R histogram of the `Average Number of Mentally Unhealthy Days` column
hist(county_data$`Average Number of Mentally Unhealthy Days`)

Food Environment Index

# Base-R histogram of the `Food Environment Index` column of county_data
hist(county_data$`Food Environment Index`)

% Uninsured

# Base-R histogram of the `% Uninsured` column of county_data
hist(county_data$`% Uninsured`)

Primary Care Physicians Rate

# Base-R histogram of the `Primary Care Physicians Rate` column of county_data
hist(county_data$`Primary Care Physicians Rate`)

Scatterplots: Average Number of Physically Unhealthy Days

# Scatterplot: child poverty rate (x) vs. average physically unhealthy days (y)
plot(county_data$county_child_poverty_rate, county_data$`Average Number of Physically Unhealthy Days`)

Scatterplots: Food Environment Index

# Scatterplot: child poverty rate (x) vs. Food Environment Index (y)
plot(county_data$county_child_poverty_rate, county_data$`Food Environment Index`)

Scatterplots: Primary Care Physicians Rate

# Scatterplot: child poverty rate (x) vs. Primary Care Physicians Rate (y)
plot(county_data$county_child_poverty_rate, county_data$`Primary Care Physicians Rate`)

Homework

Electoral Votes = 538

  • Seats in the U.S. House of Representatives = 435
  • Seats in the Senate = 100
  • D.C. Electoral Votes = 3

Seats in the U.S. House of Representatives

  • allocated to each state by population
    • U.S. population (2020) ~ 331 million
    • Each House District ~ 761,000

Seats in the U.S. Senate

  • each state has 2 Senators, regardless of population

Homework script

library(tidyverse)
library(readxl)

## print numbers in full instead of scientific notation
options(scipen = 999)

# load the 2020 apportionment table and the census race/ethnicity table
raw_apportion <- read_excel(
  "data/raw/apportionment-2020-table01.xlsx",
  skip = 3
)
raw_race <- read_csv(
  "data/raw/DECENNIALPL2020.P2_Hispanic_Latino_by_race/DECENNIALPL2020.P2_data.csv"
)

# metadata file that accompanies the race/ethnicity table
race_documentaion <- read_csv(
  "data/raw/DECENNIALPL2020.P2_Hispanic_Latino_by_race/DECENNIALPL2020.P2_metadata.csv"
)

# race shares: each P2 count divided by P2_001N;
# percent_bipoc is everything that is not percent_white
race <- raw_race |>
  transmute(
    GEO_ID,
    NAME,
    percent_latinx = P2_002N / P2_001N,
    percent_white = P2_005N / P2_001N,
    percent_black = P2_006N / P2_001N,
    percent_bipoc = 1 - percent_white
  )

# electoral votes per state = House seats + 2 (one per Senator),
# expressed both as people per vote and as votes per million people,
# then joined to the race shares keeping unmatched rows from both tables
apportion <- raw_apportion |>
  select(
    GEO_ID,
    STATE,
    pop = `POPULATION`,
    representatives = `APPORTIONED REPRESENTATIVES`
  ) |>
  mutate(
    electoral_votes = representatives + 2,
    pop_per_electoral_vote = round(pop / electoral_votes, 0),
    electoral_votes_per_million = electoral_votes / pop * 1e6
  ) |>
  full_join(race, by = "GEO_ID")

Electoral votes histogram

# Base-R histogram of electoral votes per million people
hist(apportion$electoral_votes_per_million)

Electoral votes data summary

# Summary statistics for every column of apportion; the NA's rows in the output
# count the values that are missing in each numeric column
summary(apportion)
    GEO_ID             STATE                pop           representatives
 Length:52          Length:52          Min.   :  577719   Min.   : 1.00  
 Class :character   Class :character   1st Qu.: 1871866   1st Qu.: 2.25  
 Mode  :character   Mode  :character   Median : 4585405   Median : 6.00  
                                       Mean   : 6622169   Mean   : 8.70  
                                       3rd Qu.: 7576690   3rd Qu.: 9.75  
                                       Max.   :39576757   Max.   :52.00  
                                       NA's   :2          NA's   :2      
 electoral_votes pop_per_electoral_vote electoral_votes_per_million
 Min.   : 3.00   Min.   :192573         Min.   :1.364              
 1st Qu.: 4.25   1st Qu.:430223         1st Qu.:1.541              
 Median : 8.00   Median :564942         Median :1.770              
 Mean   :10.70   Mean   :525324         Mean   :2.148              
 3rd Qu.:11.75   3rd Qu.:648857         3rd Qu.:2.326              
 Max.   :54.00   Max.   :732903         Max.   :5.193              
 NA's   :2       NA's   :2              NA's   :2                  
     NAME           percent_latinx    percent_white      percent_black     
 Length:52          Min.   :0.01942   Min.   :0.007471   Min.   :0.001304  
 Class :character   1st Qu.:0.05943   1st Qu.:0.548545   1st Qu.:0.032242  
 Mode  :character   Median :0.10508   Median :0.663434   Median :0.070190  
                    Mean   :0.14252   Mean   :0.640237   Mean   :0.105462  
                    3rd Qu.:0.15448   3rd Qu.:0.759888   3rd Qu.:0.140235  
                    Max.   :0.98879   Max.   :0.901571   Max.   :0.409061  
                                                                           
 percent_bipoc    
 Min.   :0.09843  
 1st Qu.:0.24011  
 Median :0.33657  
 Mean   :0.35976  
 3rd Qu.:0.45145  
 Max.   :0.99253  
                  

Electoral votes plot (white)

# Scatterplot: percent white (x) vs. electoral votes per million people (y)
plot(apportion$percent_white, apportion$electoral_votes_per_million)

Electoral votes plot (Black)

# Scatterplot: percent Black (x) vs. electoral votes per million people (y)
plot(apportion$percent_black, apportion$electoral_votes_per_million)

Look at the continental US

# Keep every row whose STATE is neither Alaska nor Hawaii.
# NOTE: filter() also drops rows where the comparison is NA, so any row with a
# missing STATE is removed here as well.
continental_us <- filter(
  apportion,
  STATE != "Alaska" & STATE != "Hawaii"
)

plot(continental_us$percent_white, continental_us$electoral_votes_per_million)

Visualization as part of analysis


Visualization is a tool:

  • to explore our datasets
  • check results
  • share with colleagues
  • share final analysis results

ggplot scatterplot

ggplot code

library(tidyverse)
library(scales)

# use ggplot: percent white vs. electoral votes per million people
ggplot(
  continental_us,
  aes(
    x = percent_white,
    y = electoral_votes_per_million
  )
) +
  geom_point() +
  # x is a proportion, so show it as a whole-number percent
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = comma) +
  labs(
    x = "Percent White", y = "Electoral Votes per Million People",
    title = "Race and Electoral Power",
    # build the two caption lines with an explicit newline so the indentation
    # of the source code does not end up inside the rendered caption
    caption = paste(
      "Source: U.S. Census, 2020.",
      "** Note, excludes Alaska, Hawaii, and Washington D.C.",
      sep = "\n"
    )
  )

ggplot2

Tidyverse package for producing plots of your dataframe

Every ggplot has 3 required components:

  • data: the dataframe you want to visualize
  • aes: variables in the dataframe that you want to visualize
  • at least one layer that defines what type of plot you want to create
    • examples: points, bars, lines
  • you can add many more elements to make it look nicer

ggplot scatterplot example

apportion |> # data
  ggplot(
    mapping = aes( # aesthetics
      x = percent_white,
      y = electoral_votes_per_million
    )
  ) +
  geom_point() # layer

ggplot line example

A line graph doesn’t make sense for this data, but as an example:

  • the layer type determines how you display the data
apportion |> # data
  ggplot(
    mapping = aes( # aesthetics
      x = percent_white,
      y = electoral_votes_per_million
    )
  ) +
  geom_line() # line layer

Analysis plan

  • Create dataframe of poverty rate by county
  • Create dataframe of student poverty rate by school district
  • Calculate the statewide student poverty rate
  • Join the school district and county poverty dataframes to compare the poverty rates
  • Measure the difference in poverty rates of each school district and its county and the state
  • Use summary statistics to explore and gain understanding
  • Use visualizations to explore and gain understanding

Analysis so far

We have 3 scripts for our analysis so far:

  • new_york_student_poverty_2022
  • ny_county_poverty_rate_22
  • analyze_ny_poverty

Visualization plan

What counties have the most economic inequality, as measured by the student poverty rate of school districts?

  • Add school district enrollment data for context
  • Create a scatterplot to explore the county with the largest range in student poverty
  • Create scatterplots to explore other counties
  • Create scatterplots to explore the state as a whole

Visualization script

Create a new script visualize_poverty_analysis.R

  • add necessary packages
  • import data
  • join school district enrollment data
library(tidyverse)
library(scales)

### County summary statistics, loaded so we can pick the counties to focus on
county_stats <- read_csv("data/output/ny_county_poverty_stats.csv")

# school district - county poverty data
sd_county_pov <- read_csv("data/processed/ny_sd_county_pov_data.csv")

# extra school district data; district_id is made numeric before the join
sd_enroll <- mutate(
  read_csv("data/raw/school_district_demographics_2022.csv"),
  district_id = as.numeric(district_id)
)

# attach the extra district data to the poverty table (id matches district_id)
sd_county_pov_enroll <- left_join(
  sd_county_pov,
  sd_enroll,
  by = c("id" = "district_id")
)

What county has the largest range in student poverty?

Orange County scatterplot, v1

### Scatterplot of student poverty rate vs. percent BIPOC for Orange County,
### the county with the largest range in student poverty
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(mapping = aes(x = stpovrate, y = pct_bipoc)) +
  geom_point()

Orange County scatterplot, v2 code

  • Within the aesthetic mapping aes():

    • size the dots by enrollment
    • color by pct_bipoc
# v2: dot size follows enrollment, dot color follows percent BIPOC
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point()

Orange County scatterplot, v2 plot

Orange County scatterplot, v3 code

  • Within geom_point() make the dots semi-transparent (65% opaque) with alpha = .65
# v3: alpha below 1 makes the dots see-through so overlapping dots stay visible
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point(alpha = 0.65)

Orange County scatterplot, v3 plot

Orange County scatterplot, v4 code

Format the axis labels as percent, with no decimal place

  • use scale_x_continuous() to format the x-axis
  • percent_format() is a scales package function
  • accuracy = 1 rounds to a whole number
    • accuracy = .1 includes one decimal place
# v4: both axes hold proportions, so label them as whole-number percents
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1))

Orange County scatterplot, v4 plot

Orange County scatterplot, v5 code

Define nice-looking title, caption, axis labels and legend labels within the labs function

# v5: labs() sets the title, caption, axis titles and both legend titles
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Racial Diversity and Student Poverty in Orange County School Districts",
    caption = "Sources: NCES, 2019 and SAIPE, 2019",
    x = "Student Poverty Rate",
    y = "Percent BIPOC",
    size = "Enrollment",
    color = "Percent BIPOC"
  )

Orange County scatterplot, v5 plot

Orange County scatterplot, v6 code

Add a theme to add some standard styling

# v6: same plot as v5 with the built-in black-and-white theme applied
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Racial Diversity and Student Poverty in Orange County School Districts",
    caption = "Sources: NCES, 2019 and SAIPE, 2019",
    x = "Student Poverty Rate",
    y = "Percent BIPOC",
    size = "Enrollment",
    color = "Percent BIPOC"
  ) +
  theme_bw()

Orange County scatterplot, v6 plot

Orange County scatterplot, v7 code

Format the legend labels

# v7: format the legend labels too - commas for the enrollment (size) legend,
# whole-number percents for the percent BIPOC (color) legend
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_size_area(labels = comma) +
  scale_color_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Racial Diversity and Student Poverty in Orange County School Districts",
    caption = "Sources: NCES, 2019 and SAIPE, 2019",
    x = "Student Poverty Rate",
    y = "Percent BIPOC",
    size = "Enrollment",
    color = "Percent BIPOC"
  ) +
  theme_bw()

Orange County scatterplot, v7 plot

New York scatterplot code

Remove the filter to look at New York as a whole

# Statewide view: no county filter, dots colored by county
sd_county_pov_enroll |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = COUNTY
    )
  ) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_size_area(labels = comma) +
  labs(
    title = "Racial Diversity and Student Poverty in New York School Districts",
    caption = "Sources: NCES, 2019 and SAIPE, 2019",
    x = "Student Poverty Rate",
    y = "Percent BIPOC",
    size = "Enrollment",
    color = "County"
  ) +
  theme_bw() +
  guides(color = "none") # Remove the color legend

New York scatterplot

New York scatterplot (no nyc) code

Remove New York City to see how it changes

# Statewide scatterplot with the New York City district removed.
# The title names New York (not Orange County) because no county filter is
# applied here; the subtitle records the NYC exclusion.
ggplot(
  data = sd_county_pov_enroll |>
    filter(district.x != "New York City Department Of Education"),
  aes(
    x = stpovrate,
    y = pct_bipoc,
    size = enroll,
    color = COUNTY
  )
) +
  geom_point(alpha = .5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_size_area(labels = comma) +
  labs(
    x = "Student Poverty Rate", y = "Percent BIPOC",
    title = "Racial Diversity and Student Poverty in New York School Districts",
    subtitle = "Excluding New York City",
    caption = "Sources: NCES, 2019 and SAIPE, 2019",
    size = "Enrollment",
    color = "County"
  ) +
  theme_bw() +
  guides(color = "none") # Remove the color legend

New York scatterplot (no nyc)

Save a plot

To save a plot:

  • first save it as an object
  • use ggsave to save it
# Store the statewide (no NYC) scatterplot as an object so it can be saved.
# The title names New York (not Orange County) because no county filter is
# applied; the subtitle records the NYC exclusion so the saved image is
# self-explanatory.
ny_scatter <- ggplot(
  data = sd_county_pov_enroll |>
    filter(district.x != "New York City Department Of Education"),
  aes(
    x = stpovrate,
    y = pct_bipoc,
    size = enroll,
    color = COUNTY
  )
) +
  geom_point(alpha = .5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_size_area(labels = comma) +
  labs(
    x = "Student Poverty Rate", y = "Percent BIPOC",
    title = "Racial Diversity and Student Poverty in New York School Districts",
    subtitle = "Excluding New York City",
    caption = "Sources: NCES, 2019 and SAIPE, 2019",
    size = "Enrollment"
  ) +
  theme_bw() +
  guides(color = "none") # Remove the color legend

# example code to save the stored plot as a 7" wide by 5" tall .png file
ggsave("data/output/NewYork_school_district_poverty.png", # specify the file path/name/type
  plot = ny_scatter, # specify the ggplot object you stored
  units = "in", # specify the units for your image
  height = 5, width = 7 # specify the image dimensions
)

Homework 5b.

Use the visualization skills you learned today to create 3 plots to explore the New York County data from last week (county poverty, lottery retailers, health indicators).

Follow your interest. Add more data if you desire. On Canvas, upload your plots and a short paragraph describing what each plot shows.