Methods 1, Week 5

Class 5 prep

Open part1 project:

Install the scales package

install.packages("scales"): for formatting numbers as text labels (percentages, thousands separators)

Outline

County joins questions
Homework review
Visualization with ggplot
County, school district poverty analysis continues
Assignment 5

County data joins

library(tidyverse)
library(readxl)

# import NY health data and the lottery retailer list
raw_health_data <- read_excel(
  "data/raw/CountyHealthRankings/2021 County Health Rankings Data - v1.xlsx",
  sheet = "Ranked Measure Data",
  skip = 1 # skip the first row of the sheet before reading column names
)
raw_lottery <- read_csv("data/raw/NYS_Lottery_Retailers.csv")


# import our processed county dataset
county_pov <- read_csv("data/processed/county_pov_rate_2022.csv")

# process lottery data - count of lottery retailers per county
# .groups = "drop" returns an ungrouped result; without it summarise() leaves
# the data grouped by County and prints a message about the regrouping
lottery_count <- raw_lottery |>
  group_by(County, GEOID) |>
  summarise(lottery_retailers = as.numeric(n()), .groups = "drop") |>
  # numeric GEOID is the join key used against conum below
  mutate(GEOID = as.numeric(GEOID))

# process health data - keep the join key (FIPS) and the indicators to explore
ny_health <- raw_health_data |>
  select(
    FIPS,
    `% Severe Housing Problems`,
    `Average Number of Physically Unhealthy Days`,
    `Average Number of Mentally Unhealthy Days`,
    `Food Environment Index`,
    `% Uninsured`,
    `Primary Care Physicians Rate`
  ) |>
  mutate(FIPS = as.numeric(FIPS))

# join lottery counts and health indicators onto the county poverty data;
# left joins keep every row of county_pov
county_data <- county_pov |>
  left_join(lottery_count, by = c("conum" = "GEOID")) |>
  # lottery_retailers is already numeric, so no conversion is needed here
  mutate(lottery_per10k_kids = lottery_retailers / county_child_count * 10000) |>
  left_join(ny_health, by = c("conum" = "FIPS"))

write_csv(county_data, "data/processed/ny_county_health_data_2022.csv")

Visualizations

# Histogram of lottery retailers per 10,000 children (one value per row of county_data)
hist(county_data$lottery_per10k_kids)

Explore data based on histogram

This is your opportunity to learn more about New York and your data, and learn about flaws or biases. When you see an outlier, always take a look:

look at your dataframe and find the outlier
think about why
google to learn about that area to determine if this is a true outlier or a flaw

View the table

Name	conum	county_child_count	county_child_poverty_count	county_child_poverty_rate	County	lottery_retailers	lottery_per10k_kids	% Severe Housing Problems	Average Number of Physically Unhealthy Days	Average Number of Mentally Unhealthy Days	Food Environment Index	% Uninsured	Primary Care Physicians Rate
Hamilton County	36041	496.063	63	0.127	Hamilton	11	221.7460	13.69863	4.123383	4.654123	7.4	6.264349	22.55300
Essex County	36031	3946.667	592	0.150	Essex	47	119.0878	12.55326	4.134879	3.998942	8.6	4.495877	37.53351
Delaware County	36025	4989.899	988	0.198	Delaware	59	118.2389	17.06806	4.386655	4.817896	8.3	5.842152	33.68743
Greene County	36039	5230.769	816	0.156	Greene	59	112.7941	17.74335	4.328537	4.503879	8.1	5.599767	37.90192
Otsego County	36077	6224.138	1083	0.174	Otsego	68	109.2521	16.19048	4.117031	4.524647	7.9	5.533152	115.48310

% Severe Housing Problems

# Histogram of the `% Severe Housing Problems` column of county_data
hist(county_data$`% Severe Housing Problems`)

Average Number of Physically Unhealthy Days

# Histogram of the `Average Number of Physically Unhealthy Days` column of county_data
hist(county_data$`Average Number of Physically Unhealthy Days`)

Average Number of Mentally Unhealthy Days

# Histogram of the `Average Number of Mentally Unhealthy Days` column of county_data
hist(county_data$`Average Number of Mentally Unhealthy Days`)

Food Environment Index

look at the data documentation for the variable

# Histogram of the `Food Environment Index` column of county_data
hist(county_data$`Food Environment Index`)

% Uninsured

# Histogram of the `% Uninsured` column of county_data
hist(county_data$`% Uninsured`)

Primary Care Physicians Rate

# Histogram of the `Primary Care Physicians Rate` column of county_data
hist(county_data$`Primary Care Physicians Rate`)

Scatterplots: Average Number of Physically Unhealthy Days

# Scatterplot: county child poverty rate (x) vs. average number of physically unhealthy days (y)
plot(county_data$county_child_poverty_rate, county_data$`Average Number of Physically Unhealthy Days`)

Scatterplots: Food Environment Index

# Scatterplot: county child poverty rate (x) vs. food environment index (y)
plot(county_data$county_child_poverty_rate, county_data$`Food Environment Index`)

Scatterplots: Primary Care Physicians Rate

# Scatterplot: county child poverty rate (x) vs. primary care physicians rate (y)
plot(county_data$county_child_poverty_rate, county_data$`Primary Care Physicians Rate`)

Homework

Electoral Votes = 538

Seats in the U.S. House of Representatives = 435
Seats in the Senate = 100
D.C. Electoral Votes = 3

Seats in the U.S. House of Representatives

allocated to each state by population
- U.S. population (2020) ~ 331 million
- Each House District ~ 761,000

Seats in the U.S. Senate

each state has 2 Senators, regardless of population

Homework script

library(tidyverse)
library(readxl)

## print numbers in full rather than in scientific notation
options(scipen = 999)

# read the apportionment table and the race/ethnicity table
raw_apportion <- read_excel(
  "data/raw/apportionment-2020-table01.xlsx",
  skip = 3
)
raw_race <- read_csv(
  "data/raw/DECENNIALPL2020.P2_Hispanic_Latino_by_race/DECENNIALPL2020.P2_data.csv"
)

# metadata file that accompanies the race/ethnicity table
race_documentaion <- read_csv(
  "data/raw/DECENNIALPL2020.P2_Hispanic_Latino_by_race/DECENNIALPL2020.P2_metadata.csv"
)

# race data: each share uses P2_001N as its denominator;
# percent_bipoc is everything that is not counted in percent_white
race <- raw_race |>
  mutate(
    percent_latinx = P2_002N / P2_001N,
    percent_white = P2_005N / P2_001N,
    percent_black = P2_006N / P2_001N,
    percent_bipoc = 1 - percent_white
  ) |>
  select(
    GEO_ID, NAME,
    percent_latinx, percent_white, percent_black, percent_bipoc
  )

# apportionment: electoral votes = apportioned representatives + 2,
# then population per vote and votes per million people;
# the full join keeps rows that appear in only one of the two tables
apportion <- raw_apportion |>
  select(
    GEO_ID,
    STATE,
    pop = POPULATION,
    representatives = `APPORTIONED REPRESENTATIVES`
  ) |>
  mutate(
    electoral_votes = representatives + 2,
    pop_per_electoral_vote = round(pop / electoral_votes),
    electoral_votes_per_million = electoral_votes / pop * 1e6
  ) |>
  full_join(race, by = "GEO_ID")

Electoral votes histogram

# Histogram of electoral votes per million people across the rows of apportion
hist(apportion$electoral_votes_per_million)

Electoral votes data summary

# Print a per-column summary of apportion (quartiles, mean and NA counts for numeric columns)
summary(apportion)

    GEO_ID             STATE                pop           representatives
 Length:52          Length:52          Min.   :  577719   Min.   : 1.00  
 Class :character   Class :character   1st Qu.: 1871866   1st Qu.: 2.25  
 Mode  :character   Mode  :character   Median : 4585405   Median : 6.00  
                                       Mean   : 6622169   Mean   : 8.70  
                                       3rd Qu.: 7576690   3rd Qu.: 9.75  
                                       Max.   :39576757   Max.   :52.00  
                                       NA's   :2          NA's   :2      
 electoral_votes pop_per_electoral_vote electoral_votes_per_million
 Min.   : 3.00   Min.   :192573         Min.   :1.364              
 1st Qu.: 4.25   1st Qu.:430223         1st Qu.:1.541              
 Median : 8.00   Median :564942         Median :1.770              
 Mean   :10.70   Mean   :525324         Mean   :2.148              
 3rd Qu.:11.75   3rd Qu.:648857         3rd Qu.:2.326              
 Max.   :54.00   Max.   :732903         Max.   :5.193              
 NA's   :2       NA's   :2              NA's   :2                  
     NAME           percent_latinx    percent_white      percent_black     
 Length:52          Min.   :0.01942   Min.   :0.007471   Min.   :0.001304  
 Class :character   1st Qu.:0.05943   1st Qu.:0.548545   1st Qu.:0.032242  
 Mode  :character   Median :0.10508   Median :0.663434   Median :0.070190  
                    Mean   :0.14252   Mean   :0.640237   Mean   :0.105462  
                    3rd Qu.:0.15448   3rd Qu.:0.759888   3rd Qu.:0.140235  
                    Max.   :0.98879   Max.   :0.901571   Max.   :0.409061  
                                                                           
 percent_bipoc    
 Min.   :0.09843  
 1st Qu.:0.24011  
 Median :0.33657  
 Mean   :0.35976  
 3rd Qu.:0.45145  
 Max.   :0.99253

Electoral votes plot (white)

# Scatterplot: share white (x) vs. electoral votes per million people (y)
plot(apportion$percent_white, apportion$electoral_votes_per_million)

Electoral votes plot (Black)

# Scatterplot: share Black (x) vs. electoral votes per million people (y)
plot(apportion$percent_black, apportion$electoral_votes_per_million)

Look at the continental US

# Drop Alaska and Hawaii. filter() keeps only rows where every condition is
# TRUE, so any row whose STATE is NA is dropped as well.
# NOTE(review): rows added by the earlier full_join with no apportionment match
# presumably have NA STATE and are removed here too - confirm.
continental_us <- apportion |> 
  filter(STATE != "Alaska",
         STATE != "Hawaii")

# Re-draw the percent-white scatterplot using only the remaining rows
plot(continental_us$percent_white, continental_us$electoral_votes_per_million)

Visualization as part of analysis

Visualization is a tool:

to explore our datasets
check results
share with colleagues
share final analysis results

ggplot scatterplot

ggplot code

library(tidyverse)
library(scales)

# ggplot version of the scatterplot: percent white (x) against electoral
# votes per million people (y), with formatted axes and descriptive labels
continental_us |>
  ggplot(mapping = aes(x = percent_white, y = electoral_votes_per_million)) +
  geom_point() +
  # x-axis labels shown as whole-number percentages
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  # y-axis labels formatted with the scales comma() formatter
  scale_y_continuous(labels = comma) +
  labs(
    title = "Race and Electoral Power",
    x = "Percent White",
    y = "Electoral Votes per Million People",
    caption = "Source: U.S. Census, 2020. \n       ** Note, excludes Alaska, Hawaii, and Washington D.C."
  )

`ggplot2`

Tidyverse package for producing plots of your dataframe

Every ggplot has 3 required components:

data: the dataframe you want to visualize
aes: variables in the dataframe that you want to visualize
at least one layer that defines what type of plot you want to create
- examples: points, bars, lines
you can add many more elements to make it look nicer

ggplot scatterplot example

ggplot(
  data = apportion,                      # data: the dataframe to plot
  mapping = aes(                         # aesthetics: columns mapped to x and y
    x = percent_white,
    y = electoral_votes_per_million
  )
) +
  geom_point()                           # layer: draw one point per row

ggplot line example

A line graph doesn’t make sense for this data, but as an example:

the layer type determines how you display the data

ggplot(
  data = apportion,                      # data: the dataframe to plot
  mapping = aes(                         # aesthetics: columns mapped to x and y
    x = percent_white,
    y = electoral_votes_per_million
  )
) +
  geom_line()                            # layer: connect the observations with a line

Analysis plan

Create dataframe of poverty rate by county
Create dataframe of student poverty rate by school district
Calculate the statewide student poverty rate
Join the school district and county poverty dataframes to compare the poverty rates
Measure the difference between each school district’s poverty rate and the rates of its county and the state
Use summary statistics to explore and gain understanding
Use visualizations to explore and gain understanding

Analysis so far

We have 3 scripts for our analysis so far:

new_york_student_poverty_2022
ny_county_poverty_rate_22
analyze_ny_poverty

Visualization plan

What counties have the most economic inequality, as measured by the student poverty rate of school districts?

Add school district enrollment data for context
Create a scatterplot to explore the county with the largest range in student poverty
Create scatterplots to explore other counties
Create scatterplots to explore the state as a whole

Visualization script

Create a new script visualize_poverty_analysis.R

add necessary packages
import data
join school district enrollment data

library(tidyverse)
library(scales)

### County-level summary data, used to pick the counties we want to focus on
county_stats <- read_csv("data/output/ny_county_poverty_stats.csv")

# Extra school district data; district_id is converted to numeric because it
# is the key matched against id in the join below
sd_enroll <- mutate(
  read_csv("data/raw/school_district_demographics_2022.csv"),
  district_id = as.numeric(district_id)
)

# School district - county poverty data
sd_county_pov <- read_csv("data/processed/ny_sd_county_pov_data.csv")

# Attach the school district data to each row of the poverty data
# (left join: every row of sd_county_pov is kept)
sd_county_pov_enroll <- left_join(
  sd_county_pov,
  sd_enroll,
  by = c(id = "district_id")
)

What county has the largest range in student poverty?

Orange County scatterplot, v1

### Scatterplot for the county with the largest range in student poverty:
### student poverty rate (x) against percent BIPOC (y), Orange County only
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(mapping = aes(x = stpovrate, y = pct_bipoc)) +
  geom_point()

Orange County scatterplot, v2 code

Within the aesthetic mapping aes():
- size the dots by enrollment
- color by pct_bipoc

# v2: dot size follows enroll, dot color follows pct_bipoc
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point()

Orange County scatterplot, v2 plot

Orange County scatterplot, v3 code

Within geom_point() make the dots partially transparent with alpha = .65 (65% opaque, 35% transparent)

# v3: same mapping as v2, with semi-transparent dots so overlaps stay visible
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point(alpha = 0.65)

Orange County scatterplot, v3 plot

Orange County scatterplot, v4 code

Format the axis labels as percent, with no decimal place

use scale_x_continuous() to format the x-axis
percent_format() is a scales package function
accuracy = 1 rounds to a whole number
- accuracy = .1 includes one decimal place

# v4: both axes labelled as whole-number percentages
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point(alpha = 0.5) +
  # accuracy = 1 rounds the axis labels to whole percentages
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1))

Orange County scatterplot, v4 plot

Orange County scatterplot, v5 code

Define a nice-looking title, caption, axis labels, and legend labels within the labs() function

# v5: title, caption, axis titles and legend titles set through labs()
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Racial Diversity and Student Poverty in Orange County School Districts",
    caption = "Sources: NCES, 2019 and SAIPE, 2019",
    x = "Student Poverty Rate",
    y = "Percent BIPOC",
    # size and color name the legends of the matching aesthetics
    size = "Enrollment",
    color = "Percent BIPOC"
  )

Orange County scatterplot, v5 plot

Orange County scatterplot, v6 code

Add a theme to add some standard styling

see other themes

# v6: same plot as v5 with the built-in black-and-white theme applied
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Racial Diversity and Student Poverty in Orange County School Districts",
    caption = "Sources: NCES, 2019 and SAIPE, 2019",
    x = "Student Poverty Rate",
    y = "Percent BIPOC",
    size = "Enrollment",
    color = "Percent BIPOC"
  ) +
  theme_bw()

Orange County scatterplot, v6 plot

Orange County scatterplot, v7 code

Format the legend labels

# v7: legend labels formatted - size legend with commas, color legend as
# whole-number percentages
sd_county_pov_enroll |>
  filter(COUNTY == "Orange County") |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = pct_bipoc
    )
  ) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_size_area(labels = comma) +
  scale_color_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Racial Diversity and Student Poverty in Orange County School Districts",
    caption = "Sources: NCES, 2019 and SAIPE, 2019",
    x = "Student Poverty Rate",
    y = "Percent BIPOC",
    size = "Enrollment",
    color = "Percent BIPOC"
  ) +
  theme_bw()

Orange County scatterplot, v7 plot

New York scatterplot code

Remove the filter to look at New York as a whole

# All rows of sd_county_pov_enroll (no county filter): dots colored by COUNTY
sd_county_pov_enroll |>
  ggplot(
    mapping = aes(
      x = stpovrate,
      y = pct_bipoc,
      size = enroll,
      color = COUNTY
    )
  ) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_size_area(labels = comma) +
  labs(
    title = "Racial Diversity and Student Poverty in New York School Districts",
    caption = "Sources: NCES, 2019 and SAIPE, 2019",
    x = "Student Poverty Rate",
    y = "Percent BIPOC",
    size = "Enrollment",
    color = "County"
  ) +
  theme_bw() +
  # hide only the color legend; the size legend is still drawn
  guides(color = "none")

New York scatterplot

New York scatterplot (no nyc) code

Remove New York City to see how it changes

# Statewide scatterplot with the New York City district removed.
# The title names New York (not Orange County) because no county filter is
# applied here; the subtitle records the NYC exclusion.
ggplot(data = sd_county_pov_enroll |> 
         filter(district.x != "New York City Department Of Education"), 
  aes(x = stpovrate, 
      y = pct_bipoc,
      size = enroll,
      color = COUNTY)) +
  geom_point(alpha = .5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) + 
  scale_size_area(labels = comma) +
  labs(x = "Student Poverty Rate", y = "Percent BIPOC",
       title = "Racial Diversity and Student Poverty in New York School Districts",
       subtitle = "Excluding New York City",
       caption = "Sources: NCES, 2019 and SAIPE, 2019",
       size = "Enrollment",
       color = "County") +
  theme_bw() +
  guides(color = "none")  # Remove the color legend

New York scatterplot (no nyc)

Save a plot

To save a plot:

first save it as an object
use ggsave to save it

# Store the plot in an object so it can be handed to ggsave().
# The data excludes the New York City district, so the title names New York
# School Districts and the subtitle states the exclusion.
ny_scatter <- ggplot(data = sd_county_pov_enroll |> 
         filter(district.x != "New York City Department Of Education"), 
  aes(x = stpovrate, 
      y = pct_bipoc,
      size = enroll,
      color = COUNTY)) +
  geom_point(alpha = .5) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) + 
  scale_size_area(labels = comma) +
  labs(x = "Student Poverty Rate", y = "Percent BIPOC",
       title = "Racial Diversity and Student Poverty in New York School Districts",
       subtitle = "Excluding New York City",
       caption = "Sources: NCES, 2019 and SAIPE, 2019",
       size = "Enrollment") +
  theme_bw() +
  guides(color = "none")  # Remove the color legend

# example code to save the stored plot as a 5" (height) by 7" (width) .png file
ggsave("data/output/NewYork_school_district_poverty.png", #specify the file path/name/type
       plot = ny_scatter, # specify the ggplot object you stored
       units = "in", # specify the units for your image
       height = 5, width = 7) # specify the image dimensions

Homework 5b.

Use the visualization skills you learned today to create 3 plots to explore the New York county-level data from last week (county poverty, lottery retailers, health indicators).

Follow your interest. Add more data if you desire. On Canvas, upload your plots and a short paragraph describing what each plot shows.