QBS181_ProblemSet3

Part One

Question One

Recall the HigherMe dataset. On the canvas homepage, I’ve linked a csv file which contains the data cleaning up to the step where we need to convert those invoices which are quarterly to monthly invoices. Using the same logic you applied in the excel project, convert quarterly invoices to monthly invoices in R. Display the first 10 rows of your updated dataset.

setwd("C:/Users/10213/OneDrive/桌面")

higherme <- read.csv("HigherMeDataForR.csv")

rmCurrency <- function(x){
  x <- trimws(as.character(x))
  x <- gsub("\\$", "", x)
  x <- gsub(",", "", x)
  x[x == "-"] <- "0"
  as.numeric(x)
}

higherme$Amount <- rmCurrency(higherme$Amount)

colnames(higherme)[colnames(higherme) == "Quarterly.Monthly"] <- "Quarterly_Monthly"

quarterly <- subset(higherme, Quarterly_Monthly == "QUARTERLY")
monthly_expanded <- quarterly[rep(1:nrow(quarterly), each = 3), ]
monthly_expanded$Quarterly_Monthly <- "MONTHLY"
monthly_expanded$Amount <- monthly_expanded$Amount / 3
months <- c("Month1", "Month2", "Month3")
monthly_expanded$Month <- rep(months, times = nrow(quarterly))

monthly_only <- subset(higherme, Quarterly_Monthly == "MONTHLY")
monthly_only$Month <- NA

updated <- rbind(monthly_only, monthly_expanded)

head(updated, 10)

Question Two

Recall the hospital database. Recreate the physician-referral table from Question 9 on your SQL homework using R. I’ve loaded a zip folder of all the necessary csv files to do this on canvas, as well as a snapshot of what the table should look like for your reference.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.2     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)

## Warning: package 'janitor' was built under R version 4.5.2

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

physician <- read_delim("Physician.csv", delim = ";", quote = "\"") %>% clean_names()

## Rows: 9 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (2): Name, Position
## dbl (2): EmployeeID, SSN
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

patient <- read_delim("Patient.csv", delim = ";", quote = "\"") %>% clean_names()

## Rows: 103 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (3): Name, Address, Phone
## dbl (3): SSN, InsuranceID, PCP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

undergoes <- read_delim("Undergoes.csv", delim = ";", quote = "\"") %>% clean_names()

## Rows: 35 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl  (5): Patient, Medical_Procedure, Stay, Physician, AssistingNurse
## dttm (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

affiliated <- read_delim("Affiliated_With.csv", delim = ";", quote = "\"") %>% clean_names()

## Rows: 11 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (3): Physician, Department, PrimaryAffiliation
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dept <- read_delim("Department.csv", delim = ";", quote = "\"") %>% clean_names()

## Rows: 3 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (1): Name
## dbl (2): DepartmentID, Head
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

procedure <- read_delim("Medical_Procedure.csv", delim = ";", quote = "\"") %>% clean_names()

## Rows: 7 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (1): Name
## dbl (2): Code, Cost
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

referral_data <- patient %>%
  left_join(physician, by = c("pcp" = "employee_id")) %>%
  rename(referring_physician = name.y,
         patient_name = name.x) %>%
  select(-patient_name) %>%
  left_join(undergoes, by = c("ssn.x" = "patient")) %>%
  left_join(physician, by = c("physician" = "employee_id")) %>%
  rename(referral = name) %>%
  left_join(affiliated, by = c("pcp" = "physician")) %>%
  rename(primary_department_id = department) %>%
  left_join(dept, by = c("primary_department_id" = "department_id")) %>%
  rename(primary_department = name) %>%
  left_join(affiliated, by = c("physician" = "physician")) %>%
  rename(referral_department_id = department) %>%
  left_join(dept, by = c("referral_department_id" = "department_id")) %>%
  rename(referral_department = name) %>%
  left_join(procedure, by = c("medical_procedure" = "code")) %>%
  filter(referring_physician != referral)

## Warning in left_join(., affiliated, by = c(pcp = "physician")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 6 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Warning in left_join(., affiliated, by = c(physician = "physician")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 8 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

referral_summary <- referral_data %>%
  group_by(referring_physician, referral, primary_department, referral_department) %>%
  summarise(
    shared_patients = n_distinct(ssn.x),
    shared_billing_cost = sum(cost, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(shared_billing_cost = ifelse(shared_billing_cost == 0, NA, shared_billing_cost)) %>%
  arrange(referring_physician, desc(shared_patients), referral)

head(referral_summary, 15)



### Question Three

Which tool did you find it 'easiest' to use while completing these exercises? What advice would you give novice data wranglers when it comes to choosing between Excel, SQL, and R? Please make your answer either a different text colour, or bolded, when you knit this document so TA's can find it. 

**hint, I'll be bolded when this knits**

**I found R the easiest to use** while completing these exercises.  
R allows me to automate data cleaning and merging tasks that would take much longer in Excel, and it is more flexible than SQL when dealing with mixed data types.  

My advice for novice data wranglers is to start with Excel to understand the basic logic of data manipulation, then move to R for automation and reproducibility. **R is powerful once you get familiar with its syntax.**


## Part Two

We are going download US Census data using the Census API. To start, you will need to request a key here: <https://api.census.gov/data/key_signup.html>. 

We'll be using the following package:



A vignette demonstrating much of the functionality of this package can be found here <https://walker-data.com/census-r/index.html>

Start by setting your API key. 


``` r
census_api_key("3a275dd0931a9e2488f36366b88616b79968a68c", overwrite = TRUE, install = TRUE)

## Your original .Renviron will be backed up and stored in your R HOME directory if needed.

## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY"). 
## To use now, restart R or run `readRenviron("~/.Renviron")`

## [1] "3a275dd0931a9e2488f36366b88616b79968a68c"

The function ‘get_acs()’ will download the American Community Survey (ACS) Census data. You will need to know the variable ID - and there are thousands of variables across the different files. To rapidly search for variables, use the commands ‘load_variables()’ and ‘View()’. We’ll do this below:

v19 <- load_variables(2019, "acs5", cache = TRUE)
View(v19)

As you can see, there are many types of data avaiable to us in the census. In the View table, you can user filters to explore the kind of data that is available to you. For instance, try fitering by ‘income’ in the concept column.

The full metadata is available here https://www.socialexplorer.com/data/ACS2019_5yr/metadata/.

For now, we’ll use the following:

newEngDat <- get_acs(geography = "new england city and town area", 
              year = 2019,
              variables = c(popn = "B03002_001", 
                            white = "B03002_003", blk = "B03002_004", 
                            asn = "B03002_006", hisp = "B03002_012", 
                            medHouseInc="B19013_001", hlthInsCov="B27001_001",
                            workPop="B08604_001",workTravel="B08013_001",
                            workHome="B08006_017", mthExp="B25088_001", 
                            mthHousing="B25105_001"), 
              survey = "acs5",
              output = "wide")

## Getting data from the 2015-2019 5-year ACS

In the above code, we specified the following arguments:

geography: The level of geography we want the data in year: The end year of the data (because we want 2015-2019, we use 2019). variables: The variables we want to bring in as specified in a vector you create using the function c(). Note that we created variable names of our own (e.g. “popn”) and we put the ACS IDs in quotes (“B03002_001”). survey: The specific Census survey were extracting data from. We want data from the 5-year American Community Survey, so we specify “acs5”. The ACS comes in 1-, 3-, and 5-year varieties. output: gives us a traditional dataset, alternatively “tidy” would give us a tibble.

See ?get_acs for more variables you could request.

We then have the following columns in our data:

GEOID: A unique ID variable of the geography Name: The Name of the geographic area popn: The total population white: The population of people who identify as white blk: The population of people who identify as black asn: The population of people who identify as asian hisp: The population of people who identify as hispanic medHouseInc: The median household income hlthInsCov: The population who have health insurance coverage workPop: The worker population workTravel: Aggregrate travel time to work, in minutes workHome: Number of workers who work from home mthExp: Median monthly cost of living estimate mthHousing: Median housing costs per month

You’ll notice that there is an ‘E’ and an ‘M’ beside each of the column names in your dataset. The ‘E’ stands for estimate, and ‘M’ margin of error. While important, we will not be analyzing margins of error.

Question Four

Remove the margin of error columns, and then remove the ‘E’ from the end of the other column names.

# your code here
# Remove margin of error columns (those ending with "M")
newEngDat <- newEngDat[, !grepl("M$", names(newEngDat))]

# Remove "E" from the end of the remaining variable names
names(newEngDat) <- sub("E$", "", names(newEngDat))

# View cleaned dataset
head(newEngDat)

Question 5

Which 10 communities have the largest proportion of their working population work from home?

library(dplyr)

top10_workhome <- newEngDat %>%
  mutate(workHomeProp = workHome / workPop) %>%  
  arrange(desc(workHomeProp)) %>%                
  select(NAM, workHome, workPop, workHomeProp) %>%
  head(10)                                        

top10_workhome

Question 6

We’ll define discretionary income as Income-Expenses. Right now, you have annual income and monthly expenses. Create a new column to calculate monthly discretionay income, and display the towns with the highest amounts of discretionary income.

# your code here
library(dplyr)

top10_income <- newEngDat %>%
  mutate(discIncome = (medHouseInc / 12) - mthExp) %>%  
  arrange(desc(discIncome)) %>%                        
  select(NAM, medHouseInc, mthExp, discIncome) %>%    
  head(10)                                            

top10_income

Question 7

Which 5 towns have the largest proportional gaps in healthcare coverage?

# your code here
library(dplyr)

top5_health_gap <- newEngDat %>%
  mutate(coverageRate = hlthInsCov / popn,             
         coverageGap = 1 - coverageRate) %>%         
  arrange(desc(coverageGap)) %>%                       
  select(NAM, popn, hlthInsCov, coverageRate, coverageGap) %>%
  head(5)                                            

top5_health_gap

Question 8

The divesity index of a geographic area is the probability that two people selected at random will be the same race. Create a function which will sample from the reported ethnic population in each geographic area and return the diversity index. Display the top 5 diverse towns.

# your code here
library(dplyr)

calc_diversity <- function(white, blk, asn, hisp, popn) {
  p_white <- white / popn
  p_blk   <- blk / popn
  p_asn   <- asn / popn
  p_hisp  <- hisp / popn
  
  same_prob <- p_white^2 + p_blk^2 + p_asn^2 + p_hisp^2
  diversity <- 1 - same_prob
  return(diversity)
}

top5_diverse <- newEngDat %>%
  mutate(diversity_index = calc_diversity(white, blk, asn, hisp, popn)) %>%
  arrange(desc(diversity_index)) %>%
  select(NAM, white, blk, asn, hisp, popn, diversity_index) %>%
  head(5)

top5_diverse

Question 9

Convert the ethnicity columns to be percentages. Make a boxplot where each ethnicity is represented on the x-axis, and percent is on the y-axis. Points will be awarded for ‘prettier’ plots!

library(ggplot2)
library(dplyr)
library(tidyr)

ethnicity_pct <- newEngDat %>%
  mutate(white_pct = white / popn * 100,
         blk_pct   = blk / popn * 100,
         asn_pct   = asn / popn * 100,
         hisp_pct  = hisp / popn * 100) %>%
  select(NAM, white_pct, blk_pct, asn_pct, hisp_pct)

ethnicity_long <- ethnicity_pct %>%
  pivot_longer(cols = c(white_pct, blk_pct, asn_pct, hisp_pct),
               names_to = "Ethnicity",
               values_to = "Percent")

ggplot(ethnicity_long, aes(x = Ethnicity, y = Percent, fill = Ethnicity)) +
  geom_boxplot(alpha = 0.8, outlier.color = "red", color = "gray40") +
  geom_jitter(width = 0.15, alpha = 0.4, color = "black") +
  labs(
    title = "Ethnic Composition Across New England Towns",
    subtitle = "Distribution of major ethnic groups as a share of local population (%)",
    x = "Ethnicity",
    y = "Percentage of Population (%)"
  ) +
  scale_fill_brewer(palette = "Pastel1") +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
    plot.subtitle = element_text(hjust = 0.5, size = 12, color = "gray30"),
    legend.position = "none"
  )

Question 10

Ask a question of your choosing. Output both the head of a table, and a simple plot answering your question. Feel free to use the API to import extra variables that may be of interest to you. My Question is: Which New England towns spend the largest share of their income on housing?

library(dplyr)
library(ggplot2)

housing_burden <- newEngDat %>%
  mutate(housing_share = (mthHousing * 12) / medHouseInc * 100) %>%
  select(NAM, medHouseInc, mthHousing, housing_share) %>%
  arrange(desc(housing_share))

head(housing_burden)

top10_housing <- housing_burden %>%
  slice_max(housing_share, n = 10)

ggplot(top10_housing, aes(x = reorder(NAM, housing_share), y = housing_share)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 New England Towns by Housing Cost Burden",
       subtitle = "Housing cost as a percentage of median household income",
       x = "Town",
       y = "Housing Share (%)") +
  theme_minimal(base_size = 13)