Humana Case Competation
Initial Settings
Data Import & Reshape
varnames <- c("ID", paste0(month.abb, "-01-2016"), paste0(month.abb, "-01-2017"),
paste0(month.abb, "-01-2018"), paste0(month.abb, "-01-2019"))
Demongraphics <- read_xlsx("Demongrahics.xlsx")
Claims <- read_xlsx("Claims.xlsx")
ER_Utilization <- read_xlsx("ER Utilization.xlsx")
Hospital_Admissions <- read_xlsx("Hospital Admissions.xlsx")
PCP_Visits <- read_xlsx("PCP Visits.xlsx")
Rx_Utilization <- read_xlsx("Rx Utilization.xlsx")
# Rename & Reshape====================================================
names(Claims) <- varnames
names(ER_Utilization) <- varnames
names(Hospital_Admissions) <- varnames
names(PCP_Visits) <- varnames
names(Rx_Utilization) <- varnames
Claims <- Claims %>% pivot_longer(`Jan-01-2016`:`Dec-01-2019`, names_to = "Date") %>%
rename(claim = value) %>% mutate(Date = mdy(Date), month = month(Date), year = year(Date))
ER_Utilization <- ER_Utilization %>% pivot_longer(`Jan-01-2016`:`Dec-01-2019`, names_to = "Date") %>%
rename(ER_utilization = value) %>% mutate(Date = mdy(Date), month = month(Date),
year = year(Date))
Hospital_Admissions <- Hospital_Admissions %>% pivot_longer(`Jan-01-2016`:`Dec-01-2019`,
names_to = "Date") %>% rename(Hospital_admissions = value) %>% mutate(Date = mdy(Date),
month = month(Date), year = year(Date))
PCP_Visits <- PCP_Visits %>% pivot_longer(`Jan-01-2016`:`Dec-01-2019`, names_to = "Date") %>%
rename(PCP_visits = value) %>% mutate(Date = mdy(Date), month = month(Date),
year = year(Date))
Rx_Utilization <- Rx_Utilization %>% pivot_longer(`Jan-01-2016`:`Dec-01-2019`, names_to = "Date") %>%
rename(Rx_utilization = value) %>% mutate(Date = mdy(Date), month = month(Date),
year = year(Date))
# Merge ==============================================================
merge_data <- Claims %>% left_join(ER_Utilization, by = c("ID", "Date", "year", "month")) %>%
left_join(Hospital_Admissions, by = c("ID", "Date", "year", "month")) %>% left_join(PCP_Visits,
by = c("ID", "Date", "year", "month")) %>% left_join(Rx_Utilization, by = c("ID",
"Date", "year", "month")) %>% select(ID, Date, month, year, everything())
# Control Group =====================================================
Control_Demongraphics <- Demongraphics %>% filter((Outreach_Transportation == 0) &
(Outreach_FinancialAssistance == 0) & (Outreach_Loneliness == 0) & (Outreach_FoodInsecurity ==
0))
Train_Demongraphics <- Demongraphics %>% filter(!ID %in% Control_Demongraphics$ID)
Train_group <- merge_data %>% filter(!ID %in% Control_Demongraphics$ID)
Control_group <- merge_data %>% filter(ID %in% Control_Demongraphics$ID)
#
write.csv(Control_Demongraphics, "Control_deomongraphics.csv")
write.csv(Train_Demongraphics, "Train_Demongraphics.csv")
write.csv(Train_group, "Train_group.csv")
write.csv(Control_group, "Control_group.csv")
# Data Cleaning Train
Train_Demongraphics <- Train_Demongraphics %>% filter(Low_Income %in% c("N", "Y"),
Region != "Unknown", Rural != "Unknown", Gender %in% c("F", "M"))
Train_group <- Train_group %>% filter(ID %in% Train_Demongraphics$ID)
## Control
Control_Demongraphics <- Control_Demongraphics %>% filter(Region != "Unknown", Rural !=
"Unknown")
Control_group <- Control_group %>% filter(ID %in% Control_Demongraphics$ID)EDA
Demongraphics
Train_Demongraphics %>% ggplot(aes(x = Age)) + geom_density(fill = "darkred", color = "darkred",
alpha = 0.7) + geom_vline(xintercept = round(median(Train_Demongraphics$Age),
2), size = 1.5, color = "black", linetype = 2) + ggtitle("Distribution of Age",
subtitle = paste0("The median of age is", round(median(Train_Demongraphics$Age),
2), "."))
Central Northeast Northwest Southeast Southwest
1272 975 512 1572 585
Central Northeast Northwest Southeast Southwest
650 500 263 729 270
Rural Semi-Rural Suburban Urban
409 934 1550 2023
Rural Semi-Rural Suburban Urban
193 477 725 1017
N Y
2672 2244
N Y
1615 797
Trend in monthly claims
Control_group %>%
filter(!claim > 50000) %>% # outliers
group_by(Date) %>%
summarise(monthly_claim = sum(claim)) %>%
ggplot(aes(x = Date, y = monthly_claim)) +
geom_point(aes(size = log(monthly_claim)), alpha = 0.7) +
geom_smooth() +
labs(y = "Monthly Claim") +
theme(legend.position = "null") +
ggtitle("Trend in monthly claims",
"The aggreagated monthly claim exsits a escalating trend")Aggregation of outreach
Control_group %>% group_by(Date) %>% summarise(ER_utilization = sum(ER_utilization),
Hospital_admissions = sum(Hospital_admissions), PCP_visits = sum(PCP_visits),
Rx_utilization = sum(Rx_utilization)) %>% pivot_longer(ER_utilization:Rx_utilization,
names_to = "Type") %>% ggplot(aes(x = Date, y = value)) + geom_area(aes(fill = Type)) +
labs(y = NULL) + ggtitle("Aggregation of Utilization")library(ggthemes)
agg_demo <- merge_data %>% group_by(ID) %>% summarise(ER_utilization = sum(ER_utilization),
Hospital_admissions = sum(Hospital_admissions), PCP_visits = sum(PCP_visits),
Rx_utilization = sum(Rx_utilization)) %>% left_join(Demongraphics, by = "ID")
agg_demo %>% filter(Gender %in% c("F", "M")) %>% select(ID, ER_utilization:Rx_utilization,
Gender) %>% pivot_longer(ER_utilization:Rx_utilization, names_to = "Type") %>%
mutate(value = ifelse(Gender == "M", -value, value), Type = factor(Type, levels = c("Rx_utilization",
"PCP_visits", "ER_utilization", "Hospital_admissions"))) %>% ggplot(aes(x = Type,
y = value, fill = Gender)) + geom_bar(stat = "identity", width = 0.5) + coord_flip() +
theme_tufte() + theme(axis.ticks = element_blank(), plot.title = element_text(hjust = 0.45)) +
scale_y_continuous(breaks = c(-35000, -20000, 0, 20000, 40000), labels = c(35000,
20000, 0, 20000, 40000)) + labs(x = "Type", y = "Times") + ggtitle("Utilization")