OD <- read_csv("~/Documents/School/Junior Year/Fall24/Exploratory Cloud/DrugOverdose_EDA_AndrewMarchant/Data/Drug Overdose 2022.csv",
col_types = cols(Opioids = col_double(),
Heroin = col_double(), `Natural & semi-synthetic opioids, incl. methadone (T40.2, T40.3)` = col_double(),
`Natural, semi-synthetic, & synthetic opioids, incl. methadone (T40.2-T40.4)` = col_double(),
`Synthetic opioids, excl. methadone (T40.4)` = col_double(),
`Psychostimulants with abuse potential (T43.6)` = col_double()), show_col_types = FALSE)
Census <- read_csv("~/Documents/School/Junior Year/Fall24/Exploratory Cloud/DrugOverdose_EDA_AndrewMarchant/Data/Census.csv", show_col_types = FALSE)
OD$NAME <- OD$`State Name`
OD.Merged <- inner_join(Census, OD, by = "NAME")
Depression.Rate <- read_csv("~/Documents/School/Junior Year/Fall24/Exploratory Cloud/DrugOverdose_EDA_AndrewMarchant/Data/depression-rates-by-state-2024.csv", show_col_types = FALSE)
Homeless.Population <- read_csv("~/Documents/School/Junior Year/Fall24/Exploratory Cloud/DrugOverdose_EDA_AndrewMarchant/Data/homeless-population-by-state-2024.csv", show_col_types = FALSE)
Alcohol.Consumption <- read_csv("~/Documents/School/Junior Year/Fall24/Exploratory Cloud/DrugOverdose_EDA_AndrewMarchant/Data/alcohol-consumption-by-state-2024 (1).csv", show_col_types = FALSE)
Depression.Rate$NAME <- Depression.Rate$state
Homeless.Population$NAME <- Homeless.Population$state
Alcohol.Consumption$NAME <- Alcohol.Consumption$state
OD.Merged <- inner_join(Depression.Rate, OD.Merged, by = "NAME")
OD.Merged <- inner_join(Homeless.Population, OD.Merged, by = "NAME")
OD.Merged <- inner_join(Alcohol.Consumption, OD.Merged, by = "NAME")
OD.Merged <- OD.Merged[, c(4, 8, 15, 18:19, 21:22, 24:29)]
OD.Merged$OD.Percentage <- OD.Merged$`Number of Drug Overdose Deaths` / OD.Merged$`Number of Deaths` * 100
OD.Merged1 <- OD.Merged
colnames(OD.Merged1)[1] = "Depression"
colnames(OD.Merged1)[2] = "Homelessness"
colnames(OD.Merged1)[3] = "Alcohol"
colnames(OD.Merged1)[14] = "ODPercent"
subset_OD_Data <- OD.Merged[, c(1:3, 14)]
colnames(subset_OD_Data)[1] <- "Alcohol.Excessive"
colnames(subset_OD_Data)[2] <- "Homeless.Pop"
colnames(subset_OD_Data)[3] <- "Depression.Rate"
colnames(subset_OD_Data)[4] <- "OD.Percentage"Overdose_EDA_Week2
Clean Data
https://worldpopulationreview.com
Shiny App
Bar Graphs
# Define UI for application
ui <- fluidPage(
titlePanel("State Data Dashboard"),
# Sidebar layout
sidebarLayout(
sidebarPanel(
selectInput("x_variable", "Select a X variable:",
choices = c("Depression Rate" = "Depression",
"Homelessness" = "Homelessness",
"Alcohol Use" = "Alcohol",
"OD Percent" = "ODPercent")),
selectInput("y_variable", "Select a Y Variable",
choices = c("Depression Rate" = "Depression",
"Homelessness" = "Homelessness",
"Alcohol Use" = "Alcohol",
"OD Percent" = "ODPercent"))
),
# Main panel for displaying outputs
mainPanel(
plotlyOutput("statePlot"),
plotlyOutput("ComparisonPlot")
)
)
)
# Define server logic
server <- function(input, output) {
output$statePlot <- renderPlotly({
# Generate plot based on the selected input
ggplot(OD.Merged1, aes_string(x = "State", y = input$x_variable)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
labs(y = input$x_variable, x = "State") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
})
output$ComparisonPlot <- renderPlotly({
ggplot(OD.Merged1, aes_string(x = input$x_variable, y = input$y_variable)) +
geom_text(aes(label = State), color = "black", alpha = .5) +
theme_minimal() +
labs(x = input$x_variable, y = input$y_variable)
})
}
# Run the application
shinyApp(ui = ui, server = server) K means clustering plots between: excessive alcohol consumption, homeless population, depression rate, compared to OD Percentage
# Define UI for application that draws a histogram
ui <- fluidPage(
# Application title
titlePanel("K Means Cluster Plots"),
# Sidebar with a slider input for number of bins
sidebarLayout(
sidebarPanel(
selectInput("x_variable", "Select x variable",
choices = c("Excessive Alcohol Consumption" = "Alcohol.Excessive",
"Homeless Population" = "Homeless.Pop",
"Depression Rate" = "Depression.Rate",
"Overdose Percentage" = "OD.Percentage")
),
selectInput("y_variable", "Select y variable",
choices = c("Excessive Alcohol Consumption" = "Alcohol.Excessive",
"Homeless Population" = "Homeless.Pop",
"Depression Rate" = "Depression.Rate",
"Overdose Percentage" = "OD.Percentage")),
selectInput("z_variable", "Select z variable",
choices = c("Excessive Alcohol Consumption" = "Alcohol.Excessive",
"Homeless Population" = "Homeless.Pop",
"Depression Rate" = "Depression.Rate",
"Overdose Percentage" = "OD.Percentage",
"None")),
sliderInput("clusters", "Number of Clusters:", min = 1, max = 10, value = 3)
),
# Show a plot of the generated distribution
mainPanel(
plotlyOutput("KMeansClustering"),
plotlyOutput("ElbowGraph")
)
)
)
# Define server logic required to draw a histogram
server <- function(input, output) {
output$KMeansClustering <- renderPlotly({
if (input$z_variable == "None") {
set.seed(123)
k <- input$clusters
x_var <- input$x_variable
y_var <- input$y_variable
df <- subset_OD_Data[, c(x_var,y_var)]
df <- scale(df)
km.res <- kmeans(df, k, nstart = 25)
fviz_cluster(km.res, data = df,
geom = "point", ellipse.type = "norm",
main = paste("Clustering of", x_var, "vs", y_var),
xlab = input$x_var, ylab = input$y_var)
} else {
set.seed(123)
x_var <- input$x_variable
y_var <- input$y_variable
z_var <- input$z_variable
df <- subset_OD_Data[, c(x_var,y_var, z_var)]
df <- scale(df)
k <- input$clusters
df <- as.data.frame(df)
km.res <- kmeans(df, k, nstart = 25)
df$cluster <- as.factor(km.res$cluster)
plot_ly(df, x = ~df[[x_var]], y = ~df[[y_var]], z = df[[z_var]],
color = ~cluster, colors = c("#1f77b4", "#ff7f0e", "#2ca02c"),
type = "scatter3d", mode = "markers", marker = list(size = 5)) %>%
layout(scene = list(xaxis = list(title = input$x_var),
yaxis = list(title = input$y_var),
zaxis = list(title = input$z_var)))
}
})
output$ElbowGraph <- renderPlotly({
if (input$z_variable == "None") {
x_var <- input$x_variable
y_var <- input$y_variable
df <- subset_OD_Data[, c(x_var,y_var)]
df <- scale(df)
fviz_nbclust(df, kmeans, method = "wss") +
labs(title = "Elbow Method for Optimal Clusters")
} else {
x_var <- input$x_variable
y_var <- input$y_variable
z_var <- input$z_variable
df <- subset_OD_Data[, c(x_var,y_var, z_var)]
df <- scale(df)
fviz_nbclust(df, kmeans, method = "wss") +
labs(title = "Elbow Method for Optimal Clusters")
}
})
}
# Run the application
shinyApp(ui = ui, server = server)T-Tests
OD.Merged$OverdoseRatePer100k <- OD.Merged$`Number of Drug Overdose Deaths` / OD.Merged$POPESTIMATE2019 * 100000
high_alc <- subset(OD.Merged, AlcoholConsumptionExcessiveDrinkingRate > median(AlcoholConsumptionExcessiveDrinkingRate, na.rm=TRUE))
low_alc <- subset(OD.Merged, AlcoholConsumptionExcessiveDrinkingRate <= median(AlcoholConsumptionExcessiveDrinkingRate, na.rm=TRUE))
alc_result <- t.test(high_alc$OverdoseRatePer100k, low_alc$OverdoseRatePer100k)
alc_result
Welch Two Sample t-test
data: high_alc$OverdoseRatePer100k and low_alc$OverdoseRatePer100k
t = -0.7002, df = 46.886, p-value = 0.4873
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-10.207261 4.936672
sample estimates:
mean of x mean of y
32.05609 34.69139
high_homeless <- subset(OD.Merged, HomelessPopulationPer10kResidents
> median(HomelessPopulationPer10kResidents, na.rm=TRUE))
low_homeless <- subset(OD.Merged, HomelessPopulationPer10kResidents
<= median(HomelessPopulationPer10kResidents, na.rm=TRUE))
homeless_result <- t.test(high_homeless$OverdoseRatePer100k, low_homeless$OverdoseRatePer100k)
homeless_result
Welch Two Sample t-test
data: high_homeless$OverdoseRatePer100k and low_homeless$OverdoseRatePer100k
t = 0.093882, df = 46.33, p-value = 0.9256
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-7.268666 7.980009
sample estimates:
mean of x mean of y
33.58210 33.22642
high_depression <- subset(OD.Merged, `DepressionRatesByStateAge-AdjustedDepressionRate`
> median(`DepressionRatesByStateAge-AdjustedDepressionRate`, na.rm=TRUE))
low_depression <- subset(OD.Merged, `DepressionRatesByStateAge-AdjustedDepressionRate`
<= median(`DepressionRatesByStateAge-AdjustedDepressionRate`, na.rm=TRUE))
depression_result <- t.test(high_depression$OverdoseRatePer100k, low_depression$OverdoseRatePer100k)
depression_result
Welch Two Sample t-test
data: high_depression$OverdoseRatePer100k and low_depression$OverdoseRatePer100k
t = 1.9517, df = 40.192, p-value = 0.05796
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.2575832 14.8177839
sample estimates:
mean of x mean of y
37.41211 30.13201
Shapiro Test - OD Rate
shapiro_test <- shapiro.test(OD.Merged$OverdoseRatePer100k)
print(shapiro_test)
Shapiro-Wilk normality test
data: OD.Merged$OverdoseRatePer100k
W = 0.97285, p-value = 0.3132
Levene Test
leveneTest(OverdoseRatePer100k ~ factor(AlcoholConsumptionExcessiveDrinkingRate > median(AlcoholConsumptionExcessiveDrinkingRate)), data = OD.Merged)Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 1 0.2519 0.6181
47
leveneTest(OverdoseRatePer100k ~ factor(HomelessPopulationPer10kResidents > median(HomelessPopulationPer10kResidents)), data = OD.Merged)Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 1 0.5253 0.4722
47
leveneTest(OverdoseRatePer100k ~ factor(`DepressionRatesByStateAge-AdjustedDepressionRate` > median(`DepressionRatesByStateAge-AdjustedDepressionRate`)), data = OD.Merged)Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 1 0.5639 0.4564
47