library(plyr)
library(car)
library(ggplot2)
library(fastDummies)
library(tidyverse)
library(dplyr)
library(tidyr)
library(caTools)
mdata <- read.csv("C:/Users/nahur/Desktop/WGU/Final Projects/D208 Predictive Analysis/medical_clean.csv")
# Check for duplicates
sum(duplicated(mdata))
## [1] 0
# Check for missing values
colSums(is.na(mdata))
## CaseOrder Customer_id Interaction UID
## 0 0 0 0
## City State County Zip
## 0 0 0 0
## Lat Lng Population Area
## 0 0 0 0
## TimeZone Job Children Age
## 0 0 0 0
## Income Marital Gender ReAdmis
## 0 0 0 0
## VitD_levels Doc_visits Full_meals_eaten vitD_supp
## 0 0 0 0
## Soft_drink Initial_admin HighBlood Stroke
## 0 0 0 0
## Complication_risk Overweight Arthritis Diabetes
## 0 0 0 0
## Hyperlipidemia BackPain Anxiety Allergic_rhinitis
## 0 0 0 0
## Reflux_esophagitis Asthma Services Initial_days
## 0 0 0 0
## TotalCharge Additional_charges Item1 Item2
## 0 0 0 0
## Item3 Item4 Item5 Item6
## 0 0 0 0
## Item7 Item8
## 0 0
Can we predict the likeliness of Anxiety using the following variables: Age, Gender, VitD_levels, HighBlood, Stroke, Arthritis, Diabetes, Asthma, Services, TotalCharge
unique(mdata$Gender)
## [1] "Male" "Female" "Nonbinary"
unique(mdata$HighBlood)
## [1] "Yes" "No"
unique(mdata$Stroke)
## [1] "No" "Yes"
unique(mdata$Arthritis)
## [1] "Yes" "No"
unique(mdata$Diabetes)
## [1] "Yes" "No"
unique(mdata$Anxiety)
## [1] "Yes" "No"
unique(mdata$Asthma)
## [1] "Yes" "No"
unique(mdata$Services)
## [1] "Blood Work" "Intravenous" "CT Scan" "MRI"
mdata$Gender <- as.factor(mdata$Gender)
mdata$Services <- as.factor(mdata$Services)
dummy_cols(mdata, select_columns = c("Gender", "Services"),
remove_first_dummy = TRUE)
## CaseOrder Customer_id Interaction UID City State County Zip Lat Lng
## Population Area TimeZone Job Children Age Income Marital Gender ReAdmis
## VitD_levels Doc_visits Full_meals_eaten vitD_supp Soft_drink Initial_admin
## HighBlood Stroke Complication_risk Overweight Arthritis Diabetes
## Hyperlipidemia BackPain Anxiety Allergic_rhinitis Reflux_esophagitis
## Asthma Services Initial_days TotalCharge Additional_charges Item1 Item2
## Item3 Item4 Item5 Item6 Item7 Item8 Gender_Male Gender_Nonbinary
## Services_CT Scan Services_Intravenous Services_MRI
## [ reached 'max' / getOption("max.print") -- omitted 10000 rows ]
# HighBlood
HighBlood <- revalue(x = mdata$HighBlood, replace = c("No" = 0, "Yes" = 1))
mdata$HighBlood <- as.numeric(HighBlood)
unique(mdata$HighBlood)
## [1] 1 0
class(mdata$HighBlood)
## [1] "numeric"
# Stroke
Stroke <- revalue(x = mdata$Stroke, replace = c("No" = 0, "Yes" = 1))
mdata$Stroke <- as.numeric(Stroke)
unique(mdata$Stroke)
## [1] 0 1
class(mdata$Stroke)
## [1] "numeric"
# Arthritis
Arthritis <- revalue(x = mdata$Arthritis, replace = c("No" = 0, "Yes" = 1))
mdata$Arthritis <- as.numeric(Arthritis)
unique(mdata$Arthritis)
## [1] 1 0
class(mdata$Arthritis)
## [1] "numeric"
# Diabetes
Diabetes <- revalue(x = mdata$Diabetes, replace = c("No" = 0, "Yes" = 1))
mdata$Diabetes <- as.numeric(Diabetes)
unique(mdata$Diabetes)
## [1] 1 0
class(mdata$Diabetes)
## [1] "numeric"
# Anxiety
Anxiety <- revalue(x = mdata$Anxiety, replace = c("No" = 0, "Yes" = 1))
mdata$Anxiety <- as.numeric(Anxiety)
unique(mdata$Anxiety)
## [1] 1 0
class(mdata$Anxiety)
## [1] "numeric"
# Asthma
Asthma <- revalue(x = mdata$Asthma, replace = c("No" = 0, "Yes" = 1))
mdata$Asthma <- as.numeric(Asthma)
unique(mdata$Asthma)
## [1] 1 0
class(mdata$Asthma)
## [1] "numeric"
no_yes_label = c("No", "Yes")
# Anxiety vs Age
boxplot(mdata$Age ~ mdata$Anxiety, xlab = "Anxiety", ylab = "Age", names = no_yes_label, main = "Anxiety and Age")
# Anxiety vs Gender
ggplot(mdata, aes(x = Gender, fill = factor(Anxiety))) +
geom_bar() +
xlab("Gender") +
ylab("Count") +
scale_fill_manual(values = c("steelblue", "azure3"), name = "Anxiety", labels = c("No", "Yes")) +
ggtitle("Count of Anxiety by Gender")
# Anxiety vs VitD_levels
boxplot(mdata$VitD_levels ~ mdata$Anxiety, xlab = "Anxiety", ylab = "VitD_levels", names = no_yes_label, main = "Anxiety & VitD_levels")
# Anxiety vs HighBlood
ggplot(mdata, aes(x = factor(Anxiety), fill = factor(HighBlood))) +
geom_bar(position = "stack") +
scale_fill_discrete(name = "High Blood", labels = c("No", "Yes")) +
scale_x_discrete(labels = c("No", "Yes")) +
labs(title = "Anxiety and High Blood Pressure",
x = "Anxiety",
y = "Count")
# Anxiety vs Stroke
ggplot(mdata, aes(x = factor(Anxiety), fill = factor(Stroke))) +
geom_bar(position = "stack") +
scale_fill_discrete(name = "Stroke", labels = c("No", "Yes")) +
scale_x_discrete(labels = c("No", "Yes")) +
labs(title = "Anxiety and Stroke",
x = "Anxiety",
y = "Count")
# Anxiety vs Arthritis
ggplot(mdata, aes(x = factor(Anxiety), fill = factor(Arthritis))) +
geom_bar(position = "stack") +
scale_fill_discrete(name = "Arthritis", labels = c("No", "Yes")) +
scale_x_discrete(labels = c("No", "Yes")) +
labs(title = "Anxiety and Arthritis",
x = "Anxiety",
y = "Count")
# Anxiety vs Diabetes
ggplot(mdata, aes(x = factor(Anxiety), fill = factor(Diabetes))) +
geom_bar(position = "stack") +
scale_fill_discrete(name = "Diabetes", labels = c("No", "Yes")) +
scale_x_discrete(labels = c("No", "Yes")) +
labs(title = "Anxiety and Diabetes",
x = "Anxiety",
y = "Count")
# Anxiety vs Asthma
ggplot(mdata, aes(x = factor(Anxiety), fill = factor(Asthma))) +
geom_bar(position = "stack") +
scale_fill_discrete(name = "Asthma", labels = c("No", "Yes")) +
scale_x_discrete(labels = c("No", "Yes")) +
labs(title = "Anxiety and Asthma",
x = "Anxiety",
y = "Count")
# Anxiety vs Services
ggplot(mdata, aes(x = Services, fill = factor(Anxiety))) +
geom_bar() +
scale_fill_discrete(name = "Anxiety", labels = c("No", "Yes")) +
labs(title = "Services and Anxiety",
x = "Services",
y = "Count")
# Anxiety vs TotalCharge
boxplot(mdata$TotalCharge ~ mdata$Anxiety, xlab = "Anxiety", ylab = "TotalCharge", names = no_yes_label, main = "Anxiety vs TotalCharge")
initial_model <- glm(Anxiety ~ Age + Gender + VitD_levels + HighBlood + Stroke + Arthritis + Diabetes + Asthma + Services + TotalCharge, data = mdata, family = "binomial")
summary(initial_model)
##
## Call:
## glm(formula = Anxiety ~ Age + Gender + VitD_levels + HighBlood +
## Stroke + Arthritis + Diabetes + Asthma + Services + TotalCharge,
## family = "binomial", data = mdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9835 -0.8910 -0.8568 1.4694 1.6332
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.101e+00 2.110e-01 -5.220 1.79e-07 ***
## Age 5.629e-04 1.039e-03 0.542 0.58806
## GenderMale -3.232e-02 4.338e-02 -0.745 0.45620
## GenderNonbinary 4.478e-02 1.480e-01 0.303 0.76223
## VitD_levels 8.110e-03 1.063e-02 0.763 0.44558
## HighBlood 3.330e-02 4.356e-02 0.764 0.44458
## Stroke -7.338e-02 5.412e-02 -1.356 0.17515
## Arthritis 4.810e-02 4.464e-02 1.078 0.28122
## Diabetes -1.291e-02 4.815e-02 -0.268 0.78859
## Asthma 5.827e-02 4.709e-02 1.237 0.21600
## ServicesCT Scan -3.572e-02 6.834e-02 -0.523 0.60121
## ServicesIntravenous 2.508e-02 4.827e-02 0.520 0.60336
## ServicesMRI -7.641e-02 1.154e-01 -0.662 0.50798
## TotalCharge 3.043e-05 9.836e-06 3.094 0.00198 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12560 on 9999 degrees of freedom
## Residual deviance: 12542 on 9986 degrees of freedom
## AIC: 12570
##
## Number of Fisher Scoring iterations: 4
vif(initial_model)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.001180 1 1.000590
## Gender 1.002164 2 1.000541
## VitD_levels 1.001251 1 1.000625
## HighBlood 1.001060 1 1.000530
## Stroke 1.001159 1 1.000579
## Arthritis 1.001871 1 1.000935
## Diabetes 1.001925 1 1.000962
## Asthma 1.001026 1 1.000513
## Services 1.003540 3 1.000589
## TotalCharge 1.002487 1 1.001243
reduced_model <- step(initial_model, direction = "backward")
summary(reduced_model)
##
## Call:
## glm(formula = Anxiety ~ TotalCharge, family = "binomial", data = mdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9232 -0.8978 -0.8569 1.4743 1.5514
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.104e-01 5.687e-02 -16.009 < 2e-16 ***
## TotalCharge 3.063e-05 9.821e-06 3.119 0.00181 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12560 on 9999 degrees of freedom
## Residual deviance: 12550 on 9998 degrees of freedom
## AIC: 12554
##
## Number of Fisher Scoring iterations: 4
plot(reduced_model)
# Confidence Interval of reduced model
confint(reduced_model)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.022174e+00 -7.992405e-01
## TotalCharge 1.138483e-05 4.988378e-05
split <- sample.split(mdata$Anxiety, SplitRatio = 0.8)
train <- subset(mdata, split == 1)
test <- subset(mdata, split == 0)
nrow(train)
## [1] 8000
nrow(test)
## [1] 2000
result <- predict(reduced_model, newdata = test, type = "response")
view(result)
range(result) # Range of probabilities
## [1] 0.2996811 0.3462956
table(test$Anxiety, result > 0.31)
##
## FALSE TRUE
## 0 520 837
## 1 218 425