Predictive Analysis

library(plyr)
library(car)
library(ggplot2)
library(fastDummies)
library(tidyverse)
library(dplyr)
library(tidyr)
library(caTools)

mdata <- read.csv("C:/Users/nahur/Desktop/WGU/Final Projects/D208 Predictive Analysis/medical_clean.csv")

# Check for duplicates
sum(duplicated(mdata))

## [1] 0

# Check for missing values
colSums(is.na(mdata))

##          CaseOrder        Customer_id        Interaction                UID 
##                  0                  0                  0                  0 
##               City              State             County                Zip 
##                  0                  0                  0                  0 
##                Lat                Lng         Population               Area 
##                  0                  0                  0                  0 
##           TimeZone                Job           Children                Age 
##                  0                  0                  0                  0 
##             Income            Marital             Gender            ReAdmis 
##                  0                  0                  0                  0 
##        VitD_levels         Doc_visits   Full_meals_eaten          vitD_supp 
##                  0                  0                  0                  0 
##         Soft_drink      Initial_admin          HighBlood             Stroke 
##                  0                  0                  0                  0 
##  Complication_risk         Overweight          Arthritis           Diabetes 
##                  0                  0                  0                  0 
##     Hyperlipidemia           BackPain            Anxiety  Allergic_rhinitis 
##                  0                  0                  0                  0 
## Reflux_esophagitis             Asthma           Services       Initial_days 
##                  0                  0                  0                  0 
##        TotalCharge Additional_charges              Item1              Item2 
##                  0                  0                  0                  0 
##              Item3              Item4              Item5              Item6 
##                  0                  0                  0                  0 
##              Item7              Item8 
##                  0                  0

Research Question

Can we predict the likeliness of Anxiety using the following variables: Age, Gender, VitD_levels, HighBlood, Stroke, Arthritis, Diabetes, Asthma, Services, TotalCharge

unique(mdata$Gender)

## [1] "Male"      "Female"    "Nonbinary"

unique(mdata$HighBlood)

## [1] "Yes" "No"

unique(mdata$Stroke)

## [1] "No"  "Yes"

unique(mdata$Arthritis)

## [1] "Yes" "No"

unique(mdata$Diabetes)

## [1] "Yes" "No"

unique(mdata$Anxiety)

## [1] "Yes" "No"

unique(mdata$Asthma)

## [1] "Yes" "No"

unique(mdata$Services)

## [1] "Blood Work"  "Intravenous" "CT Scan"     "MRI"

mdata$Gender <- as.factor(mdata$Gender)
mdata$Services <- as.factor(mdata$Services)

One Hot Encoding + K-1

dummy_cols(mdata, select_columns = c("Gender", "Services"),
           remove_first_dummy = TRUE)

##      CaseOrder Customer_id Interaction UID City State County Zip Lat Lng
##      Population Area TimeZone Job Children Age Income Marital Gender ReAdmis
##      VitD_levels Doc_visits Full_meals_eaten vitD_supp Soft_drink Initial_admin
##      HighBlood Stroke Complication_risk Overweight Arthritis Diabetes
##      Hyperlipidemia BackPain Anxiety Allergic_rhinitis Reflux_esophagitis
##      Asthma Services Initial_days TotalCharge Additional_charges Item1 Item2
##      Item3 Item4 Item5 Item6 Item7 Item8 Gender_Male Gender_Nonbinary
##      Services_CT Scan Services_Intravenous Services_MRI
##  [ reached 'max' / getOption("max.print") -- omitted 10000 rows ]

Ordinal Encoding

# HighBlood
HighBlood <- revalue(x = mdata$HighBlood, replace = c("No" = 0, "Yes" = 1))
mdata$HighBlood <- as.numeric(HighBlood)
unique(mdata$HighBlood)

## [1] 1 0

class(mdata$HighBlood)

## [1] "numeric"

# Stroke
Stroke <- revalue(x = mdata$Stroke, replace = c("No" = 0, "Yes" = 1))
mdata$Stroke <- as.numeric(Stroke)
unique(mdata$Stroke)

## [1] 0 1

class(mdata$Stroke)

## [1] "numeric"

# Arthritis
Arthritis <- revalue(x = mdata$Arthritis, replace = c("No" = 0, "Yes" = 1))
mdata$Arthritis <- as.numeric(Arthritis)
unique(mdata$Arthritis)

## [1] 1 0

class(mdata$Arthritis)

## [1] "numeric"

# Diabetes
Diabetes <- revalue(x = mdata$Diabetes, replace = c("No" = 0, "Yes" = 1))
mdata$Diabetes <- as.numeric(Diabetes)
unique(mdata$Diabetes)

## [1] 1 0

class(mdata$Diabetes)

## [1] "numeric"

# Anxiety
Anxiety <- revalue(x = mdata$Anxiety, replace = c("No" = 0, "Yes" = 1))
mdata$Anxiety <- as.numeric(Anxiety)
unique(mdata$Anxiety)

## [1] 1 0

class(mdata$Anxiety)

## [1] "numeric"

# Asthma
Asthma <- revalue(x = mdata$Asthma, replace = c("No" = 0, "Yes" = 1))
mdata$Asthma <- as.numeric(Asthma)
unique(mdata$Asthma)

## [1] 1 0

class(mdata$Asthma)

## [1] "numeric"

Bivariate Analysis

no_yes_label = c("No", "Yes")

# Anxiety vs Age
boxplot(mdata$Age ~ mdata$Anxiety, xlab = "Anxiety", ylab = "Age", names = no_yes_label, main = "Anxiety and Age")

# Anxiety vs Gender
ggplot(mdata, aes(x = Gender, fill = factor(Anxiety))) +
  geom_bar() +
  xlab("Gender") +
  ylab("Count") +
  scale_fill_manual(values = c("steelblue", "azure3"), name = "Anxiety", labels = c("No", "Yes")) +
  ggtitle("Count of Anxiety by Gender")

# Anxiety vs VitD_levels
boxplot(mdata$VitD_levels ~ mdata$Anxiety, xlab = "Anxiety", ylab = "VitD_levels", names = no_yes_label, main = "Anxiety & VitD_levels")

# Anxiety vs HighBlood
ggplot(mdata, aes(x = factor(Anxiety), fill = factor(HighBlood))) +
  geom_bar(position = "stack") +
  scale_fill_discrete(name = "High Blood", labels = c("No", "Yes")) +
  scale_x_discrete(labels = c("No", "Yes")) +
  labs(title = "Anxiety and High Blood Pressure",
       x = "Anxiety",
       y = "Count")

# Anxiety vs Stroke
ggplot(mdata, aes(x = factor(Anxiety), fill = factor(Stroke))) +
  geom_bar(position = "stack") +
  scale_fill_discrete(name = "Stroke", labels = c("No", "Yes")) +
  scale_x_discrete(labels = c("No", "Yes")) +
  labs(title = "Anxiety and Stroke",
       x = "Anxiety",
       y = "Count")

# Anxiety vs Arthritis
ggplot(mdata, aes(x = factor(Anxiety), fill = factor(Arthritis))) +
  geom_bar(position = "stack") +
  scale_fill_discrete(name = "Arthritis", labels = c("No", "Yes")) +
  scale_x_discrete(labels = c("No", "Yes")) +
  labs(title = "Anxiety and Arthritis",
       x = "Anxiety",
       y = "Count")

# Anxiety vs Diabetes
ggplot(mdata, aes(x = factor(Anxiety), fill = factor(Diabetes))) +
  geom_bar(position = "stack") +
  scale_fill_discrete(name = "Diabetes", labels = c("No", "Yes")) +
  scale_x_discrete(labels = c("No", "Yes")) +
  labs(title = "Anxiety and Diabetes",
       x = "Anxiety",
       y = "Count")

# Anxiety vs Asthma
ggplot(mdata, aes(x = factor(Anxiety), fill = factor(Asthma))) +
  geom_bar(position = "stack") +
  scale_fill_discrete(name = "Asthma", labels = c("No", "Yes")) +
  scale_x_discrete(labels = c("No", "Yes")) +
  labs(title = "Anxiety and Asthma",
       x = "Anxiety",
       y = "Count")

# Anxiety vs Services
ggplot(mdata, aes(x = Services, fill = factor(Anxiety))) +
  geom_bar() +
  scale_fill_discrete(name = "Anxiety", labels = c("No", "Yes")) +
  labs(title = "Services and Anxiety",
       x = "Services",
       y = "Count")

# Anxiety vs TotalCharge
boxplot(mdata$TotalCharge ~ mdata$Anxiety, xlab = "Anxiety", ylab = "TotalCharge", names = no_yes_label, main = "Anxiety vs TotalCharge")

Logistic Regression

initial_model <- glm(Anxiety ~ Age + Gender + VitD_levels + HighBlood + Stroke + Arthritis + Diabetes + Asthma + Services + TotalCharge, data = mdata, family = "binomial")
summary(initial_model)

## 
## Call:
## glm(formula = Anxiety ~ Age + Gender + VitD_levels + HighBlood + 
##     Stroke + Arthritis + Diabetes + Asthma + Services + TotalCharge, 
##     family = "binomial", data = mdata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9835  -0.8910  -0.8568   1.4694   1.6332  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.101e+00  2.110e-01  -5.220 1.79e-07 ***
## Age                  5.629e-04  1.039e-03   0.542  0.58806    
## GenderMale          -3.232e-02  4.338e-02  -0.745  0.45620    
## GenderNonbinary      4.478e-02  1.480e-01   0.303  0.76223    
## VitD_levels          8.110e-03  1.063e-02   0.763  0.44558    
## HighBlood            3.330e-02  4.356e-02   0.764  0.44458    
## Stroke              -7.338e-02  5.412e-02  -1.356  0.17515    
## Arthritis            4.810e-02  4.464e-02   1.078  0.28122    
## Diabetes            -1.291e-02  4.815e-02  -0.268  0.78859    
## Asthma               5.827e-02  4.709e-02   1.237  0.21600    
## ServicesCT Scan     -3.572e-02  6.834e-02  -0.523  0.60121    
## ServicesIntravenous  2.508e-02  4.827e-02   0.520  0.60336    
## ServicesMRI         -7.641e-02  1.154e-01  -0.662  0.50798    
## TotalCharge          3.043e-05  9.836e-06   3.094  0.00198 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12560  on 9999  degrees of freedom
## Residual deviance: 12542  on 9986  degrees of freedom
## AIC: 12570
## 
## Number of Fisher Scoring iterations: 4

vif(initial_model)

##                 GVIF Df GVIF^(1/(2*Df))
## Age         1.001180  1        1.000590
## Gender      1.002164  2        1.000541
## VitD_levels 1.001251  1        1.000625
## HighBlood   1.001060  1        1.000530
## Stroke      1.001159  1        1.000579
## Arthritis   1.001871  1        1.000935
## Diabetes    1.001925  1        1.000962
## Asthma      1.001026  1        1.000513
## Services    1.003540  3        1.000589
## TotalCharge 1.002487  1        1.001243

reduced_model <- step(initial_model, direction = "backward")

summary(reduced_model)

## 
## Call:
## glm(formula = Anxiety ~ TotalCharge, family = "binomial", data = mdata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9232  -0.8978  -0.8569   1.4743   1.5514  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.104e-01  5.687e-02 -16.009  < 2e-16 ***
## TotalCharge  3.063e-05  9.821e-06   3.119  0.00181 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12560  on 9999  degrees of freedom
## Residual deviance: 12550  on 9998  degrees of freedom
## AIC: 12554
## 
## Number of Fisher Scoring iterations: 4

plot(reduced_model)

# Confidence Interval of reduced model 
confint(reduced_model)

## Waiting for profiling to be done...

##                     2.5 %        97.5 %
## (Intercept) -1.022174e+00 -7.992405e-01
## TotalCharge  1.138483e-05  4.988378e-05

Split into Testing and Training

split <- sample.split(mdata$Anxiety, SplitRatio = 0.8)
train <- subset(mdata, split == 1)
test <- subset(mdata, split == 0)

nrow(train)

## [1] 8000

nrow(test)

## [1] 2000

Prediction Results

result <- predict(reduced_model, newdata = test, type = "response")

view(result)
range(result) # Range of probabilities

## [1] 0.2996811 0.3462956

table(test$Anxiety, result > 0.31)

##    
##     FALSE TRUE
##   0   520  837
##   1   218  425

Predictive Analysis

Nicholas Hurd

Research Question

One Hot Encoding + K-1

Ordinal Encoding

Bivariate Analysis

Logistic Regression

Split into Testing and Training

Prediction Results