library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.2
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
#> ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
#> ✔ tibble 3.1.8 ✔ dplyr 1.0.10
#> ✔ tidyr 1.2.1 ✔ stringr 1.4.1
#> ✔ readr 2.1.3 ✔ forcats 0.5.2
#> Warning: package 'ggplot2' was built under R version 4.2.2
#> Warning: package 'stringr' was built under R version 4.2.2
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
patient_records_test <- read.csv("https://raw.githubusercontent.com/karmaggyatso/CUNY_SPS/main/Github_data607/tidyverse_assignment/Train-1617360447408-1660719685476.csv")
class(patient_records_test)
#> [1] "data.frame"
names(patient_records_test)
#> [1] "index" "encounter_id" "patient_id"
#> [4] "race" "gender" "age"
#> [7] "weight" "time_in_hospital" "medical_specialty"
#> [10] "num_lab_procedures" "num_procedures" "num_medications"
#> [13] "number_outpatient" "number_emergency" "number_inpatient"
#> [16] "diag_1" "diag_2" "diag_3"
#> [19] "diag_4" "diag_5" "number_diagnoses"
#> [22] "X1" "X2" "X3"
#> [25] "X4" "X5" "X6"
#> [28] "X7" "X8" "X9"
#> [31] "X10" "X11" "X12"
#> [34] "X13" "X14" "X15"
#> [37] "X16" "X17" "X18"
#> [40] "X19" "X20" "X21"
#> [43] "X22" "X23" "X24"
#> [46] "X25" "change" "diabetesMed"
#> [49] "readmitted"
What is the main factor of patient readmission in the hospital? Are the treatment and age group the reason for readmission? What are the chances of readmission based on medical treatment and age group?
Each case represents individual patient who went to hospital for treatment. There are 66,587 observations
dim(patient_records_test)
#> [1] 66587 49
Data source cited in MLA 9th edition + Vutukuri, Girish. “Hospital_Administration.” Hospital_Administration | Kaggle, www.kaggle.com/datasets/girishvutukuri/hospital-administration/code?resource=download. Accessed 30 Oct. 2022.
Data cleaning There is a data with medical_treatment as “?”. Since we are not sure what sort of medical_treatement it is, we are omitting the records in the new variable. We’re grouping the type of medical_treatement and counting how many patient has readmitted.
hospital_readmission_by_treatment <- patient_records_test %>%
filter(readmitted == 1) %>%
filter(medical_specialty != "?") %>%
group_by(medical_specialty) %>%
count() %>%
arrange(desc(n))
hospital_readmission_by_treatment
#> # A tibble: 58 × 2
#> # Groups: medical_specialty [58]
#> medical_specialty n
#> <chr> <int>
#> 1 InternalMedicine 4183
#> 2 Emergency/Trauma 2503
#> 3 Family/GeneralPractice 2363
#> 4 Cardiology 1489
#> 5 Surgery-General 898
#> 6 Nephrology 605
#> 7 Radiologist 339
#> 8 Orthopedics 305
#> 9 Pulmonology 275
#> 10 Orthopedics-Reconstructive 245
#> # … with 48 more rows
A graphical representation of above code.
ggplot(hospital_readmission_by_treatment, aes(x = medical_specialty, y = n)) +
geom_bar(stat = "identity") +
theme(text = element_text(size = 6),element_line(size = 2)) +
coord_flip()
#> Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
#> ℹ Please use the `linewidth` argument instead.
We also have to consider the age as an factor. So, I am grouping the age and counting total readmission.
hospital_readmission_by_age <- patient_records_test %>%
filter(readmitted == 1) %>%
group_by(age) %>%
count() %>%
arrange(desc(n))
hospital_readmission_by_age
#> # A tibble: 10 × 2
#> # Groups: age [10]
#> age n
#> <chr> <int>
#> 1 [70-80) 8179
#> 2 [60-70) 6860
#> 3 [80-90) 5349
#> 4 [50-60) 5129
#> 5 [40-50) 2814
#> 6 [30-40) 1016
#> 7 [90-100) 697
#> 8 [20-30) 520
#> 9 [10-20) 178
#> 10 [0-10) 22
A graphical representation of above code.
ggplot(hospital_readmission_by_age, aes(x = age, y = n)) +
geom_bar(stat = "identity") +
labs(title= "Hospital readmission by age",
x="age_group",y="total count")+
coord_flip()
Those patients who had visited hospital for internalMedicine as a medical_treatment are most readmitted. Also the age group between 80-90 are also readmitted to the hospital. Further analysis can be done on age with most medical_treatment applied.
I have used the tidyverse,dplyr fucntions such as piping fuction, filter and group_by function.
Within this dataset one of the research questions was What are the chances of readmission based on medical treatment and age group? To solve this question we will utilize the forcats package which is a package meant to work with factors within datasets. We could utilize this function to then create a logistic regression of readmission chances.
This function allows us to see the frequency of factors within our data and thus it gives us insight into some of the stronger factors within the data
We see in this example that African Americans and Other are more prevelant to no readmission
#Revise the data to utilize all relevant data only
library(caret)
#> Warning: package 'caret' was built under R version 4.2.2
#> Loading required package: lattice
#>
#> Attaching package: 'caret'
#> The following object is masked from 'package:purrr':
#>
#> lift
cleandata <- patient_records_test%>%
select(-1,-2,-3,-7, -X8, -X18, -X19, -X25)%>%
mutate(readmitted= as.factor(readmitted))
data1<- cleandata%>%
filter(readmitted == 1)
data0<- cleandata%>%
filter(readmitted == 0)
ggplot(data1) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(race)))) +
coord_flip()
ggplot(data0) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(race)))) +
coord_flip()
We see in this example that overall less people are readmitted than admitted, however the 60-70 range shows us that a greater magnitude of itspopulation is not readmitted
ggplot(data1) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(age)))) +
coord_flip()
ggplot(data0) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(age)))) +
coord_flip()
In this example it is unclear fo the magnitude of thos who are readmitted vs not however we see mean are readmitted less. This could be due to other factors.
ggplot(data1) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(gender)))) +
coord_flip()
ggplot(data0) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(gender)))) +
coord_flip()
Medical specialty is a very wide range and unclear how each factor affects readmission
ggplot(data1) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(medical_specialty)))) +
coord_flip()
ggplot(data0) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(medical_specialty)))) +
coord_flip()
Due to the computational power it would need to predict everything, I will select various factors
testing <- cleandata
train_portion = .7
train_index <- createDataPartition(testing$readmitted,p = train_portion, list = FALSE,times = 1)
trainset <- testing[train_index,]
testset <- testing[-train_index,]
log_model <- train(readmitted ~ age + race + gender+ time_in_hospital + num_lab_procedures+ num_procedures+ num_medications+ number_diagnoses+ diabetesMed ,
data = trainset,
method="glm", family = "binomial")
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
train_preds <- predict(log_model,newdata = trainset)
test_preds <- predict(log_model,newdata = testset)
confusionMatrix(train_preds,trainset$readmitted)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 17368 12655
#> 1 7709 8880
#>
#> Accuracy : 0.5631
#> 95% CI : (0.5586, 0.5676)
#> No Information Rate : 0.538
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.1067
#>
#> Mcnemar's Test P-Value : < 2.2e-16
#>
#> Sensitivity : 0.6926
#> Specificity : 0.4124
#> Pos Pred Value : 0.5785
#> Neg Pred Value : 0.5353
#> Prevalence : 0.5380
#> Detection Rate : 0.3726
#> Detection Prevalence : 0.6441
#> Balanced Accuracy : 0.5525
#>
#> 'Positive' Class : 0
#>
While our predictive model was not the strongest due to computational power. Our key takeaway is that the utilization of the forcats package can help us better work with our factor variables as they could demonstrate importance within our datasets