# Establish main directory where data is stored
getwd()
## [1] "/cloud/project/hospital"
# Establish specific directory where data is stored
list.files( "/cloud/project")
## [1] "Elective) #and ward assignment.R" "hospital"
## [3] "project.Rproj"
list.files( "/cloud/project/hospital")
## [1] "Admissions.csv"
## [2] "Analysis"
## [3] "Analysis Report Hospital Admissions.Rmd"
## [4] "Analysis-Report-Hospital-Admissions.html"
## [5] "Analysis-Report-Hospital-Admissions.pdf"
## [6] "Analysis-Report-Hospital-Admissions.Rmd"
## [7] "Appointments.csv"
## [8] "Billing.csv"
## [9] "Doctors.csv"
## [10] "Medications.csv"
## [11] "Patients.csv"
## [12] "rsconnect"
# Load the admission dataset
admissions <- read.csv("/cloud/project/hospital/Admissions.csv")
head(admissions)
## AdmissionID PatientID AdmissionDate AdmissionType ReasonForAdmission
## 1 A001 P033 2023-12-08 Elective Fever
## 2 A002 P025 2024-08-23 Emergency Injury
## 3 A003 P006 2024-11-01 Elective Heart Condition
## 4 A004 P018 2024-01-27 Emergency Fever
## 5 A005 P026 2024-06-11 Emergency Fever
## 6 A006 P016 2024-11-27 Elective Fracture
## WardAssigned DischargeDate
## 1 ICU 2024-07-05
## 2 General Ward 2024-02-27
## 3 Surgical Ward 2024-04-17
## 4 Pediatrics Ward 2024-09-23
## 5 General Ward 2024-04-07
## 6 ICU 2024-03-23
# Load the appointments dataset
appointments <- read.csv("/cloud/project/hospital/Appointments.csv")
head(appointments)
## AppointmentID PatientID DoctorID AppointmentDate ReasonForVisit
## 1 App001 P016 D04 2024-10-11 Test Results
## 2 App002 P069 D29 2024-05-10 Test Results
## 3 App003 P003 D17 2024-06-25 Test Results
## 4 App004 P097 D05 2024-03-26 Test Results
## 5 App005 P086 D13 2024-09-06 Test Results
## 6 App006 P089 D19 2023-12-10 Test Results
# Load the billing dataset
billing <- read.csv("/cloud/project/hospital/Billing.csv")
head(billing)
## BillingID PatientID ProcedureID BillingDate Amount PaymentStatus
## 1 B001 P039 Proc01 2024-02-22 9027.37 Paid
## 2 B002 P057 Proc38 2024-06-19 397.23 Pending
## 3 B003 P079 Proc37 2024-06-02 7766.01 Insurance Pending
## 4 B004 P099 Proc43 2024-05-22 7064.53 Pending
## 5 B005 P041 Proc49 2024-05-04 842.76 Insurance Pending
## 6 B006 P058 Proc45 2024-08-02 826.28 Paid
# Load the doctors dataset
doctors <- read.csv("/cloud/project/hospital/Doctors.csv")
head(doctors)
## DoctorID Name Specialty
## 1 D01 Christopher Gray Oncology
## 2 D02 Jaime Thomas Oncology
## 3 D03 Mrs. Carrie Blankenship MD Oncology
## 4 D04 Shannon Clements Oncology
## 5 D05 Jill Lam General Medicine
## 6 D06 Renee Wood Orthopedics
# Load the medications dataset
medications <- read.csv("/cloud/project/hospital/Medications.csv")
head(medications)
## MedicationID PatientID MedicationName Dosage StartDate EndDate
## 1 M001 P085 Aspirin 2x/day 2024-07-04 2024-11-06
## 2 M002 P027 Aspirin 2x/day 2024-10-07 2024-04-23
## 3 M003 P039 Paracetamol 2x/day 2024-05-02 2024-08-29
## 4 M004 P023 Aspirin 1x/day 2024-09-08 2024-01-21
## 5 M005 P017 Metformin 3x/day 2024-10-23 2024-07-25
## 6 M006 P042 Paracetamol 1x/day 2024-02-13 2024-10-15
## PrescribingDoctorID
## 1 D01
## 2 D20
## 3 D08
## 4 D09
## 5 D24
## 6 D28
# Load the patients dataset
patients <- read.csv("/cloud/project/hospital/Patients.csv")
head(patients)
## PatientID Name Age Gender Contact
## 1 P001 Kevin Butler 62 Male 001-352-915-0814x537
## 2 P002 Rick Smith 83 Female +1-717-843-7436x3040
## 3 P003 Daniel Smith 42 Male 001-505-066-4818x120
## 4 P004 Robert Cuevas 63 Female 687.667.4769
## 5 P005 Lisa Mills 68 Male 294.590.7702x658
## 6 P006 Patricia Jackson 61 Male +1-648-439-9962x921
## Address
## 1 199 Margaret Point Apt. 383\nNorth Marissa, NY 82891
## 2 0451 John Plain Apt. 890\nAnthonyport, AK 05046
## 3 789 Deborah Extension\nVincentchester, TX 10946
## 4 5657 Angela Island Apt. 503\nSouth Michelefurt, OK 08458
## 5 9843 Amy Dam Apt. 914\nGregoryshire, IA 54978
## 6 2771 Tracy Roads\nSouth Julieport, ME 14950
# load tidyverse
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# load dplyr
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(dplyr)
# load ggplot2
library(ggplot2)
#What are the top reasons for patient admissions?
top_reasons <- admissions %>%
group_by (ReasonForAdmission)%>%
summarise(admission_count=n(),.groups='drop')%>%
arrange(desc(admission_count))
# print results
print(top_reasons)
## # A tibble: 5 × 2
## ReasonForAdmission admission_count
## <chr> <int>
## 1 Fever 48
## 2 Injury 42
## 3 Diabetes 40
## 4 Heart Condition 40
## 5 Fracture 30
#visualize output
library(ggplot2)
ggplot(data=top_reasons,mapping=aes(x=ReasonForAdmission,y=admission_count,fill=ReasonForAdmission))+
geom_col()+labs(title='Top reasons for admission')+
theme(axis.text.x = element_text(angle=45,hjust=1))

#Which ward has the highest patient load?
popular_ward <- admissions %>%
group_by(WardAssigned)%>%
summarise(
patient_count=n()
,.groups='drop')%>%
arrange(desc(patient_count))
print(popular_ward)
## # A tibble: 4 × 2
## WardAssigned patient_count
## <chr> <int>
## 1 General Ward 60
## 2 Pediatrics Ward 55
## 3 Surgical Ward 45
## 4 ICU 40
# visualize output
ggplot(data=popular_ward,mapping=aes(x=WardAssigned,y=patient_count,fill=WardAssigned))+
geom_col() + labs(title='Most popular ward by count')

#What is the most common admission type?
popular_admission <- admissions %>%
group_by(AdmissionType)%>%
summarise(
patient_count=n()
,.groups='drop')%>%
arrange(desc(patient_count))
print(popular_admission)
## # A tibble: 2 × 2
## AdmissionType patient_count
## <chr> <int>
## 1 Elective 102
## 2 Emergency 98
# visualize output
ggplot(data=popular_admission,mapping=aes(x=AdmissionType,y=patient_count,fill=AdmissionType))+
geom_col() + labs(title='Most common admission type')

#Which doctors handle the most appointments?
# join doctors and appointments tables
doctors_appointments <- left_join(doctors,appointments,by='DoctorID')
# group appointments by doctors
busy_doctors <- doctors_appointments %>%
group_by (Name)%>%
summarise(appointment_count=n(),.groups='drop')%>%
arrange(desc(appointment_count))%>%
slice(1:5)
# print resutls
print(busy_doctors)
## # A tibble: 5 × 2
## Name appointment_count
## <chr> <int>
## 1 Douglas Warren 8
## 2 Renee Wood 8
## 3 Ashley Schultz 7
## 4 Barbara Lewis 7
## 5 Michael Gonzalez 7
# visualise output
ggplot(data=busy_doctors,mapping=aes(x=Name,y=appointment_count,fill=Name))+
geom_col()+labs(title='Busiest doctors by appointment')+
theme(axis.text.x = element_text(angle=45,hjust=1))

#What is the trend in patient admissions over the past year, and how does it vary by admission type?
#Ensure date is in date format
admissions$AdmissionDate <- as.Date(admissions$AdmissionDate)
#Extract year from date
library(lubridate)
admissions$AdmissionYear <- year(admissions$DischargeDate)
#Group admissions by year
admission_trend <- admissions %>%
group_by(AdmissionType,AdmissionYear)%>%
summarise(Admission_Count=n(),.groups='drop')%>%
arrange(AdmissionYear)
#Print results
print(admission_trend)
## # A tibble: 4 × 3
## AdmissionType AdmissionYear Admission_Count
## <chr> <dbl> <int>
## 1 Elective 2023 9
## 2 Emergency 2023 9
## 3 Elective 2024 93
## 4 Emergency 2024 89
#Visualize output
ggplot(data=admission_trend,mapping=aes(x=AdmissionYear, y=Admission_Count,fill=AdmissionType))+
geom_col()+labs(title='Trend in Patient admissions over the years')+
theme_minimal()

#What percentage of bills remain unpaid, and how do payment statuses differ between self-paying and insured patients?
payment_status <- billing %>%
group_by(PaymentStatus)%>%
summarise(Payment_Count=n(),.groups='drop')%>%
mutate(Unpaid_Percentage=Payment_Count/sum(Payment_Count)*100)
#print results
print(payment_status)
## # A tibble: 3 × 3
## PaymentStatus Payment_Count Unpaid_Percentage
## <chr> <int> <dbl>
## 1 Insurance Pending 96 32
## 2 Paid 98 32.7
## 3 Pending 106 35.3
#visualize output
ggplot(data=payment_status,mapping=aes(x=PaymentStatus,y=Payment_Count,fill=PaymentStatus))+
geom_col()+labs(title='Payment status by payment count')

#What are the most prescribed medications, and which doctors prescribe them most frequently?
# Join medications and appointments tables
medications_appointments <- left_join(medications, appointments, by = 'PatientID')
## Warning in left_join(medications, appointments, by = "PatientID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 42 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Join medications_appointments table to doctors table
medications_appointments_doctors <- left_join(medications_appointments, doctors, by = 'DoctorID')
# Establish most prescribed medications and count how many times each doctor prescribes them
popular_medications <- medications_appointments_doctors %>%
filter(!is.na(Name))%>%
group_by(MedicationName, Name) %>%
summarise(Medication_Count = n(), .groups = 'drop') %>%
arrange(desc(Medication_Count))
# slice to show top 5 prescribed medications
top_5_medications <- popular_medications %>%
slice_head(n=5)
# Print results
print(top_5_medications)
## # A tibble: 5 × 3
## MedicationName Name Medication_Count
## <chr> <chr> <int>
## 1 Aspirin David Rodriguez 15
## 2 Aspirin Lori Jimenez 13
## 3 Ciprofloxacin Lori Jimenez 11
## 4 Ibuprofen Michael Gonzalez 11
## 5 Aspirin Tiffany Terry 10
## Visualize output
ggplot(data=top_5_medications,mapping=aes(x=Name,y=Medication_Count,fill=MedicationName))+
geom_col()+
labs(title='Top 5 medications by Doctor')+
theme(axis.text.x = element_text(angle=45,hjust=1))

#What are the most prescribed medications, and which doctors prescribe them most frequently?
# Join medications and appointments tables
medications_appointments <- left_join(medications, appointments, by = 'PatientID')
## Warning in left_join(medications, appointments, by = "PatientID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 42 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Join medications_appointments table to doctors table
medications_appointments_doctors <- left_join(medications_appointments, doctors, by = 'DoctorID')
# Establish most prescribed medications and count how many times each doctor prescribes them
popular_medications <- medications_appointments_doctors %>%
filter(!is.na(Name))%>%
group_by(MedicationName, Name) %>%
summarise(Medication_Count = n(), .groups = 'drop') %>%
arrange(desc(Medication_Count))
# slice to show top 5 prescribed medications
top_5_medications <- popular_medications %>%
slice_head(n=5)
# Print results
print(top_5_medications)
## # A tibble: 5 × 3
## MedicationName Name Medication_Count
## <chr> <chr> <int>
## 1 Aspirin David Rodriguez 15
## 2 Aspirin Lori Jimenez 13
## 3 Ciprofloxacin Lori Jimenez 11
## 4 Ibuprofen Michael Gonzalez 11
## 5 Aspirin Tiffany Terry 10
## Visualize output
ggplot(data=top_5_medications,mapping=aes(x=Name,y=Medication_Count,fill=MedicationName))+
geom_col()+
labs(title='Top 5 medications by Doctor')+
theme(axis.text.x = element_text(angle=45,hjust=1))

#What is the average revenue generated by ward and procedure type?
# join the billing and admissions tables
billing_admissions <- left_join(billing,admissions,by='PatientID')
## Warning in left_join(billing, admissions, by = "PatientID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 71 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# calculate the average
derived_revenue <- billing_admissions %>%
filter(!is.na(ReasonForAdmission))%>%
group_by(WardAssigned,ReasonForAdmission)%>%
summarise(Avg_rev=mean(Amount,na.rm=TRUE),.groups='drop')
#print results
print(derived_revenue)
## # A tibble: 20 × 3
## WardAssigned ReasonForAdmission Avg_rev
## <chr> <chr> <dbl>
## 1 General Ward Diabetes 5512.
## 2 General Ward Fever 5575.
## 3 General Ward Fracture 5441.
## 4 General Ward Heart Condition 5151.
## 5 General Ward Injury 5423.
## 6 ICU Diabetes 5822.
## 7 ICU Fever 4825.
## 8 ICU Fracture 4695.
## 9 ICU Heart Condition 5096.
## 10 ICU Injury 5600.
## 11 Pediatrics Ward Diabetes 4981.
## 12 Pediatrics Ward Fever 5214.
## 13 Pediatrics Ward Fracture 5682.
## 14 Pediatrics Ward Heart Condition 4972.
## 15 Pediatrics Ward Injury 5198.
## 16 Surgical Ward Diabetes 5382.
## 17 Surgical Ward Fever 5307.
## 18 Surgical Ward Fracture 4684.
## 19 Surgical Ward Heart Condition 5268.
## 20 Surgical Ward Injury 5477.
#visualize output
ggplot(data=derived_revenue,mapping=aes(x=ReasonForAdmission,y=Avg_rev,fill=WardAssigned))+
geom_col(color='black',position='dodge') + labs(title='Average Rev generated by ward and Procedure')+
theme(axis.text.x=element_text(angle=45,hjust=1))

#What are the peak days for elective and emergency admissions?
# ensure that the date column is in date format
admissions$AdmissionDate <- as.Date(admissions$AdmissionDate)
# extract date from date
admissions$day <- wday(admissions$AdmissionDate, label = TRUE, abbr=TRUE)
# establish peak days
peak_days <- admissions %>%
group_by(AdmissionType,day)%>%
summarise(peak_count=n(),.groups='drop')
# establish the top 5 peak dates
top_days <- peak_days %>%
group_by(AdmissionType)%>%
slice_head(n=10)
# print results
print(top_days)
## # A tibble: 14 × 3
## # Groups: AdmissionType [2]
## AdmissionType day peak_count
## <chr> <ord> <int>
## 1 Elective Sun 14
## 2 Elective Mon 18
## 3 Elective Tue 12
## 4 Elective Wed 14
## 5 Elective Thu 16
## 6 Elective Fri 13
## 7 Elective Sat 15
## 8 Emergency Sun 13
## 9 Emergency Mon 10
## 10 Emergency Tue 14
## 11 Emergency Wed 11
## 12 Emergency Thu 14
## 13 Emergency Fri 18
## 14 Emergency Sat 18
# visualize output
ggplot(data=top_days, mapping=aes(x=day,y=peak_count,fill=AdmissionType))+
geom_col(position='dodge')+labs(title='Peak days for admission types')

#How does a patient’s age affect the total amount billed for their
#treatment?
# join the patients table and the billing table
patients_billings <- right_join(patients,billing,by='PatientID')
# create a linear regression model
model1 <- lm(Amount~Age,data=patients_billings)
print(model1) #Positive Relationship: The slope indicates that as the patient's age increases,
##
## Call:
## lm(formula = Amount ~ Age, data = patients_billings)
##
## Coefficients:
## (Intercept) Age
## 4843.578 8.045
#the billed amount tends to increase by (8.045)
#Does the number of medications prescribed during a hospital stay impact the total bill?
# Join the billing and medications tables by PatientID
billing_medications <- left_join(billing, medications, by = 'PatientID')
## Warning in left_join(billing, medications, by = "PatientID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 127 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Count the number of medications per patient
medicine_count_per_patient <- billing_medications %>%
filter(!is.na(MedicationName)) %>%
group_by(PatientID) %>%
summarise(medicine_count = n(), .groups = 'drop')
# Join the medicine_count_per_patient data with the billing data (to get total billed amount)
billing_data_with_meds <- left_join(billing, medicine_count_per_patient, by = 'PatientID')
# Build the linear regression model to predict the total amount based on medicine count
model2 <- lm(Amount ~ medicine_count, data = billing_data_with_meds)
# Print the results of the regression model
summary(model2)
##
## Call:
## lm(formula = Amount ~ medicine_count, data = billing_data_with_meds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5271.0 -2389.5 -77.2 2268.6 4964.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4889.03 312.29 15.655 <2e-16 ***
## medicine_count 24.52 16.85 1.455 0.147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2889 on 290 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.007244, Adjusted R-squared: 0.003821
## F-statistic: 2.116 on 1 and 290 DF, p-value: 0.1468
# Visualize output
ggplot(data=model2,mapping=aes(x=medicine_count,y=Amount))+
geom_boxplot()+labs(title='Number of medicines and impact on billable amount')
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

#Can patient demographics (age, gender) or admission type predict
#whether a patient will have an emergency admission?
# join the admissions table and patient table
patients_admissions <- left_join(patients,admissions,by='PatientID')
# ensure that gender and admission type are factors
patients_admissions$AdmissionType <- as.factor(patients_admissions$AdmissionType)
patients_admissions$Gender <- as.factor(patients_admissions$Gender)
#calculate logistic regression
admission_model <- glm(AdmissionType~Age+Gender,data=patients_admissions,family=binomial)
print(admission_model)
##
## Call: glm(formula = AdmissionType ~ Age + Gender, family = binomial,
## data = patients_admissions)
##
## Coefficients:
## (Intercept) Age GenderMale
## 0.168874 -0.005538 0.210129
##
## Degrees of Freedom: 199 Total (i.e. Null); 197 Residual
## (17 observations deleted due to missingness)
## Null Deviance: 277.2
## Residual Deviance: 275.8 AIC: 281.8
#Does gender distributions differ significantly across admission types?
gender_admissions <- chisq.test(patients_admissions$Gender,patients_admissions$AdmissionType)
print(gender_admissions)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: patients_admissions$Gender and patients_admissions$AdmissionType
## X-squared = 0.26144, df = 1, p-value = 0.6091