# Establish main directory where data is stored
getwd()
## [1] "/cloud/project/hospital"
# Establish specific directory where data is stored
list.files( "/cloud/project")
## [1] "Elective) #and ward assignment.R" "hospital"                        
## [3] "project.Rproj"
list.files( "/cloud/project/hospital")
##  [1] "Admissions.csv"                          
##  [2] "Analysis"                                
##  [3] "Analysis Report Hospital Admissions.Rmd" 
##  [4] "Analysis-Report-Hospital-Admissions.html"
##  [5] "Analysis-Report-Hospital-Admissions.pdf" 
##  [6] "Analysis-Report-Hospital-Admissions.Rmd" 
##  [7] "Appointments.csv"                        
##  [8] "Billing.csv"                             
##  [9] "Doctors.csv"                             
## [10] "Medications.csv"                         
## [11] "Patients.csv"                            
## [12] "rsconnect"
# Load the admission dataset
admissions <- read.csv("/cloud/project/hospital/Admissions.csv")
head(admissions)
##   AdmissionID PatientID AdmissionDate AdmissionType ReasonForAdmission
## 1        A001      P033    2023-12-08      Elective              Fever
## 2        A002      P025    2024-08-23     Emergency             Injury
## 3        A003      P006    2024-11-01      Elective    Heart Condition
## 4        A004      P018    2024-01-27     Emergency              Fever
## 5        A005      P026    2024-06-11     Emergency              Fever
## 6        A006      P016    2024-11-27      Elective           Fracture
##      WardAssigned DischargeDate
## 1             ICU    2024-07-05
## 2    General Ward    2024-02-27
## 3   Surgical Ward    2024-04-17
## 4 Pediatrics Ward    2024-09-23
## 5    General Ward    2024-04-07
## 6             ICU    2024-03-23
# Load the appointments dataset
appointments <- read.csv("/cloud/project/hospital/Appointments.csv")
head(appointments)
##   AppointmentID PatientID DoctorID AppointmentDate ReasonForVisit
## 1        App001      P016      D04      2024-10-11   Test Results
## 2        App002      P069      D29      2024-05-10   Test Results
## 3        App003      P003      D17      2024-06-25   Test Results
## 4        App004      P097      D05      2024-03-26   Test Results
## 5        App005      P086      D13      2024-09-06   Test Results
## 6        App006      P089      D19      2023-12-10   Test Results
# Load the billing dataset
billing <- read.csv("/cloud/project/hospital/Billing.csv")
head(billing)
##   BillingID PatientID ProcedureID BillingDate  Amount     PaymentStatus
## 1      B001      P039      Proc01  2024-02-22 9027.37              Paid
## 2      B002      P057      Proc38  2024-06-19  397.23           Pending
## 3      B003      P079      Proc37  2024-06-02 7766.01 Insurance Pending
## 4      B004      P099      Proc43  2024-05-22 7064.53           Pending
## 5      B005      P041      Proc49  2024-05-04  842.76 Insurance Pending
## 6      B006      P058      Proc45  2024-08-02  826.28              Paid
# Load the doctors dataset
doctors <- read.csv("/cloud/project/hospital/Doctors.csv")
head(doctors)
##   DoctorID                       Name        Specialty
## 1      D01           Christopher Gray         Oncology
## 2      D02               Jaime Thomas         Oncology
## 3      D03 Mrs. Carrie Blankenship MD         Oncology
## 4      D04           Shannon Clements         Oncology
## 5      D05                   Jill Lam General Medicine
## 6      D06                 Renee Wood      Orthopedics
# Load the medications dataset
medications <- read.csv("/cloud/project/hospital/Medications.csv")
head(medications)
##   MedicationID PatientID MedicationName Dosage  StartDate    EndDate
## 1         M001      P085        Aspirin 2x/day 2024-07-04 2024-11-06
## 2         M002      P027        Aspirin 2x/day 2024-10-07 2024-04-23
## 3         M003      P039    Paracetamol 2x/day 2024-05-02 2024-08-29
## 4         M004      P023        Aspirin 1x/day 2024-09-08 2024-01-21
## 5         M005      P017      Metformin 3x/day 2024-10-23 2024-07-25
## 6         M006      P042    Paracetamol 1x/day 2024-02-13 2024-10-15
##   PrescribingDoctorID
## 1                 D01
## 2                 D20
## 3                 D08
## 4                 D09
## 5                 D24
## 6                 D28
# Load the patients dataset
patients <- read.csv("/cloud/project/hospital/Patients.csv")
head(patients)
##   PatientID             Name Age Gender              Contact
## 1      P001     Kevin Butler  62   Male 001-352-915-0814x537
## 2      P002       Rick Smith  83 Female +1-717-843-7436x3040
## 3      P003     Daniel Smith  42   Male 001-505-066-4818x120
## 4      P004    Robert Cuevas  63 Female         687.667.4769
## 5      P005       Lisa Mills  68   Male     294.590.7702x658
## 6      P006 Patricia Jackson  61   Male  +1-648-439-9962x921
##                                                    Address
## 1     199 Margaret Point Apt. 383\nNorth Marissa, NY 82891
## 2          0451 John Plain Apt. 890\nAnthonyport, AK 05046
## 3          789 Deborah Extension\nVincentchester, TX 10946
## 4 5657 Angela Island Apt. 503\nSouth Michelefurt, OK 08458
## 5            9843 Amy Dam Apt. 914\nGregoryshire, IA 54978
## 6              2771 Tracy Roads\nSouth Julieport, ME 14950
# load tidyverse
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# load dplyr
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(dplyr)

# load ggplot2
library(ggplot2)
#What are the top reasons for patient admissions?
top_reasons <- admissions %>%
  group_by (ReasonForAdmission)%>%
  summarise(admission_count=n(),.groups='drop')%>%
arrange(desc(admission_count))

# print results
print(top_reasons)
## # A tibble: 5 × 2
##   ReasonForAdmission admission_count
##   <chr>                        <int>
## 1 Fever                           48
## 2 Injury                          42
## 3 Diabetes                        40
## 4 Heart Condition                 40
## 5 Fracture                        30
#visualize output
library(ggplot2)
ggplot(data=top_reasons,mapping=aes(x=ReasonForAdmission,y=admission_count,fill=ReasonForAdmission))+
  geom_col()+labs(title='Top reasons for admission')+
  theme(axis.text.x = element_text(angle=45,hjust=1))

#Which ward has the highest patient load?
popular_ward <- admissions %>%
  group_by(WardAssigned)%>%
  summarise(
    patient_count=n()
      ,.groups='drop')%>%
  arrange(desc(patient_count))
print(popular_ward)
## # A tibble: 4 × 2
##   WardAssigned    patient_count
##   <chr>                   <int>
## 1 General Ward               60
## 2 Pediatrics Ward            55
## 3 Surgical Ward              45
## 4 ICU                        40
# visualize output
ggplot(data=popular_ward,mapping=aes(x=WardAssigned,y=patient_count,fill=WardAssigned))+
  geom_col() + labs(title='Most popular ward by count')

#What is the most common admission type?
popular_admission <- admissions %>%
  group_by(AdmissionType)%>%
  summarise(
    patient_count=n()
    ,.groups='drop')%>%
  arrange(desc(patient_count))
print(popular_admission)
## # A tibble: 2 × 2
##   AdmissionType patient_count
##   <chr>                 <int>
## 1 Elective                102
## 2 Emergency                98
# visualize output
ggplot(data=popular_admission,mapping=aes(x=AdmissionType,y=patient_count,fill=AdmissionType))+
  geom_col() + labs(title='Most common admission type')

#Which doctors handle the most appointments?

# join doctors and appointments tables
doctors_appointments <- left_join(doctors,appointments,by='DoctorID')

# group appointments by doctors
busy_doctors <- doctors_appointments %>%
  group_by (Name)%>%
  summarise(appointment_count=n(),.groups='drop')%>%
  arrange(desc(appointment_count))%>%
  slice(1:5)

# print resutls
print(busy_doctors)
## # A tibble: 5 × 2
##   Name             appointment_count
##   <chr>                        <int>
## 1 Douglas Warren                   8
## 2 Renee Wood                       8
## 3 Ashley Schultz                   7
## 4 Barbara Lewis                    7
## 5 Michael Gonzalez                 7
# visualise output
ggplot(data=busy_doctors,mapping=aes(x=Name,y=appointment_count,fill=Name))+
  geom_col()+labs(title='Busiest doctors by appointment')+
  theme(axis.text.x = element_text(angle=45,hjust=1))

#What is the trend in patient admissions over the past year, and how does it vary by admission type?

#Ensure date is in date format
admissions$AdmissionDate <- as.Date(admissions$AdmissionDate)

#Extract year from date
library(lubridate)
admissions$AdmissionYear <- year(admissions$DischargeDate)

#Group admissions by year
admission_trend <- admissions %>%
  group_by(AdmissionType,AdmissionYear)%>%
  summarise(Admission_Count=n(),.groups='drop')%>%
arrange(AdmissionYear)

#Print results
print(admission_trend)
## # A tibble: 4 × 3
##   AdmissionType AdmissionYear Admission_Count
##   <chr>                 <dbl>           <int>
## 1 Elective               2023               9
## 2 Emergency              2023               9
## 3 Elective               2024              93
## 4 Emergency              2024              89
#Visualize output
ggplot(data=admission_trend,mapping=aes(x=AdmissionYear, y=Admission_Count,fill=AdmissionType))+
  geom_col()+labs(title='Trend in Patient admissions over the years')+
  theme_minimal()

#What percentage of bills remain unpaid, and how do payment statuses differ between self-paying and insured patients?

payment_status <- billing %>%
    group_by(PaymentStatus)%>%
  summarise(Payment_Count=n(),.groups='drop')%>%
  mutate(Unpaid_Percentage=Payment_Count/sum(Payment_Count)*100)

#print results
print(payment_status)
## # A tibble: 3 × 3
##   PaymentStatus     Payment_Count Unpaid_Percentage
##   <chr>                     <int>             <dbl>
## 1 Insurance Pending            96              32  
## 2 Paid                         98              32.7
## 3 Pending                     106              35.3
#visualize output
ggplot(data=payment_status,mapping=aes(x=PaymentStatus,y=Payment_Count,fill=PaymentStatus))+
  geom_col()+labs(title='Payment status by payment count')

#What are the most prescribed medications, and which doctors prescribe them most frequently?

# Join medications and appointments tables
medications_appointments <- left_join(medications, appointments, by = 'PatientID')
## Warning in left_join(medications, appointments, by = "PatientID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 42 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Join medications_appointments table to doctors table
medications_appointments_doctors <- left_join(medications_appointments, doctors, by = 'DoctorID')

# Establish most prescribed medications and count how many times each doctor prescribes them
popular_medications <- medications_appointments_doctors %>%
  filter(!is.na(Name))%>%
  group_by(MedicationName, Name) %>%
  summarise(Medication_Count = n(), .groups = 'drop') %>%
  arrange(desc(Medication_Count))

# slice to show top 5 prescribed medications
top_5_medications <- popular_medications %>%
  slice_head(n=5)

# Print results
print(top_5_medications)
## # A tibble: 5 × 3
##   MedicationName Name             Medication_Count
##   <chr>          <chr>                       <int>
## 1 Aspirin        David Rodriguez                15
## 2 Aspirin        Lori Jimenez                   13
## 3 Ciprofloxacin  Lori Jimenez                   11
## 4 Ibuprofen      Michael Gonzalez               11
## 5 Aspirin        Tiffany Terry                  10
## Visualize output
ggplot(data=top_5_medications,mapping=aes(x=Name,y=Medication_Count,fill=MedicationName))+
  geom_col()+
  labs(title='Top 5 medications by Doctor')+
  theme(axis.text.x = element_text(angle=45,hjust=1))

#What are the most prescribed medications, and which doctors prescribe them most frequently?

# Join medications and appointments tables
medications_appointments <- left_join(medications, appointments, by = 'PatientID')
## Warning in left_join(medications, appointments, by = "PatientID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 42 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Join medications_appointments table to doctors table
medications_appointments_doctors <- left_join(medications_appointments, doctors, by = 'DoctorID')

# Establish most prescribed medications and count how many times each doctor prescribes them
popular_medications <- medications_appointments_doctors %>%
  filter(!is.na(Name))%>%
  group_by(MedicationName, Name) %>%
  summarise(Medication_Count = n(), .groups = 'drop') %>%
  arrange(desc(Medication_Count))

# slice to show top 5 prescribed medications
top_5_medications <- popular_medications %>%
  slice_head(n=5)

# Print results
print(top_5_medications)
## # A tibble: 5 × 3
##   MedicationName Name             Medication_Count
##   <chr>          <chr>                       <int>
## 1 Aspirin        David Rodriguez                15
## 2 Aspirin        Lori Jimenez                   13
## 3 Ciprofloxacin  Lori Jimenez                   11
## 4 Ibuprofen      Michael Gonzalez               11
## 5 Aspirin        Tiffany Terry                  10
## Visualize output
ggplot(data=top_5_medications,mapping=aes(x=Name,y=Medication_Count,fill=MedicationName))+
  geom_col()+
  labs(title='Top 5 medications by Doctor')+
  theme(axis.text.x = element_text(angle=45,hjust=1))

#What is the average revenue generated by ward and procedure type?

# join the billing and admissions tables
billing_admissions <- left_join(billing,admissions,by='PatientID')
## Warning in left_join(billing, admissions, by = "PatientID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 71 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# calculate the average
derived_revenue <- billing_admissions %>%
  filter(!is.na(ReasonForAdmission))%>%
  group_by(WardAssigned,ReasonForAdmission)%>%
  summarise(Avg_rev=mean(Amount,na.rm=TRUE),.groups='drop')

#print results
print(derived_revenue)
## # A tibble: 20 × 3
##    WardAssigned    ReasonForAdmission Avg_rev
##    <chr>           <chr>                <dbl>
##  1 General Ward    Diabetes             5512.
##  2 General Ward    Fever                5575.
##  3 General Ward    Fracture             5441.
##  4 General Ward    Heart Condition      5151.
##  5 General Ward    Injury               5423.
##  6 ICU             Diabetes             5822.
##  7 ICU             Fever                4825.
##  8 ICU             Fracture             4695.
##  9 ICU             Heart Condition      5096.
## 10 ICU             Injury               5600.
## 11 Pediatrics Ward Diabetes             4981.
## 12 Pediatrics Ward Fever                5214.
## 13 Pediatrics Ward Fracture             5682.
## 14 Pediatrics Ward Heart Condition      4972.
## 15 Pediatrics Ward Injury               5198.
## 16 Surgical Ward   Diabetes             5382.
## 17 Surgical Ward   Fever                5307.
## 18 Surgical Ward   Fracture             4684.
## 19 Surgical Ward   Heart Condition      5268.
## 20 Surgical Ward   Injury               5477.
#visualize output
ggplot(data=derived_revenue,mapping=aes(x=ReasonForAdmission,y=Avg_rev,fill=WardAssigned))+
  geom_col(color='black',position='dodge') + labs(title='Average Rev generated by ward and Procedure')+
  theme(axis.text.x=element_text(angle=45,hjust=1))

#What are the peak days for elective and emergency admissions?

# ensure that the date column is in date format
admissions$AdmissionDate <- as.Date(admissions$AdmissionDate)

# extract date from date
admissions$day <- wday(admissions$AdmissionDate, label = TRUE, abbr=TRUE)

# establish peak days
peak_days <- admissions %>%
  group_by(AdmissionType,day)%>%
  summarise(peak_count=n(),.groups='drop')

# establish the top 5 peak dates
  top_days <- peak_days %>%
    group_by(AdmissionType)%>%
  slice_head(n=10)

# print results
print(top_days)
## # A tibble: 14 × 3
## # Groups:   AdmissionType [2]
##    AdmissionType day   peak_count
##    <chr>         <ord>      <int>
##  1 Elective      Sun           14
##  2 Elective      Mon           18
##  3 Elective      Tue           12
##  4 Elective      Wed           14
##  5 Elective      Thu           16
##  6 Elective      Fri           13
##  7 Elective      Sat           15
##  8 Emergency     Sun           13
##  9 Emergency     Mon           10
## 10 Emergency     Tue           14
## 11 Emergency     Wed           11
## 12 Emergency     Thu           14
## 13 Emergency     Fri           18
## 14 Emergency     Sat           18
# visualize output
ggplot(data=top_days, mapping=aes(x=day,y=peak_count,fill=AdmissionType))+
  geom_col(position='dodge')+labs(title='Peak days for admission types')

#How does a patient’s age affect the total amount billed for their
#treatment?
# join the patients table and the billing table
patients_billings <- right_join(patients,billing,by='PatientID')

# create a linear regression model
model1 <- lm(Amount~Age,data=patients_billings)
print(model1) #Positive Relationship: The slope indicates that as the patient's age increases, 
## 
## Call:
## lm(formula = Amount ~ Age, data = patients_billings)
## 
## Coefficients:
## (Intercept)          Age  
##    4843.578        8.045
              #the billed amount tends to increase by (8.045)
#Does the number of medications prescribed during a hospital stay impact the total bill?

# Join the billing and medications tables by PatientID
billing_medications <- left_join(billing, medications, by = 'PatientID')
## Warning in left_join(billing, medications, by = "PatientID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 127 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Count the number of medications per patient
medicine_count_per_patient <- billing_medications %>%
  filter(!is.na(MedicationName)) %>%
  group_by(PatientID) %>%
  summarise(medicine_count = n(), .groups = 'drop')

# Join the medicine_count_per_patient data with the billing data (to get total billed amount)
billing_data_with_meds <- left_join(billing, medicine_count_per_patient, by = 'PatientID')

# Build the linear regression model to predict the total amount based on medicine count
model2 <- lm(Amount ~ medicine_count, data = billing_data_with_meds)

# Print the results of the regression model
summary(model2)
## 
## Call:
## lm(formula = Amount ~ medicine_count, data = billing_data_with_meds)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5271.0 -2389.5   -77.2  2268.6  4964.4 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4889.03     312.29  15.655   <2e-16 ***
## medicine_count    24.52      16.85   1.455    0.147    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2889 on 290 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.007244,   Adjusted R-squared:  0.003821 
## F-statistic: 2.116 on 1 and 290 DF,  p-value: 0.1468
# Visualize output
ggplot(data=model2,mapping=aes(x=medicine_count,y=Amount))+
  geom_boxplot()+labs(title='Number of medicines and impact on billable amount')
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

#Can patient demographics (age, gender) or admission type predict
#whether a patient will have an emergency admission?

# join the admissions table and patient table
patients_admissions <- left_join(patients,admissions,by='PatientID')

# ensure that gender and admission type are factors
patients_admissions$AdmissionType <- as.factor(patients_admissions$AdmissionType)
patients_admissions$Gender <- as.factor(patients_admissions$Gender)

#calculate logistic regression
admission_model <- glm(AdmissionType~Age+Gender,data=patients_admissions,family=binomial)
print(admission_model)
## 
## Call:  glm(formula = AdmissionType ~ Age + Gender, family = binomial, 
##     data = patients_admissions)
## 
## Coefficients:
## (Intercept)          Age   GenderMale  
##    0.168874    -0.005538     0.210129  
## 
## Degrees of Freedom: 199 Total (i.e. Null);  197 Residual
##   (17 observations deleted due to missingness)
## Null Deviance:       277.2 
## Residual Deviance: 275.8     AIC: 281.8
#Does gender distributions differ significantly across admission types?
gender_admissions <- chisq.test(patients_admissions$Gender,patients_admissions$AdmissionType)
print(gender_admissions)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  patients_admissions$Gender and patients_admissions$AdmissionType
## X-squared = 0.26144, df = 1, p-value = 0.6091