library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
tidyverse → kumpulan package untuk olah data & visualisasi; readr → membantu membaca file CSV (sudah termasuk dalam tidyverse).
# Load the raw healthcare records from the CSV file in the working directory,
# then preview the first six rows to check that the import looks right.
data <- read.csv(file = "healthcare_dataset.csv")
head(x = data, n = 6L)
## patient_id age gender admission_date diagnosis treatment
## 1 P000001 69 F 2024-01-29 00:00:00 Respiratory Conservative
## 2 P000002 19 F 2023-04-19 00:00:00 Cardiac Therapy
## 3 P000003 35 F 2023-09-12 00:00:00 Infection Therapy
## 4 P000004 52 F 2024-03-02 00:00:00 Respiratory Conservative
## 5 P000005 56 M 2024-09-27 00:00:00 Respiratory Conservative
## 6 P000006 18 M 2023-07-16 00:00:00 Trauma Therapy
## length_of_stay satisfaction
## 1 4 2
## 2 57 4
## 3 52 6
## 4 142 5
## 5 4 6
## 6 4 2
# Show the structure of the raw data frame: number of rows and columns, plus
# each column's type and first few values.
str(data)
## 'data.frame': 1000 obs. of 8 variables:
## $ patient_id : chr "P000001" "P000002" "P000003" "P000004" ...
## $ age : int 69 19 35 52 56 18 54 38 18 21 ...
## $ gender : chr "F" "F" "F" "F" ...
## $ admission_date: chr "2024-01-29 00:00:00" "2023-04-19 00:00:00" "2023-09-12 00:00:00" "2024-03-02 00:00:00" ...
## $ diagnosis : chr "Respiratory" "Cardiac" "Infection" "Respiratory" ...
## $ treatment : chr "Conservative" "Therapy" "Therapy" "Conservative" ...
## $ length_of_stay: int 4 57 52 142 4 4 39 21 44 2 ...
## $ satisfaction : int 2 4 6 5 6 2 7 8 9 6 ...
# Per-column summary: quartiles, mean and NA count for numeric columns, and
# length/class for character columns.
# NOTE(review): the rendered output reports a maximum age of 999, which looks
# like a placeholder or data-entry error -- confirm before analysing age.
summary(data)
## patient_id age gender admission_date
## Length:1000 Min. : 5.00 Length:1000 Length:1000
## Class :character 1st Qu.: 33.00 Class :character Class :character
## Mode :character Median : 51.00 Mode :character Mode :character
## Mean : 54.71
## 3rd Qu.: 69.00
## Max. :999.00
##
## diagnosis treatment length_of_stay satisfaction
## Length:1000 Length:1000 Min. : 1.00 Min. : 1.000
## Class :character Class :character 1st Qu.: 16.00 1st Qu.: 3.000
## Mode :character Mode :character Median : 29.00 Median : 6.000
## Mean : 31.46 Mean : 5.579
## 3rd Qu.: 45.00 3rd Qu.: 8.000
## Max. :193.00 Max. :10.000
## NA's :15
Untuk melihat apakah ada data kosong.
# Count the missing values in every column: flag each cell as NA / not NA,
# then add up the flags column by column.
data %>%
  is.na() %>%
  colSums()
## patient_id age gender admission_date diagnosis
## 0 0 0 0 0
## treatment length_of_stay satisfaction
## 0 0 15
Menghapus baris yang punya data kosong.
# Drop every row that contains at least one missing value, then preview the
# first rows of the result.
data1 <- data %>%
  na.omit()
data1 %>%
  head()
## patient_id age gender admission_date diagnosis treatment
## 1 P000001 69 F 2024-01-29 00:00:00 Respiratory Conservative
## 2 P000002 19 F 2023-04-19 00:00:00 Cardiac Therapy
## 3 P000003 35 F 2023-09-12 00:00:00 Infection Therapy
## 4 P000004 52 F 2024-03-02 00:00:00 Respiratory Conservative
## 5 P000005 56 M 2024-09-27 00:00:00 Respiratory Conservative
## 6 P000006 18 M 2023-07-16 00:00:00 Trauma Therapy
## length_of_stay satisfaction
## 1 4 2
## 2 57 4
## 3 52 6
## 4 142 5
## 5 4 6
## 6 4 2
data1: dataset setelah proses penghapusan missing value.
Menghapus data yang sama/duplikat
# Keep one copy of each fully identical row (all columns are compared), then
# preview the first rows of the de-duplicated data.
data2 <- data1 %>%
  distinct()
data2 %>%
  head()
## patient_id age gender admission_date diagnosis treatment
## 1 P000001 69 F 2024-01-29 00:00:00 Respiratory Conservative
## 2 P000002 19 F 2023-04-19 00:00:00 Cardiac Therapy
## 3 P000003 35 F 2023-09-12 00:00:00 Infection Therapy
## 4 P000004 52 F 2024-03-02 00:00:00 Respiratory Conservative
## 5 P000005 56 M 2024-09-27 00:00:00 Respiratory Conservative
## 6 P000006 18 M 2023-07-16 00:00:00 Trauma Therapy
## length_of_stay satisfaction
## 1 4 2
## 2 57 4
## 3 52 6
## 4 142 5
## 5 4 6
## 6 4 2
data2: data setelah proses cleaning
Kita bikin kategori umur supaya insight lebih menarik.
# Add an age-group label (Kategori_Umur) to every patient.
#
# The catch-all "Lansia" branch must only receive real ages of 60 and above.
# Without a guard, impossible values (summary(data) reports a maximum age of
# 999) and missing ages would silently be counted as elderly patients, so they
# are labelled NA instead.
# NOTE(review): 120 is an assumed upper bound for a plausible human age --
# adjust if the data dictionary defines a different limit.
max_plausible_age <- 120

data3 <- data2 %>%
  mutate(
    Kategori_Umur = case_when(
      is.na(age) | age < 0 | age > max_plausible_age ~ NA_character_,
      age < 20 ~ "Remaja",
      age < 40 ~ "Dewasa Muda",
      age < 60 ~ "Dewasa",
      TRUE ~ "Lansia"
    )
  )
head(data3)
## patient_id age gender admission_date diagnosis treatment
## 1 P000001 69 F 2024-01-29 00:00:00 Respiratory Conservative
## 2 P000002 19 F 2023-04-19 00:00:00 Cardiac Therapy
## 3 P000003 35 F 2023-09-12 00:00:00 Infection Therapy
## 4 P000004 52 F 2024-03-02 00:00:00 Respiratory Conservative
## 5 P000005 56 M 2024-09-27 00:00:00 Respiratory Conservative
## 6 P000006 18 M 2023-07-16 00:00:00 Trauma Therapy
## length_of_stay satisfaction Kategori_Umur
## 1 4 2 Lansia
## 2 57 4 Remaja
## 3 52 6 Dewasa Muda
## 4 142 5 Dewasa
## 5 4 6 Dewasa
## 6 4 2 Remaja
data3: dataset setelah penambahan kategori umur.
# Bar chart: number of patients per diagnosis.
# geom_bar() counts the rows in each diagnosis group; the x-axis labels are
# rotated 45 degrees and right-aligned so they do not overlap.
data3 %>%
  ggplot(mapping = aes(x = diagnosis)) +
  geom_bar(fill = "powderblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    y = "Jumlah Pasien",
    x = "Diagnosis",
    title = "Jumlah Pasien Berdasarkan Diagnosis"
  )
Insight Visualisasi 1 — Jumlah Pasien Berdasarkan Diagnosis
“Berdasarkan visualisasi data, diagnosis trauma memiliki jumlah pasien paling tinggi dibanding diagnosis lainnya, sedangkan diagnosis cardiac memiliki jumlah pasien paling sedikit. Hal ini menunjukkan bahwa kasus trauma menjadi kondisi medis yang paling sering ditangani pada dataset tersebut.”
# Bar chart: number of patients in each age group.
# Kategori_Umur is a character column, so ggplot would sort the bars
# alphabetically (Dewasa, Dewasa Muda, Lansia, Remaja). Mapping it as a factor
# with explicit levels puts the bars in age order, youngest to oldest.
age_group_levels <- c("Remaja", "Dewasa Muda", "Dewasa", "Lansia")

ggplot(data3, aes(x = factor(Kategori_Umur, levels = age_group_levels))) +
  geom_bar(fill = "mistyrose1") +
  labs(
    title = "Distribusi Kelompok Umur Pasien",
    x = "Kategori Umur",
    y = "Jumlah Pasien"
  )
Insight Visualisasi 2 — Distribusi Kelompok Umur Pasien
“Berdasarkan visualisasi data, kelompok usia lansia memiliki jumlah pasien paling banyak, sedangkan kelompok remaja memiliki jumlah pasien paling sedikit. Selain itu, jumlah pasien pada kelompok dewasa muda juga relatif tinggi dibanding kelompok usia dewasa.”
# Boxplot: distribution of length of stay for each diagnosis.
# One box per diagnosis; the x-axis labels are rotated 45 degrees and
# right-aligned so they do not overlap.
data3 %>%
  ggplot(mapping = aes(x = diagnosis, y = length_of_stay)) +
  geom_boxplot(fill = "thistle1") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    y = "Length of Stay",
    x = "Diagnosis",
    title = "Lama Rawat Inap Berdasarkan Diagnosis"
  )
Insight Visualisasi 3 — Lama Rawat Inap Berdasarkan Diagnosis
“Berdasarkan boxplot, distribusi lama rawat inap pada setiap diagnosis cenderung relatif serupa, terlihat dari median yang tidak berbeda jauh antar kelompok diagnosis. Namun, terdapat beberapa nilai ekstrem (outlier), terutama pada diagnosis infection dan cardiac, yang menunjukkan adanya pasien dengan lama rawat inap jauh lebih lama dibanding mayoritas pasien lainnya.”
Karena satisfaction berupa angka/rating (1–10), distribusinya ditampilkan dengan diagram batang:
# Bar chart: number of patients for each satisfaction rating.
# satisfaction is an integer rating, so it is mapped as a factor. This gives a
# discrete axis with one labelled tick per rating, instead of a continuous
# axis whose automatic breaks can fall between ratings.
ggplot(data3, aes(x = factor(satisfaction))) +
  geom_bar(fill = "lemonchiffon1") +
  labs(
    title = "Tingkat Kepuasan Pasien",
    x = "Satisfaction",
    y = "Jumlah Pasien"
  )
Insight Visualisasi 4 — Tingkat Kepuasan Pasien
“Berdasarkan visualisasi data, tingkat kepuasan pasien cenderung tersebar cukup merata pada setiap kategori penilaian. Namun, nilai kepuasan 9 memiliki jumlah pasien paling tinggi dibanding tingkat kepuasan lainnya. Hal ini menunjukkan bahwa penilaian yang paling sering diberikan tergolong tinggi, meskipun sebaran yang merata (median 6 pada ringkasan data) berarti tidak dapat disimpulkan bahwa sebagian besar pasien merasa puas.”