library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
# Load the raw hospital records; column types are guessed by readr.
data <- read_csv(file = "hospital_dataset.csv")
## Rows: 700 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Nama, Tanggal_Lahir, Tensi, Suhu_Tubuh_Celcius, Penyakit
## dbl (3): Skin_Stiffness_N_per_mm, Microcirculation_PU, Peak_Plantar_Pressure...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structural overview: one line per column with its type and first values.
data %>% glimpse()
## Rows: 700
## Columns: 8
## $ Nama <chr> "Michael Anderson", "N/A", "Tan Wei Ming", "…
## $ Tanggal_Lahir <chr> "1/4/1957", "20/09/1975", "12/4/1965", "11/9…
## $ Tensi <chr> "112/67", "140 / 91", "134/72", "120/79", "9…
## $ Skin_Stiffness_N_per_mm <dbl> 0.69, 1.50, 0.76, 1.92, 0.81, 0.61, 1.04, 2.…
## $ Microcirculation_PU <dbl> 42.0, 41.9, 26.3, NA, 25.5, 42.2, 2.0, 9.5, …
## $ Suhu_Tubuh_Celcius <chr> "37.6", "36.5°C", "37.5", "37", "36", "36.8"…
## $ Penyakit <chr> "Non-Diabetic", "Non-Diabetic", "Non-Diabeti…
## $ Peak_Plantar_Pressure_kPa <dbl> 294.0, NA, 431.8, 577.5, 502.3, 201.4, 512.8…
# Per-column summary statistics of the raw data.
data %>% summary()
## Nama Tanggal_Lahir Tensi
## Length:700 Length:700 Length:700
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## Min. : -2.180 Min. : -32.50 Length:700
## 1st Qu.: 0.700 1st Qu.: 18.00 Class :character
## Median : 1.100 Median : 27.70 Mode :character
## Mean : 1.342 Mean : 35.58
## 3rd Qu.: 1.595 3rd Qu.: 39.00
## Max. :150.000 Max. :5000.00
## NA's :37 NA's :50
## Penyakit Peak_Plantar_Pressure_kPa
## Length:700 Min. : -100.0
## Class :character 1st Qu.: 268.6
## Mode :character Median : 384.3
## Mean : 991.9
## 3rd Qu.: 508.5
## Max. :99999.0
## NA's :43
# Number of missing values in each raw column.
data %>%
  is.na() %>%
  colSums()
## Nama Tanggal_Lahir Tensi
## 40 42 47
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## 37 50 49
## Penyakit Peak_Plantar_Pressure_kPa
## 45 43
# Frequency of each patient name, most frequent first.
count(data, Nama, sort = TRUE)
## # A tibble: 59 × 2
## Nama n
## <chr> <int>
## 1 <NA> 40
## 2 Lu Hsiang-Ling 22
## 3 Ong Lay Kheng 21
## 4 Richard Martin 20
## 5 Tung Li-Fang 20
## 6 Chang Chung-Wei 18
## 7 Hsu Kuo-Chang 18
## 8 Tan Wei Ming 18
## 9 Barbara Taylor 16
## 10 Pan Mei-Hsuan 16
## # ℹ 49 more rows
# Frequency of each raw diagnosis label, most frequent first.
count(data, Penyakit, sort = TRUE)
## # A tibble: 16 × 2
## Penyakit n
## <chr> <int>
## 1 Non-Diabetic 334
## 2 Diabetic 294
## 3 <NA> 45
## 4 DM 5
## 5 DIABETIC 3
## 6 No 3
## 7 Tidak 3
## 8 1 2
## 9 Yes 2
## 10 diabetic 2
## 11 non-diabetic 2
## 12 Healthy 1
## 13 NON-DIABETIC 1
## 14 Normal 1
## 15 Sakit 1
## 16 Sehat 1
# List the non-missing temperature entries that contain any character other
# than a digit or a dot, with how often each occurs.
data %>%
  filter(!is.na(Suhu_Tubuh_Celcius)) %>%
  filter(str_detect(Suhu_Tubuh_Celcius, "[^0-9.]")) %>%
  count(Suhu_Tubuh_Celcius, sort = TRUE)
## # A tibble: 13 × 2
## Suhu_Tubuh_Celcius n
## <chr> <int>
## 1 36.5°C 3
## 2 -1 1
## 3 36.1 derajat 1
## 4 36.5 derajat 1
## 5 36.6 derajat 1
## 6 36.7 derajat 1
## 7 36.9 C 1
## 8 36.9°C 1
## 9 37.0 derajat 1
## 10 37.1celcius 1
## 11 37.2celcius 1
## 12 37.2°C 1
## 13 37.4°C 1
# List blood-pressure entries that are missing or do not have the plain
# "<2-3 digits>/<2-3 digits>" shape, with how often each occurs.
data %>%
  count(Tensi, sort = TRUE) %>%
  filter(is.na(Tensi) | !str_detect(Tensi, "^\\d{2,3}/\\d{2,3}$"))
## # A tibble: 22 × 2
## Tensi n
## <chr> <int>
## 1 <NA> 47
## 2 101 / 96 1
## 3 102 / 71 1
## 4 103mmHg/81 1
## 5 107|60 1
## 6 110 1
## 7 114mmHg/67 1
## 8 114|71 1
## 9 122-71 1
## 10 126/63 mmHg 1
## # ℹ 12 more rows
# Standardise the raw columns into `data_clean`:
#   * Nama               - trimmed; placeholder strings become NA.
#   * Penyakit           - free-text labels collapsed to "Diabetic" or
#                          "Non-Diabetic"; unrecognised labels are kept as-is.
#   * Suhu_Tubuh_Celcius - first numeric token of the text, as a number.
#   * Tanggal_Lahir      - parsed as day-month-year, falling back to year only.
#   * Tensi              - split into numeric Sistolik / Diastolik, then dropped.
#
# NOTE: the console output recorded further down in this file was produced by
# the previous version of this step; re-run the script to refresh it.
data_clean <- data %>%
  mutate(
    # Store the trimmed name (it was previously trimmed only for the
    # comparison) and match placeholders case-insensitively.
    Nama = str_trim(Nama),
    Nama = if_else(
      str_to_upper(Nama) %in%
        c("N/A", "UNKNOWN", "PASIEN", ".", "123456", "???", "NULL"),
      NA_character_,
      Nama
    ),
    # Compare on a trimmed, lower-cased key so every casing of a known label
    # is recognised without listing each variant by hand.
    .penyakit_key = str_to_lower(str_trim(Penyakit)),
    Penyakit = case_when(
      .penyakit_key %in% c("diabetic", "dm", "yes", "1", "sakit") ~ "Diabetic",
      .penyakit_key %in%
        c("non-diabetic", "no", "tidak", "healthy", "normal", "sehat") ~
        "Non-Diabetic",
      TRUE ~ Penyakit
    ),
    # Extract the first signed number instead of deleting every character
    # that is not a digit or dot: deleting also removed the minus sign, which
    # turned an invalid reading such as "-1" into 1. A decimal comma is
    # treated as a decimal point.
    Suhu_Tubuh_Celcius = Suhu_Tubuh_Celcius %>%
      str_replace(",", ".") %>%
      str_extract("-?\\d+(?:\\.\\d+)?") %>%
      as.numeric(),
    Tanggal_Lahir = parse_date_time(Tanggal_Lahir, orders = c("dmy", "y")),
    # Normalise every separator variant to "/" and strip units/labels.
    Tensi_bersih = Tensi %>%
      str_replace_all(" Dia:", "/") %>%
      str_replace_all("[|-]", "/") %>%
      str_replace_all("[^0-9/]", ""),
    # Entries without a separator give an empty second part, i.e. NA diastolic.
    Sistolik = as.numeric(str_split_fixed(Tensi_bersih, "/", 2)[, 1]),
    Diastolik = as.numeric(str_split_fixed(Tensi_bersih, "/", 2)[, 2])
  ) %>%
  select(-c(Tensi, Tensi_bersih, .penyakit_key))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Tanggal_Lahir = parse_date_time(Tanggal_Lahir, orders =
## c("dmy", "y"))`.
## Caused by warning:
## ! 3 failed to parse.
# Check that the diagnosis labels collapsed to the expected categories.
count(data_clean, Penyakit, sort = TRUE)
## # A tibble: 3 × 2
## Penyakit n
## <chr> <int>
## 1 Non-Diabetic 346
## 2 Diabetic 309
## 3 <NA> 45
# Preview the first five rows of the split blood-pressure columns.
head(select(data_clean, Nama, Sistolik, Diastolik), n = 5)
## # A tibble: 5 × 3
## Nama Sistolik Diastolik
## <chr> <dbl> <dbl>
## 1 Michael Anderson 112 67
## 2 <NA> 140 91
## 3 Tan Wei Ming 134 72
## 4 Shen Yi-Ching 120 79
## 5 Kung Mei-Lin 99 77
# Summary statistics of the columns that were converted during cleaning.
summary(
  select(data_clean, Suhu_Tubuh_Celcius, Sistolik, Diastolik, Tanggal_Lahir)
)
## Suhu_Tubuh_Celcius Sistolik Diastolik
## Min. : 1.00 Min. : 78.0 Min. : 46.00
## 1st Qu.:36.50 1st Qu.:111.0 1st Qu.: 71.00
## Median :36.80 Median :121.0 Median : 78.00
## Mean :36.84 Mean :121.3 Mean : 78.28
## 3rd Qu.:37.10 3rd Qu.:131.0 3rd Qu.: 85.00
## Max. :99.90 Max. :164.0 Max. :113.00
## NA's :49 NA's :47 NA's :49
## Tanggal_Lahir
## Min. :1940-02-21 00:00:00
## 1st Qu.:1956-09-09 00:00:00
## Median :1973-10-18 00:00:00
## Mean :1973-09-24 19:38:22
## 3rd Qu.:1989-10-02 00:00:00
## Max. :2067-02-02 00:00:00
## NA's :45
# Drop rows missing diagnosis, birth date or either blood-pressure value, then
# fill the remaining gaps in the numeric measurements with each column's median
# (computed over the rows that survive the drop).
# NOTE(review): medians are taken before the out-of-range values are filtered
# out later in the script; confirm that ordering is intended.
data_clean2 <- data_clean %>%
  drop_na(Penyakit, Tanggal_Lahir, Sistolik, Diastolik) %>%
  mutate(
    across(
      c(
        Suhu_Tubuh_Celcius,
        Skin_Stiffness_N_per_mm,
        Microcirculation_PU,
        Peak_Plantar_Pressure_kPa
      ),
      ~ replace_na(.x, median(.x, na.rm = TRUE))
    )
  )
# Remaining missing values per column after dropping and imputing.
data_clean2 %>%
  is.na() %>%
  colSums()
## Nama Tanggal_Lahir Skin_Stiffness_N_per_mm
## 42 0 0
## Microcirculation_PU Suhu_Tubuh_Celcius Penyakit
## 0 0 0
## Peak_Plantar_Pressure_kPa Sistolik Diastolik
## 0 0 0
# Report how many rows are exact copies of an earlier row.
cat("Jumlah baris duplikat:", data_clean2 %>% duplicated() %>% sum(), "\n")
## Jumlah baris duplikat: 4
# Keep only the first occurrence of each fully identical row, then report the
# remaining row count.
data_clean2 <- distinct(data_clean2)
cat("Baris setelah hapus duplikat:", dim(data_clean2)[1], "\n")
## Baris setelah hapus duplikat: 565
# Draw one boxplot per numeric measurement in a 2 x 3 grid to eyeball outliers
# before filtering, then return to a single-panel layout.
par(mfrow = c(2, 3))
invisible(Map(
  function(kolom, judul, warna) {
    boxplot(data_clean2[[kolom]], main = judul, col = warna)
  },
  c(
    "Sistolik", "Diastolik", "Suhu_Tubuh_Celcius",
    "Skin_Stiffness_N_per_mm", "Microcirculation_PU",
    "Peak_Plantar_Pressure_kPa"
  ),
  c(
    "Sistolik", "Diastolik", "Suhu Tubuh",
    "Skin Stiffness", "Microcirculation", "Peak Plantar"
  ),
  c("orange", "lightblue", "pink", "lightgreen", "yellow", "grey")
))

par(mfrow = c(1, 1))
# Keep only rows whose values are plausible. All numeric bounds are exclusive.
data_final <- data_clean2 %>%
  filter(
    # A birth date in the future is impossible; the parsed dates reach 2067,
    # and nothing else in the script rejects them.
    # NOTE(review): if those turn out to be two-digit years resolved to the
    # wrong century, correct them at parse time instead of dropping the rows.
    as.Date(Tanggal_Lahir) <= Sys.Date(),
    Suhu_Tubuh_Celcius > 30 & Suhu_Tubuh_Celcius < 45,
    Sistolik > 40 & Sistolik < 250,
    Diastolik > 30 & Diastolik < 150,
    # Negative or zero readings and extreme values (e.g. 99999) are treated
    # as invalid for the three sensor measurements.
    Skin_Stiffness_N_per_mm > 0 & Skin_Stiffness_N_per_mm < 100,
    Microcirculation_PU > 0 & Microcirculation_PU < 1000,
    Peak_Plantar_Pressure_kPa > 0 & Peak_Plantar_Pressure_kPa < 10000
  )
# Same 2 x 3 grid of boxplots, now on the filtered data, to confirm the
# extreme values are gone.
par(mfrow = c(2, 3))
with(data_final, {
  boxplot(Sistolik, main = "Sistolik", col = "orange")
  boxplot(Diastolik, main = "Diastolik", col = "lightblue")
  boxplot(Suhu_Tubuh_Celcius, main = "Suhu Tubuh", col = "pink")
  boxplot(Skin_Stiffness_N_per_mm, main = "Skin Stiffness", col = "lightgreen")
  boxplot(Microcirculation_PU, main = "Microcirculation", col = "yellow")
  boxplot(Peak_Plantar_Pressure_kPa, main = "Peak Plantar", col = "grey")
})
