library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
# Load the hospital dataset from a CSV file in the working directory; column
# types are guessed by read_csv().
# NOTE(review): the preview printed below shows the literal string "N/A" in
# `Nama`, so that marker is apparently not read as missing here -- confirm
# whether it should be passed via the `na` argument.
datars <- read_csv("hospital_dataset.csv")
## Rows: 700 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Nama, Tanggal_Lahir, Tensi, Suhu_Tubuh_Celcius, Penyakit
## dbl (3): Skin_Stiffness_N_per_mm, Microcirculation_PU, Peak_Plantar_Pressure...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the raw data
head(datars)
## # A tibble: 6 × 8
## Nama Tanggal_Lahir Tensi Skin_Stiffness_N_per…¹ Microcirculation_PU
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Michael Anders… 01/04/1957 112/… 0.69 42
## 2 N/A 20/09/1975 140 … 1.5 41.9
## 3 Tan Wei Ming 12/04/1965 134/… 0.76 26.3
## 4 Shen Yi-Ching 11/09/1980 120/… 1.92 NA
## 5 Kung Mei-Lin 22/08/1985 99/77 0.81 25.5
## 6 Ho Chuan-Wei 10/08/1962 149/… 0.61 42.2
## # ℹ abbreviated name: ¹​Skin_Stiffness_N_per_mm
## # ℹ 3 more variables: Suhu_Tubuh_Celcius <chr>, Penyakit <chr>,
## # Peak_Plantar_Pressure_kPa <dbl>
# Show the internal structure: column types plus the readr column spec
str(datars)
## spc_tbl_ [700 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Nama : chr [1:700] "Michael Anderson" "N/A" "Tan Wei Ming" "Shen Yi-Ching" ...
## $ Tanggal_Lahir : chr [1:700] "01/04/1957" "20/09/1975" "12/04/1965" "11/09/1980" ...
## $ Tensi : chr [1:700] "112/67" "140 / 91" "134/72" "120/79" ...
## $ Skin_Stiffness_N_per_mm : num [1:700] 0.69 1.5 0.76 1.92 0.81 0.61 1.04 2.24 0.18 NA ...
## $ Microcirculation_PU : num [1:700] 42 41.9 26.3 NA 25.5 42.2 2 9.5 24.8 40.9 ...
## $ Suhu_Tubuh_Celcius : chr [1:700] "37.6" "36.5°C" "37.5" "37.0" ...
## $ Penyakit : chr [1:700] "Non-Diabetic" "Non-Diabetic" "Non-Diabetic" "Diabetic" ...
## $ Peak_Plantar_Pressure_kPa: num [1:700] 294 NA 432 578 502 ...
## - attr(*, "spec")=
## .. cols(
## .. Nama = col_character(),
## .. Tanggal_Lahir = col_character(),
## .. Tensi = col_character(),
## .. Skin_Stiffness_N_per_mm = col_double(),
## .. Microcirculation_PU = col_double(),
## .. Suhu_Tubuh_Celcius = col_character(),
## .. Penyakit = col_character(),
## .. Peak_Plantar_Pressure_kPa = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Compact, one-line-per-column overview of the same data
glimpse(datars)
## Rows: 700
## Columns: 8
## $ Nama <chr> "Michael Anderson", "N/A", "Tan Wei Ming", "…
## $ Tanggal_Lahir <chr> "01/04/1957", "20/09/1975", "12/04/1965", "1…
## $ Tensi <chr> "112/67", "140 / 91", "134/72", "120/79", "9…
## $ Skin_Stiffness_N_per_mm <dbl> 0.69, 1.50, 0.76, 1.92, 0.81, 0.61, 1.04, 2.…
## $ Microcirculation_PU <dbl> 42.0, 41.9, 26.3, NA, 25.5, 42.2, 2.0, 9.5, …
## $ Suhu_Tubuh_Celcius <chr> "37.6", "36.5°C", "37.5", "37.0", "36.0", "3…
## $ Penyakit <chr> "Non-Diabetic", "Non-Diabetic", "Non-Diabeti…
## $ Peak_Plantar_Pressure_kPa <dbl> 294.0, NA, 431.8, 577.5, 502.3, 201.4, 512.8…
# Descriptive statistics of the raw data; the numeric columns still contain
# the extreme values visible in the Min./Max. rows below
summary(datars)
## Nama Tanggal_Lahir Tensi
## Length:700 Length:700 Length:700
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## Min. : -2.180 Min. : -32.50 Length:700
## 1st Qu.: 0.700 1st Qu.: 18.00 Class :character
## Median : 1.100 Median : 27.70 Mode :character
## Mean : 1.342 Mean : 35.58
## 3rd Qu.: 1.595 3rd Qu.: 39.00
## Max. :150.000 Max. :5000.00
## NA's :37 NA's :50
## Penyakit Peak_Plantar_Pressure_kPa
## Length:700 Min. : -100.0
## Class :character 1st Qu.: 268.6
## Mode :character Median : 384.3
## Mean : 991.9
## 3rd Qu.: 508.5
## Max. :99999.0
## NA's :43
# List the variable (column) names
colnames(datars)
## [1] "Nama" "Tanggal_Lahir"
## [3] "Tensi" "Skin_Stiffness_N_per_mm"
## [5] "Microcirculation_PU" "Suhu_Tubuh_Celcius"
## [7] "Penyakit" "Peak_Plantar_Pressure_kPa"
# Count the missing values (NA) in every column
colSums(is.na(datars))
## Nama Tanggal_Lahir Tensi
## 40 42 47
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## 37 50 49
## Penyakit Peak_Plantar_Pressure_kPa
## 45 43
# DATA CLEANING
# Keep a complete-case copy (every row containing an NA dropped); `datars`
# itself is not changed by this line.
datars_hapus <- na.omit(datars)
# Fill the gaps in the three columns that were read as numeric with each
# column's own median (computed with the NAs ignored)
for (kol in c(
  "Skin_Stiffness_N_per_mm",
  "Microcirculation_PU",
  "Peak_Plantar_Pressure_kPa"
)) {
  datars[[kol]][is.na(datars[[kol]])] <- median(datars[[kol]], na.rm = TRUE)
}
# Outlier detection for Microcirculation_PU (quartiles and IQR): compute the
# 1.5 * IQR fences below Q1 and above Q3.
# NOTE(review): the variable `IQR` shares its name with stats::IQR(), and the
# fences are not used anywhere in the visible part of the script.
kuartil <- quantile(
  datars$Microcirculation_PU,
  probs = c(0.25, 0.75),
  na.rm = TRUE
)
Q1 <- kuartil[1]
Q3 <- kuartil[2]
IQR <- Q3 - Q1
batas_bawah <- Q1 - IQR * 1.5
batas_atas <- Q3 + IQR * 1.5
# INCONSISTENT NUMBER FORMATS
# Body temperature: strip every character that is not a digit or "." (this
# removes unit suffixes such as "°C", and would also remove a minus sign),
# then convert the remaining text to numeric. Text with nothing left becomes NA.
datars$Suhu_Tubuh_Celcius <- as.numeric(
  gsub("[^0-9.]", "", datars$Suhu_Tubuh_Celcius)
)
# Replace the missing temperatures with the column median
datars$Suhu_Tubuh_Celcius <- replace(
  datars$Suhu_Tubuh_Celcius,
  is.na(datars$Suhu_Tubuh_Celcius),
  median(datars$Suhu_Tubuh_Celcius, na.rm = TRUE)
)
# Split the blood-pressure text ("systolic/diastolic") into two columns.
# The guard makes the step a no-op when `Tensi` no longer exists, e.g. when
# this part of the script is run a second time.
if ("Tensi" %in% colnames(datars)) {
  datars <- separate(
    # drop the blanks first so that "140 / 91" splits the same way as "140/91"
    data = mutate(datars, Tensi = gsub(" ", "", Tensi)),
    col = Tensi,
    into = c("Sistolik", "Diastolik"),
    sep = "/"
  )
}
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 11 rows [78, 137, 440,
## 480, 488, 535, 544, 559, 624, 639, 653].
# TEXT INCONSISTENCIES (LETTER CASE AND WHITESPACE)
# Names: remove leading and trailing whitespace only
datars$Nama <- trimws(datars$Nama, which = "both")
# Diagnosis label: trim, lower-case, then convert to Title Case so that
# spellings differing only in case or padding collapse into one value
datars$Penyakit <- str_to_title(
  tolower(trimws(datars$Penyakit, which = "both"))
)
# Re-count the missing values per column after the steps so far
colSums(is.na(datars))
## Nama Tanggal_Lahir Sistolik
## 40 42 47
## Diastolik Skin_Stiffness_N_per_mm Microcirculation_PU
## 58 0 0
## Suhu_Tubuh_Celcius Penyakit Peak_Plantar_Pressure_kPa
## 0 45 0
# Convert both blood-pressure parts to numeric and fill the gaps with the
# column median
for (kol in c("Sistolik", "Diastolik")) {
  # as.numeric() turns entries that are not numbers into NA and emits the
  # warning "NAs introduced by coercion" (once per column)
  datars[[kol]] <- as.numeric(datars[[kol]])
  datars[[kol]] <- replace(
    datars[[kol]],
    is.na(datars[[kol]]),
    median(datars[[kol]], na.rm = TRUE)
  )
}
# Fill text columns with the mode.
#
# ModeFunc: most frequent non-missing value of `x`.
#   x       an atomic vector or factor; NA entries are ignored by table().
#   returns a length-1 character string with the most frequent value (values
#           are reported as text, whatever the type of `x`). On ties the
#           value that comes first in table()'s sorted order wins. When `x`
#           is empty or contains only NA, NA_character_ is returned, so the
#           result can always be used as a replacement value.
ModeFunc <- function(x) {
  frekuensi <- table(x)
  # Nothing to count: return a typed NA instead of relying on how an empty
  # table happens to subset
  if (length(frekuensi) == 0L) {
    return(NA_character_)
  }
  # which.max() picks the first maximum, so a full sort is not needed
  names(frekuensi)[which.max(frekuensi)]
}
# Replace the missing entries of the text columns (name, diagnosis label and
# date of birth) with the most frequent value of the respective column.
# NOTE(review): a literal "N/A" string in `Nama` counts as an ordinary value
# here, and copying the most common name / birth date onto other rows may not
# be meaningful -- confirm this is intended.
for (kol in c("Nama", "Penyakit", "Tanggal_Lahir")) {
  datars[[kol]][is.na(datars[[kol]])] <- ModeFunc(datars[[kol]])
}
# Verify that no column has missing values left
colSums(is.na(datars))
## Nama Tanggal_Lahir Sistolik
## 0 0 0
## Diastolik Skin_Stiffness_N_per_mm Microcirculation_PU
## 0 0 0
## Suhu_Tubuh_Celcius Penyakit Peak_Plantar_Pressure_kPa
## 0 0 0
# Summary after imputation; the Min./Max. rows below still show implausible
# values (e.g. negative readings), which are handled in the next step
summary(datars)
## Nama Tanggal_Lahir Sistolik Diastolik
## Length:700 Length:700 Min. : 78.0 Min. : 46.00
## Class :character Class :character 1st Qu.:113.0 1st Qu.: 72.00
## Mode :character Mode :character Median :121.0 Median : 78.00
## Mean :121.3 Mean : 78.39
## 3rd Qu.:130.0 3rd Qu.: 84.25
## Max. :164.0 Max. :113.00
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## Min. : -2.180 Min. : -32.50 Min. : 1.00
## 1st Qu.: 0.710 1st Qu.: 19.30 1st Qu.:36.50
## Median : 1.100 Median : 27.70 Median :36.80
## Mean : 1.330 Mean : 35.01 Mean :36.83
## 3rd Qu.: 1.562 3rd Qu.: 37.83 3rd Qu.:37.00
## Max. :150.000 Max. :5000.00 Max. :99.90
## Penyakit Peak_Plantar_Pressure_kPa
## Length:700 Min. : -100.0
## Class :character 1st Qu.: 276.8
## Mode :character Median : 384.3
## Mean : 954.6
## 3rd Qu.: 502.4
## Max. :99999.0
# Turn implausible readings into NA: every value outside the accepted
# [lower, upper] range of its column is blanked out.
datars$Skin_Stiffness_N_per_mm[
  with(datars, !(Skin_Stiffness_N_per_mm >= 0 & Skin_Stiffness_N_per_mm <= 10))
] <- NA
datars$Microcirculation_PU[
  with(datars, !(Microcirculation_PU >= 0 & Microcirculation_PU <= 100))
] <- NA
datars$Suhu_Tubuh_Celcius[
  with(datars, !(Suhu_Tubuh_Celcius >= 30 & Suhu_Tubuh_Celcius <= 45))
] <- NA
datars$Peak_Plantar_Pressure_kPa[
  with(datars, !(Peak_Plantar_Pressure_kPa >= 0 & Peak_Plantar_Pressure_kPa <= 1000))
] <- NA
# Show any systolic values above 250 (none were found when this was run)
datars$Sistolik[datars$Sistolik > 250]
## numeric(0)
# Systolic pressure: blank out values outside 70-250 and refill them with the
# median right away. Diastolik gets no range check in this step.
datars$Sistolik[!(datars$Sistolik >= 70 & datars$Sistolik <= 250)] <- NA
datars$Sistolik <- replace(
  datars$Sistolik,
  is.na(datars$Sistolik),
  median(datars$Sistolik, na.rm = TRUE)
)
# Median-fill once more: the range checks earlier in the script put new NAs
# into these four numeric columns
num <- c(
  "Skin_Stiffness_N_per_mm", "Microcirculation_PU",
  "Suhu_Tubuh_Celcius", "Peak_Plantar_Pressure_kPa"
)
for (i in num) {
  datars[[i]] <- replace(
    datars[[i]],
    is.na(datars[[i]]),
    median(datars[[i]], na.rm = TRUE)
  )
}
# Store the diagnosis label as a categorical variable
# NOTE(review): the summary printed below still lists labels such as "Dm",
# "No", "Tidak" and "1" as separate levels -- presumably spelling variants
# that should be merged; confirm the intended mapping.
datars$Penyakit <- factor(datars$Penyakit)
# Final summary of the cleaned data
summary(datars)
## Nama Tanggal_Lahir Sistolik Diastolik
## Length:700 Length:700 Min. : 78.0 Min. : 46.00
## Class :character Class :character 1st Qu.:113.0 1st Qu.: 72.00
## Mode :character Mode :character Median :121.0 Median : 78.00
## Mean :121.3 Mean : 78.39
## 3rd Qu.:130.0 3rd Qu.: 84.25
## Max. :164.0 Max. :113.00
##
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## Min. :0.100 Min. : 1.00 Min. :35.5
## 1st Qu.:0.720 1st Qu.:19.75 1st Qu.:36.5
## Median :1.100 Median :27.70 Median :36.8
## Mean :1.137 Mean :28.43 Mean :36.8
## 3rd Qu.:1.560 3rd Qu.:37.65 3rd Qu.:37.0
## Max. :2.900 Max. :77.30 Max. :42.5
##
## Penyakit Peak_Plantar_Pressure_kPa
## Non-Diabetic:382 Min. : 0.001
## Diabetic :299 1st Qu.:282.650
## Dm : 5 Median :384.300
## No : 3 Mean :388.101
## Tidak : 3 3rd Qu.:499.850
## 1 : 2 Max. :715.400
## (Other) : 6
# Bar chart: number of rows per diagnosis label
ggplot(data = datars, mapping = aes(x = Penyakit)) +
  geom_bar(fill = "steelblue")
# Histogram of body temperature, 20 bins
ggplot(data = datars, mapping = aes(x = Suhu_Tubuh_Celcius)) +
  geom_histogram(bins = 20, fill = "orange")
# Box plot of peak plantar (foot) pressure
ggplot(data = datars, mapping = aes(y = Peak_Plantar_Pressure_kPa)) +
  geom_boxplot(fill = "green")