library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw hospital records into `df` and preview the first six rows.
df <- read.csv(file = "hospital_dataset.csv")
head(df, n = 6L)
## Nama Tanggal_Lahir Tensi Skin_Stiffness_N_per_mm
## 1 Michael Anderson 01/04/1957 112/67 0.69
## 2 N/A 20/09/1975 140 / 91 1.50
## 3 Tan Wei Ming 12/04/1965 134/72 0.76
## 4 Shen Yi-Ching 11/09/1980 120/79 1.92
## 5 Kung Mei-Lin 22/08/1985 99/77 0.81
## 6 Ho Chuan-Wei 10/08/1962 149/65 0.61
## Microcirculation_PU Suhu_Tubuh_Celcius Penyakit Peak_Plantar_Pressure_kPa
## 1 42.0 37.6 Non-Diabetic 294.0
## 2 41.9 36.5°C Non-Diabetic NA
## 3 26.3 37.5 Non-Diabetic 431.8
## 4 NA 37.0 Diabetic 577.5
## 5 25.5 36.0 Diabetic 502.3
## 6 42.2 36.8 Non-Diabetic 201.4
Memuat dataset dan menampilkan enam baris pertama.
# Show each column's storage type together with a few sample values.
df %>% str()
## 'data.frame': 700 obs. of 8 variables:
## $ Nama : chr "Michael Anderson" "N/A" "Tan Wei Ming" "Shen Yi-Ching" ...
## $ Tanggal_Lahir : chr "01/04/1957" "20/09/1975" "12/04/1965" "11/09/1980" ...
## $ Tensi : chr "112/67" "140 / 91" "134/72" "120/79" ...
## $ Skin_Stiffness_N_per_mm : num 0.69 1.5 0.76 1.92 0.81 0.61 1.04 2.24 0.18 NA ...
## $ Microcirculation_PU : num 42 41.9 26.3 NA 25.5 42.2 2 9.5 24.8 40.9 ...
## $ Suhu_Tubuh_Celcius : chr "37.6" "36.5°C" "37.5" "37.0" ...
## $ Penyakit : chr "Non-Diabetic" "Non-Diabetic" "Non-Diabetic" "Diabetic" ...
## $ Peak_Plantar_Pressure_kPa: num 294 NA 432 578 502 ...
# Print per-column summary statistics (NA counts are shown for numeric columns).
df %>% summary()
## Nama Tanggal_Lahir Tensi
## Length:700 Length:700 Length:700
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## Min. : -2.180 Min. : -32.50 Length:700
## 1st Qu.: 0.700 1st Qu.: 18.00 Class :character
## Median : 1.100 Median : 27.70 Mode :character
## Mean : 1.342 Mean : 35.58
## 3rd Qu.: 1.595 3rd Qu.: 39.00
## Max. :150.000 Max. :5000.00
## NA's :37 NA's :50
## Penyakit Peak_Plantar_Pressure_kPa
## Length:700 Min. : -100.0
## Class :character 1st Qu.: 268.6
## Mode :character Median : 384.3
## Mean : 991.9
## 3rd Qu.: 508.5
## Max. :99999.0
## NA's :43
# Count the missing (NA) entries in every column.
df %>%
  is.na() %>%
  colSums()
## Nama Tanggal_Lahir Tensi
## 0 0 0
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## 37 50 0
## Penyakit Peak_Plantar_Pressure_kPa
## 0 43
Fungsi: melihat tipe data, ringkasan statistik, dan jumlah missing value pada tiap kolom.
# Remove exact duplicate rows and report the row count before and after.
# The "before" count is taken from the in-memory data frame prior to
# de-duplication, so the CSV does not have to be read from disk a second
# time just to count its rows.
jumlah_sebelum <- nrow(df)
df <- df %>% distinct()
jumlah_sesudah <- nrow(df)
jumlah_sebelum
## [1] 700
jumlah_sesudah
## [1] 698
Fungsi: Menghapus data duplikat
# Replace missing values in the two numeric columns below with that column's
# own median (computed with NAs ignored), then re-count NAs per column.
# Skin_Stiffness_N_per_mm is not imputed here, so its NAs remain.
# NOTE(review): this runs before the negative readings are blanked further
# down, so these columns show NAs again in the later colSums(); consider
# doing the invalid-value cleanup before imputing.
df[c("Microcirculation_PU", "Peak_Plantar_Pressure_kPa")] <- lapply(
  df[c("Microcirculation_PU", "Peak_Plantar_Pressure_kPa")],
  function(nilai) replace(nilai, is.na(nilai), median(nilai, na.rm = TRUE))
)
df %>%
  is.na() %>%
  colSums()
## Nama Tanggal_Lahir Tensi
## 0 0 0
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## 37 0 0
## Penyakit Peak_Plantar_Pressure_kPa
## 0 0
Fungsi: mengisi nilai kosong (NA) pada Microcirculation_PU dan Peak_Plantar_Pressure_kPa dengan median, karena median tidak terpengaruh outlier.
# Strip every character that is not a digit or a decimal point (e.g. the
# "°C" suffix) and store the temperature as a number. Strings left empty by
# the cleanup become NA. Note that the pattern also removes minus signs.
# NOTE(review): the later summary shows values of 1.0 and 99.9 in this
# column; confirm whether those readings need separate handling.
df$Suhu_Tubuh_Celcius <- as.numeric(
  gsub("[^0-9.]", "", df$Suhu_Tubuh_Celcius)
)
head(df["Suhu_Tubuh_Celcius"], n = 6L)
## Suhu_Tubuh_Celcius
## 1 37.6
## 2 36.5
## 3 37.5
## 4 37.0
## 5 36.0
## 6 36.8
Menghapus simbol (misalnya °C) lalu mengubah kolom suhu menjadi numerik.
# Preview the raw blood-pressure strings before they are split.
head(df[, "Tensi", drop = FALSE])
## Tensi
## 1 112/67
## 2 140 / 91
## 3 134/72
## 4 120/79
## 5 99/77
## 6 149/65
# Remove the blanks inside the readings (e.g. "140 / 91"), then split each
# reading at "/" into separate Sistolik and Diastolik text columns.
# Rows without a "/" get NA for the missing piece, which is what the
# warning printed after this step reports.
df$Tensi <- gsub(" ", "", df$Tensi, fixed = TRUE)
df <- separate(
  df,
  col = Tensi,
  into = c("Sistolik", "Diastolik"),
  sep = "/"
)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 58 rows [9, 26, 37, 43,
## 52, 78, 83, 103, 128, 130, 137, 144, 157, 163, 165, 190, 200, 209, 212, 214,
## ...].
# Convert the split text columns to numbers; entries that are not numeric
# text become NA, which is what the coercion warnings report.
df[["Sistolik"]] <- as.numeric(df[["Sistolik"]])
## Warning: NAs introduced by coercion
df[["Diastolik"]] <- as.numeric(df[["Diastolik"]])
## Warning: NAs introduced by coercion
# Preview the first rows after the blood-pressure column has been split.
df %>% head()
## Nama Tanggal_Lahir Sistolik Diastolik Skin_Stiffness_N_per_mm
## 1 Michael Anderson 01/04/1957 112 67 0.69
## 2 N/A 20/09/1975 140 91 1.50
## 3 Tan Wei Ming 12/04/1965 134 72 0.76
## 4 Shen Yi-Ching 11/09/1980 120 79 1.92
## 5 Kung Mei-Lin 22/08/1985 99 77 0.81
## 6 Ho Chuan-Wei 10/08/1962 149 65 0.61
## Microcirculation_PU Suhu_Tubuh_Celcius Penyakit Peak_Plantar_Pressure_kPa
## 1 42.0 37.6 Non-Diabetic 294.0
## 2 41.9 36.5 Non-Diabetic 379.6
## 3 26.3 37.5 Non-Diabetic 431.8
## 4 27.7 37.0 Diabetic 577.5
## 5 25.5 36.0 Diabetic 502.3
## 6 42.2 36.8 Non-Diabetic 201.4
Memecah tekanan darah (Tensi) menjadi kolom Sistolik dan Diastolik.
# Treat negative measurements as invalid and mark them as missing.
# which() drops the NA comparisons, so only genuinely negative entries
# are touched.
# NOTE(review): only negatives are removed; the summary below still shows
# extreme maxima (150, 5000, 99999). Confirm whether those need handling.
is.na(df$Skin_Stiffness_N_per_mm) <- which(df$Skin_Stiffness_N_per_mm < 0)
is.na(df$Microcirculation_PU) <- which(df$Microcirculation_PU < 0)
is.na(df$Peak_Plantar_Pressure_kPa) <- which(df$Peak_Plantar_Pressure_kPa < 0)
df %>% summary()
## Nama Tanggal_Lahir Sistolik Diastolik
## Length:698 Length:698 Min. : 78.0 Min. : 46.0
## Class :character Class :character 1st Qu.:111.5 1st Qu.: 71.0
## Mode :character Mode :character Median :121.0 Median : 78.0
## Mean :121.3 Mean : 78.4
## 3rd Qu.:131.0 3rd Qu.: 85.0
## Max. :164.0 Max. :113.0
## NA's :59 NA's :60
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## Min. : 0.100 Min. : 1.00 Min. : 1.00
## 1st Qu.: 0.700 1st Qu.: 19.52 1st Qu.:36.50
## Median : 1.100 Median : 27.70 Median :36.80
## Mean : 1.365 Mean : 35.65 Mean :36.84
## 3rd Qu.: 1.600 3rd Qu.: 37.98 3rd Qu.:37.10
## Max. :150.000 Max. :5000.00 Max. :99.90
## NA's :43 NA's :8 NA's :49
## Penyakit Peak_Plantar_Pressure_kPa
## Length:698 Min. : 0.001
## Class :character 1st Qu.: 281.450
## Mode :character Median : 379.600
## Mean : 961.846
## 3rd Qu.: 502.675
## Max. :99999.000
## NA's :4
Mengganti nilai negatif yang tidak masuk akal dengan NA.
# Collapse the free-text disease labels into three categories. Matching is
# case-insensitive; anything not listed (including missing values) is
# labelled "Other".
# NOTE(review): the table below puts 51 rows in "Other"; it is worth
# inspecting the raw labels behind them.
df$Penyakit <- tolower(df$Penyakit)
df$Penyakit <- case_when(
  df$Penyakit %in% c("diabetic", "dm", "yes", "1") ~ "Diabetic",
  df$Penyakit %in% c("non-diabetic", "no", "normal") ~ "Non-Diabetic",
  .default = "Other"
)
table(df$Penyakit)
##
## Diabetic Non-Diabetic Other
## 306 341 51
Menyatukan kategori penyakit menjadi Diabetic, Non-Diabetic, dan Other.
# Parse the birth dates (day/month/year) and derive each patient's age in
# completed years as of today.
# NOTE(review): values that do not parse as "%d/%m/%Y" become NA (64 rows in
# the NA count that follows); check whether other date formats occur.
df$Tanggal_Lahir <- as.Date(df$Tanggal_Lahir, format = "%d/%m/%Y")
# Measure age over a calendar interval rather than dividing a day count by
# 365: the fixed divisor ignores leap days, so ages tick over a couple of
# weeks before the actual birthday. lubridate is attached via tidyverse.
df$Umur <- floor(
  time_length(interval(df$Tanggal_Lahir, Sys.Date()), unit = "years")
)
head(df[, c("Nama", "Umur")])
## Nama Umur
## 1 Michael Anderson 69
## 2 N/A 50
## 3 Tan Wei Ming 61
## 4 Shen Yi-Ching 45
## 5 Kung Mei-Lin 40
## 6 Ho Chuan-Wei 63
# Re-count the missing (NA) entries per column after all cleaning steps.
df %>%
  is.na() %>%
  colSums()
## Nama Tanggal_Lahir Sistolik
## 0 64 59
## Diastolik Skin_Stiffness_N_per_mm Microcirculation_PU
## 60 43 8
## Suhu_Tubuh_Celcius Penyakit Peak_Plantar_Pressure_kPa
## 49 0 4
## Umur
## 64
# Mean systolic and diastolic pressure per disease category, ignoring NAs.
summarise(
  group_by(df, Penyakit),
  rata_sistolik = mean(Sistolik, na.rm = TRUE),
  rata_diastolik = mean(Diastolik, na.rm = TRUE)
)
## # A tibble: 3 × 3
## Penyakit rata_sistolik rata_diastolik
## <chr> <dbl> <dbl>
## 1 Diabetic 125. 82.1
## 2 Non-Diabetic 118. 75.1
## 3 Other 125. 78.6
# Preview the first rows of the cleaned data, including the derived Umur column.
df %>% head()
## Nama Tanggal_Lahir Sistolik Diastolik Skin_Stiffness_N_per_mm
## 1 Michael Anderson 1957-04-01 112 67 0.69
## 2 N/A 1975-09-20 140 91 1.50
## 3 Tan Wei Ming 1965-04-12 134 72 0.76
## 4 Shen Yi-Ching 1980-09-11 120 79 1.92
## 5 Kung Mei-Lin 1985-08-22 99 77 0.81
## 6 Ho Chuan-Wei 1962-08-10 149 65 0.61
## Microcirculation_PU Suhu_Tubuh_Celcius Penyakit Peak_Plantar_Pressure_kPa
## 1 42.0 37.6 Non-Diabetic 294.0
## 2 41.9 36.5 Non-Diabetic 379.6
## 3 26.3 37.5 Non-Diabetic 431.8
## 4 27.7 37.0 Diabetic 577.5
## 5 25.5 36.0 Diabetic 502.3
## 6 42.2 36.8 Non-Diabetic 201.4
## Umur
## 1 69
## 2 50
## 3 61
## 4 45
## 5 40
## 6 63