library(readxl)
data <- read_excel("data_1.xlsx")
data
## # A tibble: 110,527 × 14
## PatientId AppointmentID Gender ScheduledDay AppointmentDay Age
## <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2.99e13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:0… 62
## 2 5.59e14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:0… 56
## 3 4.26e12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:0… 62
## 4 8.68e11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:0… 8
## 5 8.84e12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:0… 56
## 6 9.60e13 5626772 F 2016-04-27T08:36:51Z 2016-04-29T00:00:0… 76
## 7 7.34e14 5630279 F 2016-04-27T15:05:12Z 2016-04-29T00:00:0… 23
## 8 3.45e12 5630575 F 2016-04-27T15:39:58Z 2016-04-29T00:00:0… 39
## 9 5.64e13 5638447 F 2016-04-29T08:02:16Z 2016-04-29T00:00:0… 21
## 10 7.81e13 5629123 F 2016-04-27T12:48:25Z 2016-04-29T00:00:0… 19
## # ℹ 110,517 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## # Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## # SMS_received <dbl>, `No-show` <chr>
#Membuat missing value
library(readxl)
data$Age[sample(1:nrow(data), 10)] <- NA
data
## # A tibble: 110,527 × 14
## PatientId AppointmentID Gender ScheduledDay AppointmentDay Age
## <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2.99e13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:0… 62
## 2 5.59e14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:0… 56
## 3 4.26e12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:0… 62
## 4 8.68e11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:0… 8
## 5 8.84e12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:0… 56
## 6 9.60e13 5626772 F 2016-04-27T08:36:51Z 2016-04-29T00:00:0… 76
## 7 7.34e14 5630279 F 2016-04-27T15:05:12Z 2016-04-29T00:00:0… 23
## 8 3.45e12 5630575 F 2016-04-27T15:39:58Z 2016-04-29T00:00:0… 39
## 9 5.64e13 5638447 F 2016-04-29T08:02:16Z 2016-04-29T00:00:0… 21
## 10 7.81e13 5629123 F 2016-04-27T12:48:25Z 2016-04-29T00:00:0… 19
## # ℹ 110,517 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## # Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## # SMS_received <dbl>, `No-show` <chr>
colSums(is.na(data))
## PatientId AppointmentID Gender ScheduledDay AppointmentDay
## 0 0 0 0 0
## Age Neighbourhood Scholarship Hipertension Diabetes
## 10 0 0 0 0
## Alcoholism Handcap SMS_received No-show
## 0 0 0 0
#Membersihkan Missing Value
data_clean <- data[complete.cases(data), ]
data_clean
## # A tibble: 110,517 × 14
## PatientId AppointmentID Gender ScheduledDay AppointmentDay Age
## <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2.99e13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:0… 62
## 2 5.59e14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:0… 56
## 3 4.26e12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:0… 62
## 4 8.68e11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:0… 8
## 5 8.84e12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:0… 56
## 6 9.60e13 5626772 F 2016-04-27T08:36:51Z 2016-04-29T00:00:0… 76
## 7 7.34e14 5630279 F 2016-04-27T15:05:12Z 2016-04-29T00:00:0… 23
## 8 3.45e12 5630575 F 2016-04-27T15:39:58Z 2016-04-29T00:00:0… 39
## 9 5.64e13 5638447 F 2016-04-29T08:02:16Z 2016-04-29T00:00:0… 21
## 10 7.81e13 5629123 F 2016-04-27T12:48:25Z 2016-04-29T00:00:0… 19
## # ℹ 110,507 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## # Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## # SMS_received <dbl>, `No-show` <chr>
#Cek lagi Missing Value
colSums(is.na(data_clean))
## PatientId AppointmentID Gender ScheduledDay AppointmentDay
## 0 0 0 0 0
## Age Neighbourhood Scholarship Hipertension Diabetes
## 0 0 0 0 0
## Alcoholism Handcap SMS_received No-show
## 0 0 0 0
#Cek Apakah data ada duplikat
library(readxl)
sum(duplicated(data))
## [1] 0
#Karna tidak ada duplikat maka kita buat duplikat
data <- rbind(data, data[5, ])
data
## # A tibble: 110,528 × 14
## PatientId AppointmentID Gender ScheduledDay AppointmentDay Age
## <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2.99e13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:0… 62
## 2 5.59e14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:0… 56
## 3 4.26e12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:0… 62
## 4 8.68e11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:0… 8
## 5 8.84e12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:0… 56
## 6 9.60e13 5626772 F 2016-04-27T08:36:51Z 2016-04-29T00:00:0… 76
## 7 7.34e14 5630279 F 2016-04-27T15:05:12Z 2016-04-29T00:00:0… 23
## 8 3.45e12 5630575 F 2016-04-27T15:39:58Z 2016-04-29T00:00:0… 39
## 9 5.64e13 5638447 F 2016-04-29T08:02:16Z 2016-04-29T00:00:0… 21
## 10 7.81e13 5629123 F 2016-04-27T12:48:25Z 2016-04-29T00:00:0… 19
## # ℹ 110,518 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## # Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## # SMS_received <dbl>, `No-show` <chr>
#Cek Duplikat
sum(duplicated(data))
## [1] 1
#Bersihkan Duplikat
data_clean <- data[!duplicated(data), ]
data_clean
## # A tibble: 110,527 × 14
## PatientId AppointmentID Gender ScheduledDay AppointmentDay Age
## <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2.99e13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:0… 62
## 2 5.59e14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:0… 56
## 3 4.26e12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:0… 62
## 4 8.68e11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:0… 8
## 5 8.84e12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:0… 56
## 6 9.60e13 5626772 F 2016-04-27T08:36:51Z 2016-04-29T00:00:0… 76
## 7 7.34e14 5630279 F 2016-04-27T15:05:12Z 2016-04-29T00:00:0… 23
## 8 3.45e12 5630575 F 2016-04-27T15:39:58Z 2016-04-29T00:00:0… 39
## 9 5.64e13 5638447 F 2016-04-29T08:02:16Z 2016-04-29T00:00:0… 21
## 10 7.81e13 5629123 F 2016-04-27T12:48:25Z 2016-04-29T00:00:0… 19
## # ℹ 110,517 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## # Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## # SMS_received <dbl>, `No-show` <chr>
sum(duplicated(data_clean))
## [1] 0
#Cek Apakah Data Memuat Outlier #Hitung Q1 dan
data_clear <- na.omit(data)
data_clear
## # A tibble: 110,518 × 14
## PatientId AppointmentID Gender ScheduledDay AppointmentDay Age
## <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2.99e13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:0… 62
## 2 5.59e14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:0… 56
## 3 4.26e12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:0… 62
## 4 8.68e11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:0… 8
## 5 8.84e12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:0… 56
## 6 9.60e13 5626772 F 2016-04-27T08:36:51Z 2016-04-29T00:00:0… 76
## 7 7.34e14 5630279 F 2016-04-27T15:05:12Z 2016-04-29T00:00:0… 23
## 8 3.45e12 5630575 F 2016-04-27T15:39:58Z 2016-04-29T00:00:0… 39
## 9 5.64e13 5638447 F 2016-04-29T08:02:16Z 2016-04-29T00:00:0… 21
## 10 7.81e13 5629123 F 2016-04-27T12:48:25Z 2016-04-29T00:00:0… 19
## # ℹ 110,508 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## # Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## # SMS_received <dbl>, `No-show` <chr>
library(readxl)
Q1 <- quantile(data_clear$Age, 0.25)
Q1
## 25%
## 18
Q3 <- quantile(data_clear$Age, 0.75)
Q3
## 75%
## 55
#Hitung IQR
IQR_value <- Q3 - Q1
IQR_value
## 75%
## 37
#Hitung Batas
lower_bound <- Q1 - 1.5 * IQR_value
lower_bound
## 25%
## -37.5
upper_bound <- Q3 + 1.5 * IQR_value
upper_bound
## 75%
## 110.5
#Menampilkan Outlier
outlier <- data_clear$Age[data_clear$Age < lower_bound | data_clear$Age > upper_bound]
outlier
## [1] 115 115 115 115 115
#Menghapus Outlier
data_no_outlier <- data_clear[
data_clear$Age >= lower_bound &
data_clear$Age <= upper_bound,
]
data_no_outlier
## # A tibble: 110,513 × 14
## PatientId AppointmentID Gender ScheduledDay AppointmentDay Age
## <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2.99e13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:0… 62
## 2 5.59e14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:0… 56
## 3 4.26e12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:0… 62
## 4 8.68e11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:0… 8
## 5 8.84e12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:0… 56
## 6 9.60e13 5626772 F 2016-04-27T08:36:51Z 2016-04-29T00:00:0… 76
## 7 7.34e14 5630279 F 2016-04-27T15:05:12Z 2016-04-29T00:00:0… 23
## 8 3.45e12 5630575 F 2016-04-27T15:39:58Z 2016-04-29T00:00:0… 39
## 9 5.64e13 5638447 F 2016-04-29T08:02:16Z 2016-04-29T00:00:0… 21
## 10 7.81e13 5629123 F 2016-04-27T12:48:25Z 2016-04-29T00:00:0… 19
## # ℹ 110,503 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## # Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## # SMS_received <dbl>, `No-show` <chr>
#Cek Apakah Masih Ada Outlier
summary(data_no_outlier$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.00 18.00 37.00 37.09 55.00 102.00
#Sebelumnya outlier 115, tapi disitu max 102 sehingga outlier sudah dihapus #Masih ada kesalahan dimana umur tidak mungkin -1 maka perlu dibenahi
data_no_outlier <- data_no_outlier[data_no_outlier$Age >= 0, ]
data_no_outlier
## # A tibble: 110,512 × 14
## PatientId AppointmentID Gender ScheduledDay AppointmentDay Age
## <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2.99e13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:0… 62
## 2 5.59e14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:0… 56
## 3 4.26e12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:0… 62
## 4 8.68e11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:0… 8
## 5 8.84e12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:0… 56
## 6 9.60e13 5626772 F 2016-04-27T08:36:51Z 2016-04-29T00:00:0… 76
## 7 7.34e14 5630279 F 2016-04-27T15:05:12Z 2016-04-29T00:00:0… 23
## 8 3.45e12 5630575 F 2016-04-27T15:39:58Z 2016-04-29T00:00:0… 39
## 9 5.64e13 5638447 F 2016-04-29T08:02:16Z 2016-04-29T00:00:0… 21
## 10 7.81e13 5629123 F 2016-04-27T12:48:25Z 2016-04-29T00:00:0… 19
## # ℹ 110,502 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## # Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## # SMS_received <dbl>, `No-show` <chr>
summary(data_no_outlier$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 18.00 37.00 37.09 55.00 102.00
#Mencari Mean
mean(data_no_outlier$Age)
## [1] 37.08618
#Mencari Median
median(data_no_outlier$Age)
## [1] 37
#Mencari Modus
modus <- as.numeric(names(which.max(table(data_no_outlier$Age))))
modus
## [1] 0
table(data_no_outlier$Age)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 3538 2273 1618 1513 1299 1489 1519 1427 1424 1372 1274 1195 1092 1103 1118 1211
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 1402 1509 1487 1545 1437 1452 1375 1349 1242 1332 1283 1377 1448 1403 1520 1439
## 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
## 1504 1524 1526 1378 1580 1533 1629 1536 1402 1346 1272 1344 1487 1453 1459 1394
## 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
## 1399 1651 1613 1567 1746 1651 1530 1425 1636 1603 1469 1624 1411 1343 1312 1374
## 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
## 1331 1101 1187 973 1012 832 724 695 614 724 602 544 571 527 541 390
## 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
## 511 434 392 280 311 275 260 184 126 173 109 66 86 53 33 24
## 96 97 98 99 100 102
## 17 11 6 1 4 2