library(readxl)
data <- read_excel("data_1.xlsx")
data
## # A tibble: 110,527 × 14
##    PatientId AppointmentID Gender ScheduledDay         AppointmentDay        Age
##        <dbl>         <dbl> <chr>  <chr>                <chr>               <dbl>
##  1   2.99e13       5642903 F      2016-04-29T18:38:08Z 2016-04-29T00:00:0…    62
##  2   5.59e14       5642503 M      2016-04-29T16:08:27Z 2016-04-29T00:00:0…    56
##  3   4.26e12       5642549 F      2016-04-29T16:19:04Z 2016-04-29T00:00:0…    62
##  4   8.68e11       5642828 F      2016-04-29T17:29:31Z 2016-04-29T00:00:0…     8
##  5   8.84e12       5642494 F      2016-04-29T16:07:23Z 2016-04-29T00:00:0…    56
##  6   9.60e13       5626772 F      2016-04-27T08:36:51Z 2016-04-29T00:00:0…    76
##  7   7.34e14       5630279 F      2016-04-27T15:05:12Z 2016-04-29T00:00:0…    23
##  8   3.45e12       5630575 F      2016-04-27T15:39:58Z 2016-04-29T00:00:0…    39
##  9   5.64e13       5638447 F      2016-04-29T08:02:16Z 2016-04-29T00:00:0…    21
## 10   7.81e13       5629123 F      2016-04-27T12:48:25Z 2016-04-29T00:00:0…    19
## # ℹ 110,517 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## #   Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## #   SMS_received <dbl>, `No-show` <chr>

#Membuat missing value

library(readxl)
data$Age[sample(1:nrow(data), 10)] <- NA
data
## # A tibble: 110,527 × 14
##    PatientId AppointmentID Gender ScheduledDay         AppointmentDay        Age
##        <dbl>         <dbl> <chr>  <chr>                <chr>               <dbl>
##  1   2.99e13       5642903 F      2016-04-29T18:38:08Z 2016-04-29T00:00:0…    62
##  2   5.59e14       5642503 M      2016-04-29T16:08:27Z 2016-04-29T00:00:0…    56
##  3   4.26e12       5642549 F      2016-04-29T16:19:04Z 2016-04-29T00:00:0…    62
##  4   8.68e11       5642828 F      2016-04-29T17:29:31Z 2016-04-29T00:00:0…     8
##  5   8.84e12       5642494 F      2016-04-29T16:07:23Z 2016-04-29T00:00:0…    56
##  6   9.60e13       5626772 F      2016-04-27T08:36:51Z 2016-04-29T00:00:0…    76
##  7   7.34e14       5630279 F      2016-04-27T15:05:12Z 2016-04-29T00:00:0…    23
##  8   3.45e12       5630575 F      2016-04-27T15:39:58Z 2016-04-29T00:00:0…    39
##  9   5.64e13       5638447 F      2016-04-29T08:02:16Z 2016-04-29T00:00:0…    21
## 10   7.81e13       5629123 F      2016-04-27T12:48:25Z 2016-04-29T00:00:0…    19
## # ℹ 110,517 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## #   Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## #   SMS_received <dbl>, `No-show` <chr>
colSums(is.na(data))
##      PatientId  AppointmentID         Gender   ScheduledDay AppointmentDay 
##              0              0              0              0              0 
##            Age  Neighbourhood    Scholarship   Hipertension       Diabetes 
##             10              0              0              0              0 
##     Alcoholism        Handcap   SMS_received        No-show 
##              0              0              0              0

#Membersihkan Missing Value

data_clean <- data[complete.cases(data), ]
data_clean
## # A tibble: 110,517 × 14
##    PatientId AppointmentID Gender ScheduledDay         AppointmentDay        Age
##        <dbl>         <dbl> <chr>  <chr>                <chr>               <dbl>
##  1   2.99e13       5642903 F      2016-04-29T18:38:08Z 2016-04-29T00:00:0…    62
##  2   5.59e14       5642503 M      2016-04-29T16:08:27Z 2016-04-29T00:00:0…    56
##  3   4.26e12       5642549 F      2016-04-29T16:19:04Z 2016-04-29T00:00:0…    62
##  4   8.68e11       5642828 F      2016-04-29T17:29:31Z 2016-04-29T00:00:0…     8
##  5   8.84e12       5642494 F      2016-04-29T16:07:23Z 2016-04-29T00:00:0…    56
##  6   9.60e13       5626772 F      2016-04-27T08:36:51Z 2016-04-29T00:00:0…    76
##  7   7.34e14       5630279 F      2016-04-27T15:05:12Z 2016-04-29T00:00:0…    23
##  8   3.45e12       5630575 F      2016-04-27T15:39:58Z 2016-04-29T00:00:0…    39
##  9   5.64e13       5638447 F      2016-04-29T08:02:16Z 2016-04-29T00:00:0…    21
## 10   7.81e13       5629123 F      2016-04-27T12:48:25Z 2016-04-29T00:00:0…    19
## # ℹ 110,507 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## #   Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## #   SMS_received <dbl>, `No-show` <chr>

#Cek lagi Missing Value

colSums(is.na(data_clean))
##      PatientId  AppointmentID         Gender   ScheduledDay AppointmentDay 
##              0              0              0              0              0 
##            Age  Neighbourhood    Scholarship   Hipertension       Diabetes 
##              0              0              0              0              0 
##     Alcoholism        Handcap   SMS_received        No-show 
##              0              0              0              0

#Cek Apakah data ada duplikat

library(readxl)
sum(duplicated(data))
## [1] 0

#Karna tidak ada duplikat maka kita buat duplikat

data <- rbind(data, data[5, ])
data
## # A tibble: 110,528 × 14
##    PatientId AppointmentID Gender ScheduledDay         AppointmentDay        Age
##        <dbl>         <dbl> <chr>  <chr>                <chr>               <dbl>
##  1   2.99e13       5642903 F      2016-04-29T18:38:08Z 2016-04-29T00:00:0…    62
##  2   5.59e14       5642503 M      2016-04-29T16:08:27Z 2016-04-29T00:00:0…    56
##  3   4.26e12       5642549 F      2016-04-29T16:19:04Z 2016-04-29T00:00:0…    62
##  4   8.68e11       5642828 F      2016-04-29T17:29:31Z 2016-04-29T00:00:0…     8
##  5   8.84e12       5642494 F      2016-04-29T16:07:23Z 2016-04-29T00:00:0…    56
##  6   9.60e13       5626772 F      2016-04-27T08:36:51Z 2016-04-29T00:00:0…    76
##  7   7.34e14       5630279 F      2016-04-27T15:05:12Z 2016-04-29T00:00:0…    23
##  8   3.45e12       5630575 F      2016-04-27T15:39:58Z 2016-04-29T00:00:0…    39
##  9   5.64e13       5638447 F      2016-04-29T08:02:16Z 2016-04-29T00:00:0…    21
## 10   7.81e13       5629123 F      2016-04-27T12:48:25Z 2016-04-29T00:00:0…    19
## # ℹ 110,518 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## #   Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## #   SMS_received <dbl>, `No-show` <chr>

#Cek Duplikat

sum(duplicated(data))
## [1] 1

#Bersihkan Duplikat

data_clean <- data[!duplicated(data), ]
data_clean
## # A tibble: 110,527 × 14
##    PatientId AppointmentID Gender ScheduledDay         AppointmentDay        Age
##        <dbl>         <dbl> <chr>  <chr>                <chr>               <dbl>
##  1   2.99e13       5642903 F      2016-04-29T18:38:08Z 2016-04-29T00:00:0…    62
##  2   5.59e14       5642503 M      2016-04-29T16:08:27Z 2016-04-29T00:00:0…    56
##  3   4.26e12       5642549 F      2016-04-29T16:19:04Z 2016-04-29T00:00:0…    62
##  4   8.68e11       5642828 F      2016-04-29T17:29:31Z 2016-04-29T00:00:0…     8
##  5   8.84e12       5642494 F      2016-04-29T16:07:23Z 2016-04-29T00:00:0…    56
##  6   9.60e13       5626772 F      2016-04-27T08:36:51Z 2016-04-29T00:00:0…    76
##  7   7.34e14       5630279 F      2016-04-27T15:05:12Z 2016-04-29T00:00:0…    23
##  8   3.45e12       5630575 F      2016-04-27T15:39:58Z 2016-04-29T00:00:0…    39
##  9   5.64e13       5638447 F      2016-04-29T08:02:16Z 2016-04-29T00:00:0…    21
## 10   7.81e13       5629123 F      2016-04-27T12:48:25Z 2016-04-29T00:00:0…    19
## # ℹ 110,517 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## #   Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## #   SMS_received <dbl>, `No-show` <chr>
sum(duplicated(data_clean))
## [1] 0

#Cek Apakah Data Memuat Outlier #Hitung Q1 dan

data_clear <- na.omit(data)
data_clear
## # A tibble: 110,518 × 14
##    PatientId AppointmentID Gender ScheduledDay         AppointmentDay        Age
##        <dbl>         <dbl> <chr>  <chr>                <chr>               <dbl>
##  1   2.99e13       5642903 F      2016-04-29T18:38:08Z 2016-04-29T00:00:0…    62
##  2   5.59e14       5642503 M      2016-04-29T16:08:27Z 2016-04-29T00:00:0…    56
##  3   4.26e12       5642549 F      2016-04-29T16:19:04Z 2016-04-29T00:00:0…    62
##  4   8.68e11       5642828 F      2016-04-29T17:29:31Z 2016-04-29T00:00:0…     8
##  5   8.84e12       5642494 F      2016-04-29T16:07:23Z 2016-04-29T00:00:0…    56
##  6   9.60e13       5626772 F      2016-04-27T08:36:51Z 2016-04-29T00:00:0…    76
##  7   7.34e14       5630279 F      2016-04-27T15:05:12Z 2016-04-29T00:00:0…    23
##  8   3.45e12       5630575 F      2016-04-27T15:39:58Z 2016-04-29T00:00:0…    39
##  9   5.64e13       5638447 F      2016-04-29T08:02:16Z 2016-04-29T00:00:0…    21
## 10   7.81e13       5629123 F      2016-04-27T12:48:25Z 2016-04-29T00:00:0…    19
## # ℹ 110,508 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## #   Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## #   SMS_received <dbl>, `No-show` <chr>
library(readxl)
Q1 <- quantile(data_clear$Age, 0.25)
Q1
## 25% 
##  18
Q3 <- quantile(data_clear$Age, 0.75)
Q3
## 75% 
##  55

#Hitung IQR

IQR_value <- Q3 - Q1
IQR_value
## 75% 
##  37

#Hitung Batas

lower_bound <- Q1 - 1.5 * IQR_value
lower_bound
##   25% 
## -37.5
upper_bound <- Q3 + 1.5 * IQR_value
upper_bound
##   75% 
## 110.5

#Menampilkan Outlier

outlier <- data_clear$Age[data_clear$Age < lower_bound | data_clear$Age > upper_bound]
outlier
## [1] 115 115 115 115 115

#Menghapus Outlier

data_no_outlier <- data_clear[
  data_clear$Age >= lower_bound & 
  data_clear$Age <= upper_bound,
]
data_no_outlier
## # A tibble: 110,513 × 14
##    PatientId AppointmentID Gender ScheduledDay         AppointmentDay        Age
##        <dbl>         <dbl> <chr>  <chr>                <chr>               <dbl>
##  1   2.99e13       5642903 F      2016-04-29T18:38:08Z 2016-04-29T00:00:0…    62
##  2   5.59e14       5642503 M      2016-04-29T16:08:27Z 2016-04-29T00:00:0…    56
##  3   4.26e12       5642549 F      2016-04-29T16:19:04Z 2016-04-29T00:00:0…    62
##  4   8.68e11       5642828 F      2016-04-29T17:29:31Z 2016-04-29T00:00:0…     8
##  5   8.84e12       5642494 F      2016-04-29T16:07:23Z 2016-04-29T00:00:0…    56
##  6   9.60e13       5626772 F      2016-04-27T08:36:51Z 2016-04-29T00:00:0…    76
##  7   7.34e14       5630279 F      2016-04-27T15:05:12Z 2016-04-29T00:00:0…    23
##  8   3.45e12       5630575 F      2016-04-27T15:39:58Z 2016-04-29T00:00:0…    39
##  9   5.64e13       5638447 F      2016-04-29T08:02:16Z 2016-04-29T00:00:0…    21
## 10   7.81e13       5629123 F      2016-04-27T12:48:25Z 2016-04-29T00:00:0…    19
## # ℹ 110,503 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## #   Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## #   SMS_received <dbl>, `No-show` <chr>

#Cek Apakah Masih Ada Outlier

summary(data_no_outlier$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.00   18.00   37.00   37.09   55.00  102.00

#Sebelumnya outlier 115, tapi disitu max 102 sehingga outlier sudah dihapus #Masih ada kesalahan dimana umur tidak mungkin -1 maka perlu dibenahi

data_no_outlier <- data_no_outlier[data_no_outlier$Age >= 0, ]
data_no_outlier
## # A tibble: 110,512 × 14
##    PatientId AppointmentID Gender ScheduledDay         AppointmentDay        Age
##        <dbl>         <dbl> <chr>  <chr>                <chr>               <dbl>
##  1   2.99e13       5642903 F      2016-04-29T18:38:08Z 2016-04-29T00:00:0…    62
##  2   5.59e14       5642503 M      2016-04-29T16:08:27Z 2016-04-29T00:00:0…    56
##  3   4.26e12       5642549 F      2016-04-29T16:19:04Z 2016-04-29T00:00:0…    62
##  4   8.68e11       5642828 F      2016-04-29T17:29:31Z 2016-04-29T00:00:0…     8
##  5   8.84e12       5642494 F      2016-04-29T16:07:23Z 2016-04-29T00:00:0…    56
##  6   9.60e13       5626772 F      2016-04-27T08:36:51Z 2016-04-29T00:00:0…    76
##  7   7.34e14       5630279 F      2016-04-27T15:05:12Z 2016-04-29T00:00:0…    23
##  8   3.45e12       5630575 F      2016-04-27T15:39:58Z 2016-04-29T00:00:0…    39
##  9   5.64e13       5638447 F      2016-04-29T08:02:16Z 2016-04-29T00:00:0…    21
## 10   7.81e13       5629123 F      2016-04-27T12:48:25Z 2016-04-29T00:00:0…    19
## # ℹ 110,502 more rows
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## #   Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## #   SMS_received <dbl>, `No-show` <chr>
summary(data_no_outlier$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   18.00   37.00   37.09   55.00  102.00

#Mencari Mean

mean(data_no_outlier$Age)
## [1] 37.08618

#Mencari Median

median(data_no_outlier$Age)
## [1] 37

#Mencari Modus

modus <- as.numeric(names(which.max(table(data_no_outlier$Age))))
modus
## [1] 0
table(data_no_outlier$Age)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 3538 2273 1618 1513 1299 1489 1519 1427 1424 1372 1274 1195 1092 1103 1118 1211 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
## 1402 1509 1487 1545 1437 1452 1375 1349 1242 1332 1283 1377 1448 1403 1520 1439 
##   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47 
## 1504 1524 1526 1378 1580 1533 1629 1536 1402 1346 1272 1344 1487 1453 1459 1394 
##   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63 
## 1399 1651 1613 1567 1746 1651 1530 1425 1636 1603 1469 1624 1411 1343 1312 1374 
##   64   65   66   67   68   69   70   71   72   73   74   75   76   77   78   79 
## 1331 1101 1187  973 1012  832  724  695  614  724  602  544  571  527  541  390 
##   80   81   82   83   84   85   86   87   88   89   90   91   92   93   94   95 
##  511  434  392  280  311  275  260  184  126  173  109   66   86   53   33   24 
##   96   97   98   99  100  102 
##   17   11    6    1    4    2