Packages

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(writexl)
library(openxlsx)

Import Data

data <- read_excel("D:/SEMESTER 5/PSD/Projek UTS/Data Mentah.xlsx")
View(data)

Cleaning Data

# Mengganti String Kosong menjadi NA
data[data == ""] <- NA
nrow(data)
## [1] 3019
# Menghapus Baris Jika Rating bernilai NA
data1 <- data %>%
  filter(!is.na(Rating))
nrow(data1)
## [1] 2651
# Menghapus Baris Jika Harga bernilai NA
data2 <- data1 %>%
  filter(!is.na(Harga))
nrow(data2)
## [1] 2649
# Menghapus Baris Jika Kota bernilai NA dan Show on Map
data3 <- data2 %>%
  filter(!is.na(Kota) & Kota != "Show on Map")
View(data3)
nrow(data3)
## [1] 1813
# Mengubah Data sesuai Kata Kunci pada Kolom Room
data4 <- data3 %>%
  mutate(
    Room = ifelse(grepl("apartment", Room, ignore.case = TRUE), "Apartment", Room),
    Room = ifelse(grepl("villa|vila|house|home|rumah", Room, ignore.case = TRUE), "Villa", Room),
    Room = ifelse(grepl("superior deluxe|suite|premier|premiere|first class", Room, ignore.case = TRUE), "Superior Deluxe Room", Room),
    Room = ifelse(grepl("superior|royal|executive", Room, ignore.case = TRUE), "Superior Room", Room),
    Room = ifelse(grepl("deluxe|premium|exclusive", Room, ignore.case = TRUE), "Deluxe Room", Room),
    Room = ifelse(grepl("studio", Room, ignore.case = TRUE), "Studio", Room),
    Room = ifelse(grepl("family", Room, ignore.case = TRUE), "Family Room", Room),
    Room = ifelse(grepl(("twin|double|standar|classic|pavilion|simple|singly|single"), Room, ignore.case = TRUE), "Standard Room", Room),
  )
# Menampilkan hasil
View(data4)
# Filter Kolom Room
room_types <- c("Apartment", "Villa", "Superior Deluxe Room", "Superior Room", 
                "Deluxe Room", "Studio", "Family Room", "Standard Room")

# Menghapus baris yang tidak sesuai dengan daftar room_types
data5 <- data4 %>%
  filter(Room %in% room_types)

# Menampilkan hasil
View(data5)
nrow(data5)
## [1] 1630
# Menghapus Baris Jika Bintang bernilai NA
data6 <- data5 %>%
  filter(!is.na(Bintang))
nrow(data5)
## [1] 1630
# Menghapus Baris Jika Review bernilai NA
data7 <- data6 %>%
  filter(!is.na(Review))
nrow(data7)
## [1] 1284
View(data7)
# Mengubah Data sesuai Kata Kunci pada Kolom Tag
data8 <- data7 %>%
  mutate(
    Tag = ifelse(grepl("4-hour Cancellation Window", Tag, ignore.case = TRUE), "Free Cancellation", Tag),
    Tag = ifelse(grepl("Breakfast & dinner included", Tag, ignore.case = TRUE), "Breakfast included", Tag),
  )

# Mengubah Data NA pada Kolom Tag
data9 <- data8 %>%
  mutate(Tag = ifelse(is.na(Tag), "Nothing", Tag))
View(data9)
data9 %>%
  count(`Bintang`)
## # A tibble: 5 × 2
##   Bintang         n
##   <chr>       <int>
## 1 Amazing       165
## 2 Good          117
## 3 Great         282
## 4 Outstanding    79
## 5 Very Good     641
data9 %>%
  count(`Kota`)
## # A tibble: 120 × 2
##    Kota                 n
##    <chr>            <int>
##  1 Andir                1
##  2 Arjuna               3
##  3 Babakan Ciamis       3
##  4 Babakan Penghulu     1
##  5 Bandarharjo          1
##  6 Bandung City        58
##  7 Bandung Regency      4
##  8 Bandung Wetan       13
##  9 Bandungan            4
## 10 Banyukuning          2
## # ℹ 110 more rows
data9 %>%
  count(`Room`)
## # A tibble: 7 × 2
##   Room              n
##   <chr>         <int>
## 1 Apartment        12
## 2 Deluxe Room     347
## 3 Family Room       7
## 4 Standard Room   281
## 5 Studio           28
## 6 Superior Room   448
## 7 Villa           161
data9 %>%
  count(`Tag`)
## # A tibble: 3 × 2
##   Tag                    n
##   <chr>              <int>
## 1 Breakfast included   238
## 2 Free Cancellation    693
## 3 Nothing              353

Periksa Data

# Banyaknya Data
nrow(data9)
## [1] 1284
# Memeriksa apakah ada nilai NA di dataset
anyNA(data9)
## [1] FALSE
# Memeriksa apakah ada baris duplikat
any(duplicated(data9))
## [1] FALSE

Simpan Data

# Menyimpan data yang sudah diolah ke file Excel
write_xlsx(data9, "Data Cleaning.xlsx")
write_xlsx(data9, "D:/SEMESTER 5/PSD/Projek UTS/Data Cleaning.xlsx")