library(dplyr)

# Load the built-in Titanic contingency table and flatten it into a data
# frame with one row per factor combination and a Freq count column.
data("Titanic")
titanic <- as.data.frame(Titanic)

# Read the house price data from the working directory. `house_price` keeps
# the data as read; `df` is the copy used by the steps below.
house_price <- read.csv("house_price.csv")
df <- data.frame(house_price)

# Check for missing values: count the NA entries in each column of df.
colSums(is.na(df))

# Check for duplicates: count the rows of df that repeat an earlier row.
sum(duplicated(df))

# Built-in R dataset `airquality`: count the NA entries in each column.
colSums(is.na(airquality))

# Replace the NA values in the Ozone and Solar.R columns with each column's
# median, computed over the non-missing values.
ozone_median <- median(airquality$Ozone, na.rm = TRUE)
airquality$Ozone[is.na(airquality$Ozone)] <- ozone_median

solar_median <- median(airquality$Solar.R, na.rm = TRUE)
airquality$Solar.R[is.na(airquality$Solar.R)] <- solar_median

# Re-check airquality after the missing values were replaced: NA count per
# column, then the number of duplicated rows.
colSums(is.na(airquality))
sum(duplicated(airquality))

# select: keep only the price, bedrooms, bathrooms and floors columns of df.
select_data <- select(df, price, bedrooms, bathrooms, floors)
head(select_data)

# filter: keep the rows whose price is greater than 350,000.
harga_lebih_350000 <- filter(house_price, price > 350000)

# Verify the filter took effect by comparing the length of the price column
# in the filtered data against the unfiltered df.
length(harga_lebih_350000$price)
length(df$price)

# rename: change the column `price` to `harga` and `country` to `negara`.
rename_feature <- rename(df, harga = price, negara = country)
head(rename_feature)

# mutate: add a `proporsi` column to titanic holding each row's Freq as a
# share of the total Freq.
head(titanic)
titanic_mutate <- mutate(titanic, proporsi = Freq / sum(Freq))
head(titanic_mutate)

# join: attach the new `Kapal` column from the lookup table `extra` to
# titanic, matching rows on Class. Rows of titanic are all kept (left join).
extra <- data.frame(
  Class = c("1st", "2nd", "3rd", "Crew"),
  Kapal = c("titanic", "titanic", "titanic", "titanic")
)
titanic_joined <- left_join(titanic, extra, by = "Class")
head(titanic_joined)

# summarise: total Freq for every Class / Survived combination.
# `.groups = "drop"` returns an ungrouped result, so later verbs applied to
# titanic_summary are not silently computed per Class.
titanic_summary <- titanic %>%
  group_by(Class, Survived) %>%
  summarise(total = sum(Freq), .groups = "drop")
titanic_summary

# Build a 70% / 30% train / test split of titanic by sampling row indices.
# The seed is fixed BEFORE sampling so the split is reproducible; seeding
# after sample() has no effect on the draw that was already made.
set.seed(123)

n_rows <- nrow(titanic)
# Whole number of training rows (70% of the data, rounded down).
train_size <- floor(0.7 * n_rows)
index <- sample.int(n_rows, size = train_size)

train_data <- titanic[index, ]
test_data <- titanic[-index, ]

# Row counts of the two partitions.
nrow(train_data)
nrow(test_data)