# Given: data1 <- data.frame(Titanic). Check whether any variable contains missing values.
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.4.3
# Turn the Titanic contingency table into a data frame, then look for missing
# values: first counted per column, then summarised, then as a single flag.
data1 <- data.frame(Titanic)
# Number of NA entries in each column.
vapply(data1, function(column) sum(is.na(column)), numeric(1))
## Class Sex Age Survived Freq
## 0 0 0 0 0
# Per-column summary (level counts for factors, five-number summary for Freq).
summary(data1)
## Class Sex Age Survived Freq
## 1st :8 Male :16 Child:16 No :16 Min. : 0.00
## 2nd :8 Female:16 Adult:16 Yes:16 1st Qu.: 0.75
## 3rd :8 Median : 13.50
## Crew:8 Mean : 68.78
## 3rd Qu.: 77.00
## Max. :670.00
# TRUE if at least one cell anywhere in the data frame is NA.
anyNA(data1)
## [1] FALSE
# How many outliers are there in the Freq column of the Titanic data?
library(mlbench)
# Convert the Titanic dataset into a data frame.
data1 <- data.frame(Titanic)
freq_values <- data1[["Freq"]]
# Visual check of the Freq distribution.
boxplot(
  freq_values,
  main = "Boxplot of Freq",
  ylab = "Frequency",
  col = "lightblue"
)
# First quartile, third quartile, and the interquartile range between them.
Q1 <- quantile(freq_values, probs = 0.25)
Q3 <- quantile(freq_values, probs = 0.75)
IQR_value <- Q3 - Q1
# Outlier fences: 1.5 * IQR below Q1 and above Q3.
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# A value is an outlier when it does not lie within [lower_bound, upper_bound].
is_outlier <- !(freq_values >= lower_bound & freq_values <= upper_bound)
outliers <- sum(is_outlier)
print(paste("Jumlah outliers di kolom Freq:", outliers))
## [1] "Jumlah outliers di kolom Freq: 3"
# Use R to find the answer. Given the syntax sum(duplicated(data1)), how many rows are duplicated?
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Rebuild the data frame, then count the rows that repeat an earlier row:
# total rows minus the number of distinct rows.
data1 <- data.frame(Titanic)
jumlah_duplikasi <- nrow(data1) - nrow(unique(data1))
print(paste("Jumlah baris yang terduplikasi:", jumlah_duplikasi))
## [1] "Jumlah baris yang terduplikasi: 0"
# Compute the mean, median, and standard deviation of the scores, in that order.
library(dplyr)
nilai <- c(
  70, 75, 80, 85, 85, 90, 95, 100, 60, 75,
  77, 85, 90, 98, 68, 92, 85, 66, 75, 80,
  72, 84, 50, 69, 76, 80, 90, 95, 88, 77
)
# tibble() is the supported constructor; data_frame() has been deprecated
# since tibble 1.1.0 and emits a lifecycle warning.
tibble(nilai = nilai) %>%
  summarise(
    Rata_rata = mean(nilai),
    Median = median(nilai),
    Standar_Deviasi = sd(nilai)
  )
## # A tibble: 1 × 3
## Rata_rata Median Standar_Deviasi
## <dbl> <dbl> <dbl>
## 1 80.4 80 11.5
# Task: split the dataset into training and testing sets with an 80% / 20%
# ratio and report the dimensions of each. This part loads and inspects the data.
library(mlbench)
data("BreastCancer", package = "mlbench")
# First six rows.
head(BreastCancer, n = 6L)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 benign
## 2 10 3 2 1 benign
## 3 2 3 1 1 benign
## 4 4 3 7 1 benign
## 5 1 3 1 1 benign
## 6 10 9 7 1 malignant
# Column types and a preview of each column.
str(BreastCancer)
## 'data.frame': 699 obs. of 11 variables:
## $ Id : chr "1000025" "1002945" "1015425" "1016277" ...
## $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Labels of the target variable.
levels(BreastCancer[["Class"]])
## [1] "benign" "malignant"
# Per-column summary; the NA's row under Bare.nuclei shows the missing values.
summary(BreastCancer)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion
## Length:699 1 :145 1 :384 1 :353 1 :407
## Class :character 5 :130 10 : 67 2 : 59 2 : 58
## Mode :character 3 :108 3 : 52 10 : 58 3 : 58
## 4 : 80 2 : 45 3 : 56 10 : 55
## 10 : 69 4 : 40 4 : 44 4 : 33
## 2 : 50 5 : 30 5 : 34 8 : 25
## (Other):117 (Other): 81 (Other): 95 (Other): 63
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses
## 2 :386 1 :402 2 :166 1 :443 1 :579
## 3 : 72 10 :132 3 :165 10 : 61 2 : 35
## 4 : 48 2 : 30 1 :152 3 : 44 3 : 33
## 1 : 47 5 : 30 7 : 73 2 : 36 10 : 14
## 6 : 41 3 : 28 4 : 40 8 : 24 4 : 12
## 5 : 39 (Other): 61 5 : 34 6 : 22 7 : 9
## (Other): 66 NA's : 16 (Other): 69 (Other): 69 (Other): 17
## Class
## benign :458
## malignant:241
##
##
##
##
##
library(mice) # library for handling missing values
## Warning: package 'mice' was built under R version 4.4.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(caret) # library for model training and plotting
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.3
# Impute the missing values in the predictor columns (2-10; Id is left out).
# `printFlag` is spelled out in full instead of relying on partial matching
# of `print`.
# NOTE(review): mice() is not seeded here, so the imputed values can differ
# between runs; pass `seed =` if the imputation must be reproducible.
dataset_impute <- mice(BreastCancer[, 2:10], printFlag = FALSE)
# Keep Class (column 11) as a one-column data frame and attach the first
# completed (imputed) dataset to it.
BreastCancer <- cbind(
  BreastCancer[, 11, drop = FALSE],
  mice::complete(dataset_impute, 1)
)
summary(BreastCancer)
## Class Cl.thickness Cell.size Cell.shape Marg.adhesion
## benign :458 1 :145 1 :384 1 :353 1 :407
## malignant:241 5 :130 10 : 67 2 : 59 2 : 58
## 3 :108 3 : 52 10 : 58 3 : 58
## 4 : 80 2 : 45 3 : 56 10 : 55
## 10 : 69 4 : 40 4 : 44 4 : 33
## 2 : 50 5 : 30 5 : 34 8 : 25
## (Other):117 (Other): 81 (Other): 95 (Other): 63
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses
## 2 :386 1 :411 2 :166 1 :443 1 :579
## 3 : 72 10 :133 3 :165 10 : 61 2 : 35
## 4 : 48 2 : 32 1 :152 3 : 44 3 : 33
## 1 : 47 5 : 31 7 : 73 2 : 36 10 : 14
## 6 : 41 3 : 29 4 : 40 8 : 24 4 : 12
## 5 : 39 8 : 21 5 : 34 6 : 22 7 : 9
## (Other): 66 (Other): 42 (Other): 69 (Other): 69 (Other): 17
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3
# Split the imputed dataset into 80% training and 20% testing.
# The raw dataset is intentionally not reloaded with data() at this point:
# reloading would throw away the imputation above, so the train/test sets
# would again contain the missing Bare.nuclei values and the Id column.
set.seed(110)
split <- sample.split(BreastCancer$Class, SplitRatio = 0.8)
train_set <- BreastCancer[split, , drop = FALSE]
test_set <- BreastCancer[!split, , drop = FALSE]
# Report the number of rows in the training and testing sets.
dim_train <- nrow(train_set)
dim_test <- nrow(test_set)
print(paste("Dimensi Data Training:", dim_train))
## [1] "Dimensi Data Training: 559"
print(paste("Dimensi Data Testing:", dim_test))
## [1] "Dimensi Data Testing: 140"