# Look for missing values in the Titanic data.
# Titanic ships as a contingency table; flattening it gives one row per
# Class / Sex / Age / Survived combination with its count in Freq
# (recorded run: 32 rows, 5 columns).
data1 <- as.data.frame(Titanic)

# Structure and per-column summary. Recorded run: four factor columns plus a
# numeric Freq column with min 0, median 13.5, mean 68.78 and max 670.
str(data1)
summary(data1)

# Number of NA cells in each column. Recorded run: 0 for every column.
colSums(is.na(data1))

# TRUE if there is an NA anywhere in the data frame, FALSE otherwise.
# Recorded run: FALSE.
anyNA(data1)
# Question 7: How many outliers are there in the Freq column of the Titanic
# data?
# The data frame is rebuilt here so this section can be run on its own.
data1 <- as.data.frame(Titanic)
str(data1)

# Six-number summary of Freq.
# Recorded run: min 0, Q1 0.75, median 13.5, mean 68.78, Q3 77, max 670.
summary(data1$Freq)

# Visual check: outliers are the points drawn individually past the whiskers.
boxplot(
  data1$Freq,
  main = "Boxplot of Freq",
  ylab = "Frequency",
  col = "lightblue"
)

# boxplot.stats() returns, in $out, the values lying beyond the whisker
# extremes, so the count below agrees with what the plot shows.
outliers <- boxplot.stats(data1$Freq)$out
print("Nilai-nilai outlier:")
print(outliers) # recorded run: 387 670
cat("Jumlah outlier:", length(outliers), "\n") # recorded run: 2
# Question 8: How many duplicated rows does the data contain?
# The data frame is rebuilt here so this section can be run on its own.
data1 <- as.data.frame(Titanic)
str(data1)

# n = 100 exceeds the 32 rows in the recorded run, so every row is printed.
head(data1, 100)

# duplicated() flags each row that repeats an earlier row. Extract those rows
# once and derive the count from them instead of scanning the data twice.
data_duplikat <- data1[duplicated(data1), ]
jumlah_duplikat <- nrow(data_duplikat)

cat("Jumlah baris duplikat adalah:", jumlah_duplikat, "\n") # recorded run: 0
print("Baris-baris yang terduplikat:")
print(data_duplikat) # recorded run: a 0-row data frame with the same columns
# Question 9: Find the mean, median and standard deviation of the scores.
nilai <- c(
  70, 75, 80, 85, 85, 90, 95, 100, 60, 75,
  77, 85, 90, 98, 68, 92, 85, 66, 75, 80,
  72, 84, 50, 69, 76, 80, 90, 95, 88, 77
)

# Compute the three statistics first, then report them.
rata_rata <- mean(nilai)
nilai_median <- median(nilai)
standar_deviasi <- sd(nilai)

cat("Rata-rata:", rata_rata, "\n") # recorded run: 80.4
cat("Median:", nilai_median, "\n") # recorded run: 80
cat("Standar Deviasi:", standar_deviasi, "\n") # recorded run: 11.48792
# Question 10: If the dataset is split into training and testing data at 80%
# and 20%, what are the dimensions of the testing and the training data?
library(mlbench)
data("BreastCancer")
library(caTools)

# Fix the RNG seed so the random split below is reproducible.
set.seed(110)

# Drop every row that contains a missing value before splitting.
data_clean <- na.omit(BreastCancer)
dim(data_clean) # recorded run: 683 rows, 11 columns

# Logical mask over the rows of data_clean: TRUE rows go to the training set
# (SplitRatio = 0.8), FALSE rows go to the testing set.
# NOTE(review): the name `split` is the same as base::split(); it is kept
# because code outside this section may refer to it.
split <- sample.split(data_clean$Class, SplitRatio = 0.8)

# Index rows with the mask directly; all columns are kept in both parts.
data_train <- data_clean[split, ]
data_test <- data_clean[!split, ]

# Row counts of each part. Since only rows are filtered, the recorded run
# gives dimensions 546 x 11 (training) and 137 x 11 (testing).
nrow(data_train) # recorded run: 546
nrow(data_test) # recorded run: 137