# Mencari data yang hilang (missing values)

# Flatten the built-in Titanic contingency table into a data frame with one
# row per Class/Sex/Age/Survived combination and its count in Freq.
data1 <- as.data.frame(Titanic)

# Structure overview: column types and the first few values.
str(data1)
## 'data.frame':    32 obs. of  5 variables:
##  $ Class   : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Sex     : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
##  $ Age     : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
##  $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : num  0 0 35 0 0 0 17 0 118 154 ...

# Per-column summary: level counts for factors, quantiles for Freq.
summary(data1)
##   Class       Sex        Age     Survived      Freq
##  1st :8   Male  :16   Child:16   No :16   Min.   :  0.00
##  2nd :8   Female:16   Adult:16   Yes:16   1st Qu.:  0.75
##  3rd :8                                   Median : 13.50
##  Crew:8                                   Mean   : 68.78
##  3rd Qu.: 77.00 / Max.: 670.00 (Freq column, continued)
##                                           3rd Qu.: 77.00
##                                           Max.   :670.00

# Build the missing-value mask once and reuse it for both checks below.
na_mask <- is.na(data1)

# Number of NA cells in each column.
colSums(na_mask)
##    Class      Sex      Age Survived     Freq
##        0        0        0        0        0

# TRUE if there is an NA anywhere in the data, FALSE otherwise.
any(na_mask)
## [1] FALSE

# Soal no 7: Berapakah jumlah outlier yang ada di kolom Freq pada data Titanic?

# Rebuild the Titanic data frame so this section runs on its own.
data1 <- as.data.frame(Titanic)
str(data1)
## 'data.frame':    32 obs. of  5 variables:
##  $ Class   : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Sex     : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
##  $ Age     : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
##  $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : num  0 0 35 0 0 0 17 0 118 154 ...

# Work on the count column only.
freq <- data1$Freq

# Five-number summary plus mean of the counts.
summary(freq)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    0.00    0.75   13.50   68.78   77.00  670.00

# Visual check: points drawn beyond the whiskers are the outliers.
boxplot(
  freq,
  main = "Boxplot of Freq",
  ylab = "Frequency",
  col = "lightblue"
)

# boxplot.stats() returns the values lying beyond the whiskers in `$out`.
outliers <- boxplot.stats(freq)$out
n_outliers <- length(outliers)

print("Nilai-nilai outlier:")
## [1] "Nilai-nilai outlier:"
print(outliers)
## [1] 387 670
cat("Jumlah outlier:", n_outliers, "\n")
## Jumlah outlier: 2

# Soal no 8: Jumlah baris duplikat

# Rebuild the Titanic data frame so this section runs on its own.
data1 <- as.data.frame(Titanic)
str(data1)
## 'data.frame':    32 obs. of  5 variables:
##  $ Class   : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Sex     : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
##  $ Age     : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
##  $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : num  0 0 35 0 0 0 17 0 118 154 ...

# Ask for up to 100 rows; the data has only 32, so every row is shown.
head(data1, n = 100)
##    Class    Sex   Age Survived Freq
## 1    1st   Male Child       No    0
## 2    2nd   Male Child       No    0
## 3    3rd   Male Child       No   35
## 4   Crew   Male Child       No    0
## 5    1st Female Child       No    0
## 6    2nd Female Child       No    0
## 7    3rd Female Child       No   17
## 8   Crew Female Child       No    0
## 9    1st   Male Adult       No  118
## 10   2nd   Male Adult       No  154
## 11   3rd   Male Adult       No  387
## 12  Crew   Male Adult       No  670
## 13   1st Female Adult       No    4
## 14   2nd Female Adult       No   13
## 15   3rd Female Adult       No   89
## 16  Crew Female Adult       No    3
## 17   1st   Male Child      Yes    5
## 18   2nd   Male Child      Yes   11
## 19   3rd   Male Child      Yes   13
## 20  Crew   Male Child      Yes    0
## 21   1st Female Child      Yes    1
## 22   2nd Female Child      Yes   13
## 23   3rd Female Child      Yes   14
## 24  Crew Female Child      Yes    0
## 25   1st   Male Adult      Yes   57
## 26   2nd   Male Adult      Yes   14
## 27   3rd   Male Adult      Yes   75
## 28  Crew   Male Adult      Yes  192
## 29   1st Female Adult      Yes  140
## 30   2nd Female Adult      Yes   80
## 31   3rd Female Adult      Yes   76
## 32  Crew Female Adult      Yes   20

# duplicated() flags each row that repeats an earlier row across all columns;
# compute the flags once and use them for both the count and the listing.
is_dup <- duplicated(data1)

jumlah_duplikat <- sum(is_dup)
cat("Jumlah baris duplikat adalah:", jumlah_duplikat, "\n")
## Jumlah baris duplikat adalah: 0

# Keep only the flagged rows (an empty data frame when nothing is duplicated).
data_duplikat <- data1[is_dup, ]
print("Baris-baris yang terduplikat:")
## [1] "Baris-baris yang terduplikat:"
print(data_duplikat)
## [1] Class    Sex      Age      Survived Freq
## <0 rows> (or 0-length row.names)

# Soal no 9: Mencari nilai mean, median, dan simpangan baku (standar deviasi)

# The 30 scores to be summarised.
nilai <- c(
  70, 75, 80, 85, 85, 90, 95, 100, 60, 75,
  77, 85, 90, 98, 68, 92, 85, 66, 75, 80,
  72, 84, 50, 69, 76, 80, 90, 95, 88, 77
)

# Arithmetic mean.
rata_rata <- mean(nilai)
cat("Rata-rata:", rata_rata, "\n")
## Rata-rata: 80.4

# Median (middle value of the sorted scores).
nilai_median <- median(nilai)
cat("Median:", nilai_median, "\n")
## Median: 80

# Sample standard deviation (sd() uses the n - 1 denominator).
standar_deviasi <- sd(nilai)
cat("Standar Deviasi:", standar_deviasi, "\n")
## Standar Deviasi: 11.48792

# Soal no 10: Apabila ingin membagi dataset menjadi data training dan testing
# dengan persentase 80% dan 20%, berapakah dimensi data training dan data testing?

# Attach the packages up front: BreastCancer is loaded with data() after
# attaching mlbench, and sample.split() is called after attaching caTools.
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.4.3
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3
data("BreastCancer")

# Fix the RNG seed so the random split below is repeatable.
set.seed(110)

# Drop every row that contains a missing value before splitting.
data_clean <- na.omit(BreastCancer)
dim(data_clean)  # recorded run: 683 rows, 11 columns
## [1] 683  11

# Logical row mask built from the Class column with SplitRatio = 0.8:
# TRUE rows form the training set, FALSE rows the testing set.
split <- sample.split(data_clean$Class, SplitRatio = 0.8)
data_train <- data_clean[split, ]
data_test <- data_clean[!split, ]

# Row counts of the two partitions.
nrow(data_train)
## [1] 546
nrow(data_test)
## [1] 137