#data1 <- data.frame(Titanic) #Cek apakah ada missing value di setiap variabel?

# Check every variable of the Titanic data for missing values.
# `Titanic` is a contingency table that ships with base R's `datasets`
# package, so no additional package has to be attached to use it.
data1 <- data.frame(Titanic)

# Number of missing values per column.
colSums(is.na(data1))
##    Class      Sex      Age Survived     Freq 
##        0        0        0        0        0

# Per-column overview (level counts for factors, five-number summary for Freq).
summary(data1)
##   Class       Sex        Age     Survived      Freq       
##  1st :8   Male  :16   Child:16   No :16   Min.   :  0.00  
##  2nd :8   Female:16   Adult:16   Yes:16   1st Qu.:  0.75  
##  3rd :8                                   Median : 13.50  
##  Crew:8                                   Mean   : 68.78  
##                                           3rd Qu.: 77.00  
##                                           Max.   :670.00

# TRUE if at least one cell anywhere in the data frame is NA.
any(is.na(data1))
## [1] FALSE

# Gunakan R untuk mendapatkan jawabannya.

#Berapakah jumlah outliers yang ada di kolom Freq pada data Titanic?

# Count the outliers in the Freq column of the Titanic data using the
# 1.5 * IQR rule. `Titanic` comes from base R's `datasets` package, so no
# extra package needs to be attached.
data1 <- data.frame(Titanic)  # convert the Titanic table into a data frame
boxplot(
  data1$Freq,
  main = "Boxplot of Freq",
  ylab = "Frequency",
  col = "lightblue"
)

# First and third quartiles; names = FALSE keeps the derived bounds from
# carrying misleading "25%" / "75%" labels.
quartiles <- quantile(data1$Freq, probs = c(0.25, 0.75), names = FALSE)
Q1 <- quartiles[1]  # first quartile (Q1)
Q3 <- quartiles[2]  # third quartile (Q3)
IQR_value <- Q3 - Q1  # interquartile range (IQR)

# Lower and upper fences beyond which a value counts as an outlier.
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Number of values falling outside the fences.
outliers <- sum(data1$Freq < lower_bound | data1$Freq > upper_bound)
print(paste("Jumlah outliers di kolom Freq:", outliers))
## [1] "Jumlah outliers di kolom Freq: 3"

#Gunakan R untuk mendapatkan jawabannya. #Diberikan syntax #sum(duplicated(data1)) #Cek berapakah baris yang terduplikat?

# Count the duplicated rows of the Titanic data frame.
# duplicated() is a base R function, so no package is attached here.
data1 <- data.frame(Titanic)

# duplicated() flags every row that repeats an earlier row; summing the
# logical vector gives the number of such rows.
jumlah_duplikasi <- sum(duplicated(data1))
print(paste("Jumlah baris yang terduplikasi:", jumlah_duplikasi))
## [1] "Jumlah baris yang terduplikasi: 0"

#Hitunglah rata-rata, median dan standar deviasi secara berturut-turut.

# Mean, median and standard deviation (in that order) of the scores.
library(dplyr)
nilai <- c(
  70, 75, 80, 85, 85, 90, 95, 100, 60, 75,
  77, 85, 90, 98, 68, 92, 85, 66, 75, 80,
  72, 84, 50, 69, 76, 80, 90, 95, 88, 77
)

# tibble() replaces data_frame(), which has been deprecated since
# tibble 1.1.0; dplyr re-exports tibble(), so no extra package is needed.
ringkasan_nilai <- tibble(nilai = nilai) %>%
  summarise(
    Rata_rata = mean(nilai),
    Median = median(nilai),
    Standar_Deviasi = sd(nilai)
  )
print(ringkasan_nilai)
## # A tibble: 1 × 3
##   Rata_rata Median Standar_Deviasi
##       <dbl>  <dbl>           <dbl>
## 1      80.4     80            11.5

#Apabila ingin membagi dataset menjadi data training dan testing dengan persentase 80% dan 20%, berapakah dimensi data testing dan data training?

# Load the BreastCancer data set from mlbench and inspect it.
library(mlbench)
data(BreastCancer)

# First six rows.
head(BreastCancer, n = 6L)
##        Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025            5         1          1             1            2
## 2 1002945            5         4          4             5            7
## 3 1015425            3         1          1             1            2
## 4 1016277            6         8          8             1            3
## 5 1017023            4         1          1             3            2
## 6 1017122            8        10         10             8            7
##   Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses     Class
## 1           1           3               1       1    benign
## 2          10           3               2       1    benign
## 3           2           3               1       1    benign
## 4           4           3               7       1    benign
## 5           1           3               1       1    benign
## 6          10           9               7       1 malignant

# Column types and a preview of each column.
str(object = BreastCancer)
## 'data.frame':    699 obs. of  11 variables:
##  $ Id             : chr  "1000025" "1002945" "1015425" "1016277" ...
##  $ Cl.thickness   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...

# Levels of the outcome column.
levels(BreastCancer[["Class"]])
## [1] "benign"    "malignant"

# Per-column summary; the NA's entry under Bare.nuclei shows 16 missing values.
summary(object = BreastCancer)
##       Id             Cl.thickness   Cell.size     Cell.shape  Marg.adhesion
##  Length:699         1      :145   1      :384   1      :353   1      :407  
##  Class :character   5      :130   10     : 67   2      : 59   2      : 58  
##  Mode  :character   3      :108   3      : 52   10     : 58   3      : 58  
##                     4      : 80   2      : 45   3      : 56   10     : 55  
##                     10     : 69   4      : 40   4      : 44   4      : 33  
##                     2      : 50   5      : 30   5      : 34   8      : 25  
##                     (Other):117   (Other): 81   (Other): 95   (Other): 63  
##   Epith.c.size  Bare.nuclei   Bl.cromatin  Normal.nucleoli    Mitoses   
##  2      :386   1      :402   2      :166   1      :443     1      :579  
##  3      : 72   10     :132   3      :165   10     : 61     2      : 35  
##  4      : 48   2      : 30   1      :152   3      : 44     3      : 33  
##  1      : 47   5      : 30   7      : 73   2      : 36     10     : 14  
##  6      : 41   3      : 28   4      : 40   8      : 24     4      : 12  
##  5      : 39   (Other): 61   5      : 34   6      : 22     7      :  9  
##  (Other): 66   NA's   : 16   (Other): 69   (Other): 69     (Other): 17  
##        Class    
##  benign   :458  
##  malignant:241  
##                 
##                 
##                 
##                 
## 
library(mice)  # imputation of missing values
## Warning: package 'mice' was built under R version 4.4.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(caret)  # model training and plotting
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.3

# Impute the missing values in columns 2-10 (the Id column is left out).
# `printFlag` is spelled out instead of relying on partial matching of
# `print`, and a seed makes the imputation reproducible.
dataset_impute <- mice(BreastCancer[, 2:10], seed = 110, printFlag = FALSE)

# Rebuild the data set as Class followed by the first completed data set.
BreastCancer <- cbind(
  BreastCancer["Class"],
  mice::complete(dataset_impute, 1)
)

# NOTE: the summary below was recorded from an earlier, unseeded run; the
# imputed Bare.nuclei counts may differ slightly from a seeded run.
summary(BreastCancer)
##        Class      Cl.thickness   Cell.size     Cell.shape  Marg.adhesion
##  benign   :458   1      :145   1      :384   1      :353   1      :407  
##  malignant:241   5      :130   10     : 67   2      : 59   2      : 58  
##                  3      :108   3      : 52   10     : 58   3      : 58  
##                  4      : 80   2      : 45   3      : 56   10     : 55  
##                  10     : 69   4      : 40   4      : 44   4      : 33  
##                  2      : 50   5      : 30   5      : 34   8      : 25  
##                  (Other):117   (Other): 81   (Other): 95   (Other): 63  
##   Epith.c.size  Bare.nuclei   Bl.cromatin  Normal.nucleoli    Mitoses   
##  2      :386   1      :411   2      :166   1      :443     1      :579  
##  3      : 72   10     :133   3      :165   10     : 61     2      : 35  
##  4      : 48   2      : 32   1      :152   3      : 44     3      : 33  
##  1      : 47   5      : 31   7      : 73   2      : 36     10     : 14  
##  6      : 41   3      : 29   4      : 40   8      : 24     4      : 12  
##  5      : 39   8      : 21   5      : 34   6      : 22     7      :  9  
##  (Other): 66   (Other): 42   (Other): 69   (Other): 69     (Other): 17
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3

# The raw data set is deliberately NOT reloaded with data("BreastCancer")
# here: doing so would overwrite the imputed data frame built above and the
# split would silently run on the un-imputed data.

# Split the data set into 80% training and 20% testing rows, splitting on
# the Class column.
set.seed(110)
split <- sample.split(BreastCancer$Class, SplitRatio = 0.8)
train_set <- subset(BreastCancer, split == TRUE)
test_set <- subset(BreastCancer, split == FALSE)

# Row counts of the training and testing sets.
dim_train <- nrow(train_set)
dim_test <- nrow(test_set)

print(paste("Dimensi Data Training:", dim_train))
## [1] "Dimensi Data Training: 559"
print(paste("Dimensi Data Testing:", dim_test))
## [1] "Dimensi Data Testing: 140"