library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# Titanic data: structure, missing values, outliers, duplicates ----
# Flatten the built-in `Titanic` contingency table into a data frame with
# one row per cell and the cell count in `Freq`.
data1 <- data.frame(Titanic)
str(data1)
## 'data.frame': 32 obs. of 5 variables:
## $ Class : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
## $ Age : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
## $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq : num 0 0 35 0 0 0 17 0 118 154 ...

# Number of missing values in each column.
colSums(is.na(data1))
## Class Sex Age Survived Freq
## 0 0 0 0 0

# Outlier fences for `Freq`: 1.5 * IQR below the first quartile and above
# the third quartile.
Q1 <- quantile(data1[["Freq"]], probs = 0.25)
Q3 <- quantile(data1[["Freq"]], probs = 0.75)
IQR_value <- IQR(data1[["Freq"]])
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Keep every count that does not lie inside [lower_bound, upper_bound].
outliers <- with(data1, Freq[!(Freq >= lower_bound & Freq <= upper_bound)])
length(outliers)
## [1] 3

# Rows that exactly repeat an earlier row.
subset(data1, duplicated(data1))
## [1] Class Sex Age Survived Freq
## <0 rows> (or 0-length row.names)
# Math exam analysis problem ----
# The 30 exam scores to summarise.
nilai <- c(
  70, 75, 80, 85, 85, 90, 95, 100, 60, 75,
  77, 85, 90, 98, 68, 92, 85, 66, 75, 80,
  72, 84, 50, 69, 76, 80, 90, 95, 88, 77
)

# Arithmetic mean of the scores.
mean(x = nilai)
## [1] 80.4

# Middle value of the sorted scores.
median(x = nilai)
## [1] 80

# Sample standard deviation (n - 1 denominator).
sd(x = nilai)
## [1] 11.48792
# Breast cancer data: train/test split ----
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.4.3
data(BreastCancer)
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3

# Fix the RNG state so the random split below is reproducible.
set.seed(110)

# Drop every row that contains a missing value before splitting.
bc <- na.omit(BreastCancer)

# Logical mask over the rows of `bc`, built from the `Class` column with
# SplitRatio = 0.8: TRUE rows go to training, FALSE rows to testing.
split <- sample.split(bc[["Class"]], SplitRatio = 0.8)
train_data <- bc[split, ]
test_data <- bc[!split, ]

dim(train_data)
## [1] 546 11
dim(test_data)
## [1] 137 11
# Play-golf data: naive Bayes prediction ----
# Every column is stored as a factor; the logical `Windy` values therefore
# become a factor with levels "FALSE" and "TRUE".
data_golf <- data.frame(
  Outlook = factor(c(
    "Rainy", "Rainy", "Overcast", "Sunny", "Sunny", "Sunny", "Overcast",
    "Rainy", "Rainy", "Sunny", "Rainy", "Overcast", "Overcast", "Sunny"
  )),
  Temperature = factor(c(
    "Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool",
    "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild"
  )),
  Humidity = factor(c(
    "High", "High", "High", "High", "Normal", "Normal", "Normal",
    "High", "Normal", "Normal", "Normal", "High", "Normal", "High"
  )),
  Windy = factor(c(
    FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE,
    FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE
  )),
  PlayGolf = factor(c(
    "No", "No", "Yes", "Yes", "Yes", "No", "Yes",
    "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No"
  ))
)

library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3

# Fit a naive Bayes classifier for PlayGolf using all other columns.
model <- naiveBayes(PlayGolf ~ ., data = data_golf)

# New observation: Outlook = Overcast, Temp = Mild, Humidity = Normal,
# Windy = FALSE. Each factor reuses the level set of the matching training
# column.
data_baru <- data.frame(
  Outlook = factor("Overcast", levels = levels(data_golf[["Outlook"]])),
  Temperature = factor("Mild", levels = levels(data_golf[["Temperature"]])),
  Humidity = factor("Normal", levels = levels(data_golf[["Humidity"]])),
  Windy = factor(FALSE, levels = levels(data_golf[["Windy"]]))
)

# Predicted class probabilities for the new observation.
predict(model, newdata = data_baru, type = "raw")
## No Yes
## [1,] 0.000202459 0.9997975
# New customer data ----
# 200 rows built from run lengths: each `times` entry says how many
# consecutive rows carry the value at the same position. A count of 0
# contributes no rows.
data <- data.frame(
  Income = rep(c(">=5jt", "<5jt"), times = c(80, 120)),
  Gender = rep(
    c("Perempuan", "Laki-laki", "Perempuan", "Laki-laki"),
    times = c(65, 15, 55, 65)
  ),
  Married = rep(
    rep(c("Ya", "Tidak"), 4),
    times = c(53, 12, 0, 15, 30, 25, 10, 55)
  ),
  Buy = rep(
    rep(c("Yes", "No"), 8),
    times = c(
      51, 2, 1, 11,
      0, 0, 5, 10,
      20, 10, 10, 15,
      2, 8, 8, 47
    )
  )
)

# Rows with Income ">=5jt", Gender "Perempuan" and Married "Ya".
target_node <- data[
  data$Income == ">=5jt" & data$Gender == "Perempuan" & data$Married == "Ya",
]

# Share of rows in that node whose Buy value is "Yes".
prob <- with(target_node, sum(Buy == "Yes") / length(Buy))
prob
## [1] 0.9622642