# Load the raw chocolate sales export and preview its first six rows.
# NOTE(review): this absolute Windows path only resolves on the author's
# machine; consider a project-relative path.
data_chocolate <- read.csv(
  file = "C:/Users/FAQIH/Downloads/Chocolate Sales.csv",
  header = TRUE
)
head(data_chocolate, n = 6L)
## Sales.Person Country Product Date Amount Boxes.Shipped
## 1 Jehu Rudeforth UK Mint Chip Choco 04-Jan-22 $5,320 180
## 2 Van Tuxwell India 85% Dark Bars 01-Aug-22 $7,896 94
## 3 Gigi Bohling India Peanut Butter Cubes 07-Jul-22 $4,501 91
## 4 Jan Morforth Australia Peanut Butter Cubes 27-Apr-22 $12,726 342
## 5 Jehu Rudeforth UK Peanut Butter Cubes 24-Feb-22 $13,685 184
## 6 Van Tuxwell India Smooth Sliky Salty 06-Jun-22 $5,376 38
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Drop every row that contains a missing value, then keep only the two
# columns analysed in the rest of the script.
chocolate_clean <- na.omit(data_chocolate)
dt_coklat <- select(chocolate_clean, Amount, Boxes.Shipped)
head(dt_coklat, n = 6L)
## Amount Boxes.Shipped
## 1 $5,320 180
## 2 $7,896 94
## 3 $4,501 91
## 4 $12,726 342
## 5 $13,685 184
## 6 $5,376 38
# Amount arrives as text such as "$5,320": coerce to character, strip the
# dollar signs and commas, then convert the remainder to numeric.
dt_coklat[["Amount"]] <- as.numeric(
  gsub(
    pattern = "[$,]",
    replacement = "",
    x = as.character(dt_coklat[["Amount"]])
  )
)
head(dt_coklat, n = 6L)
## Amount Boxes.Shipped
## 1 5320 180
## 2 7896 94
## 3 4501 91
## 4 12726 342
## 5 13685 184
## 6 5376 38
# Per-column minimum, quartiles, median, mean and maximum.
summary(object = dt_coklat)
## Amount Boxes.Shipped
## Min. : 7 Min. : 1.0
## 1st Qu.: 2390 1st Qu.: 70.0
## Median : 4868 Median :135.0
## Mean : 5652 Mean :161.8
## 3rd Qu.: 8027 3rd Qu.:228.8
## Max. :22050 Max. :709.0
# Scatter plot with Amount on the x axis and Boxes.Shipped on the y axis.
plot(
  x = dt_coklat$Amount,
  y = dt_coklat$Boxes.Shipped,
  main = "Scatter amount of revenue dan boxes shipped coklat",
  xlab = "amount of revenue",
  ylab = "box shipped"
)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
# Correlation matrix of the two columns in dt_coklat (Amount, Boxes.Shipped);
# cor() is called with its default method.
korelasi <- cor(dt_coklat)
korelasi
## Amount Boxes.Shipped
## Amount 1.00000000 -0.01882685
## Boxes.Shipped -0.01882685 1.00000000
Korelasi antara kedua peubah sebesar -0.0188 menunjukkan arah hubungan negatif, tetapi hubungan linear antara keduanya sangat lemah (hampir tidak ada). Hal ini terlihat juga pada scatterplot, di mana titik-titik data tidak membentuk pola linear.
# Histograms of each variable; breaks = 25 is the requested number of cells.
hist(dt_coklat$Amount, breaks = 25)
hist(dt_coklat$Boxes.Shipped, breaks = 25)
# Eksplorasi Data dengan Boxplot
# Manual quartiles of Amount: each quartile sits at position p * (n + 1) in
# the sorted data and is linearly interpolated between the two neighbouring
# order statistics (the same rule as quantile(x, type = 6)).
data_amount_sorted <- sort(dt_coklat$Amount)
n <- length(data_amount_sorted)
pos_K1 <- 0.25 * (n + 1)
pos_K1
## [1] 273.75
pos_K2 <- 0.50 * (n + 1)
pos_K2
## [1] 547.5
pos_K3 <- 0.75 * (n + 1)
pos_K3
## [1] 821.25
# The lower neighbour index is derived from the position itself rather than
# hard-coded, so the quartiles stay correct if the number of rows changes.
idx_K1 <- floor(pos_K1)
K1 <- data_amount_sorted[idx_K1] +
  (pos_K1 - idx_K1) *
    (data_amount_sorted[idx_K1 + 1] - data_amount_sorted[idx_K1])
K1
## [1] 2385.25
idx_K2 <- floor(pos_K2)
K2 <- data_amount_sorted[idx_K2] +
  (pos_K2 - idx_K2) *
    (data_amount_sorted[idx_K2 + 1] - data_amount_sorted[idx_K2])
K2
## [1] 4868.5
idx_K3 <- floor(pos_K3)
K3 <- data_amount_sorted[idx_K3] +
  (pos_K3 - idx_K3) *
    (data_amount_sorted[idx_K3 + 1] - data_amount_sorted[idx_K3])
K3
## [1] 8032.5
# JAK: interquartile range (K3 - K1).
JAK <- K3 - K1
JAK
## [1] 5647.25
# PDB / PDA: lower / upper fences 1.5 * JAK beyond the quartiles.
# PLB / PLA: lower / upper fences 3 * JAK beyond the quartiles.
PDB <- K1 - 1.5 * JAK
PDA <- K3 + 1.5 * JAK
PLB <- K1 - 3 * JAK
PLA <- K3 + 3 * JAK
# Display the results
PDB
## [1] -6085.625
PDA
## [1] 16503.38
PLB
## [1] -14556.5
PLA
## [1] 24974.25
# boxplot() receives only the raw column; it does not use the K1/K3 or fence
# values computed above.
boxplot(dt_coklat$Amount, col="skyblue",
main="Boxplot Data amount of revenue",
xlab="amount ($)")
### Menyusun struktur boxplot
# Manual quartiles of Boxes.Shipped: each quartile sits at position
# p * (n + 1) in the sorted data and is linearly interpolated between the two
# neighbouring order statistics (the same rule as quantile(x, type = 6)).
data_box_sorted <- sort(dt_coklat$Boxes.Shipped)
n <- length(data_box_sorted)
pos_K1 <- 0.25 * (n + 1)
pos_K1
## [1] 273.75
pos_K2 <- 0.50 * (n + 1)
pos_K2
## [1] 547.5
pos_K3 <- 0.75 * (n + 1)
pos_K3
## [1] 821.25
# The lower neighbour index is derived from the position itself rather than
# hard-coded, so the quartiles stay correct if the number of rows changes.
idx_K1 <- floor(pos_K1)
K1 <- data_box_sorted[idx_K1] +
  (pos_K1 - idx_K1) *
    (data_box_sorted[idx_K1 + 1] - data_box_sorted[idx_K1])
K1
## [1] 70
idx_K2 <- floor(pos_K2)
K2 <- data_box_sorted[idx_K2] +
  (pos_K2 - idx_K2) *
    (data_box_sorted[idx_K2 + 1] - data_box_sorted[idx_K2])
K2
## [1] 135
idx_K3 <- floor(pos_K3)
K3 <- data_box_sorted[idx_K3] +
  (pos_K3 - idx_K3) *
    (data_box_sorted[idx_K3 + 1] - data_box_sorted[idx_K3])
K3
## [1] 229
# JAK: interquartile range (K3 - K1).
JAK <- K3 - K1
JAK
## [1] 159
# PDB / PDA: lower / upper fences 1.5 * JAK beyond the quartiles.
# PLB / PLA: lower / upper fences 3 * JAK beyond the quartiles.
PDB <- K1 - 1.5 * JAK
PDA <- K3 + 1.5 * JAK
PLB <- K1 - 3 * JAK
PLA <- K3 + 3 * JAK
# Display the results
PDB
## [1] -168.5
PDA
## [1] 467.5
PLB
## [1] -407
PLA
## [1] 706
# boxplot() receives only the raw column; it does not use the K1/K3 or fence
# values computed above.
boxplot(dt_coklat$Boxes.Shipped, col="skyblue",
main="Boxplot Data Box Shipped",
xlab="Box Shipped")
# Pemeriksaan Sebaran Data
# Normal Q-Q plot with a reference line, first for Amount and then for
# Boxes.Shipped.
qqnorm(y = dt_coklat$Amount, col = "blue")
qqline(y = dt_coklat$Amount)
qqnorm(y = dt_coklat$Boxes.Shipped, col = "blue")
qqline(y = dt_coklat$Boxes.Shipped)
### Uji Formal (Kolmogorov-Smirnov)
Hipotesis:
H0 : Data Amount mengikuti distribusi normal dengan rata-rata 5652 dan deviasi standar 4102.442.
H1 : Data Amount tidak mengikuti distribusi normal dengan rata-rata 5652 dan deviasi standar 4102.442.
# One-sample Kolmogorov-Smirnov test of Amount against a normal distribution;
# 5652 and 4102.442 are forwarded to pnorm as its mean and sd.
# NOTE(review): these constants match the sample mean and sd printed elsewhere
# in this document, so the parameters appear to be estimated from the same
# data; the plain K-S p-value assumes pre-specified parameters -- confirm.
ks.test(dt_coklat$Amount, "pnorm", 5652, 4102.442)
## Warning in ks.test.default(dt_coklat$Amount, "pnorm", 5652, 4102.442): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: dt_coklat$Amount
## D = 0.084409, p-value = 3.394e-07
## alternative hypothesis: two-sided
Kesimpulan: Karena p-value (3.394e-07) kurang dari 0.05, maka H0 ditolak. Artinya, data amount of revenue pada penjualan coklat tidak mengikuti distribusi normal dengan parameter mean = 5652 dan sd = 4102.442.
Hipotesis:
H0 : Data Boxes.Shipped mengikuti distribusi normal dengan rata-rata 161.8 dan deviasi standar 121.5441.
H1 : Data Boxes.Shipped tidak mengikuti distribusi normal dengan rata-rata 161.8 dan deviasi standar 121.5441.
# One-sample Kolmogorov-Smirnov test of Boxes.Shipped against a normal
# distribution; 161.8 and 121.5441 are forwarded to pnorm as its mean and sd.
# NOTE(review): these constants match the sample mean and sd printed elsewhere
# in this document, so the parameters appear to be estimated from the same
# data; the plain K-S p-value assumes pre-specified parameters -- confirm.
ks.test(dt_coklat$Boxes.Shipped, "pnorm", 161.8, 121.5441)
## Warning in ks.test.default(dt_coklat$Boxes.Shipped, "pnorm", 161.8, 121.5441):
## ties should not be present for the one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: dt_coklat$Boxes.Shipped
## D = 0.10388, p-value = 1.115e-10
## alternative hypothesis: two-sided
Kesimpulan: Karena p-value (1.115e-10) kurang dari 0.05, maka H0 ditolak. Artinya, data boxes shipped pada penjualan coklat tidak mengikuti distribusi normal dengan parameter mean = 161.8 dan sd = 121.5441.
# Pendeteksian Pencilan
### Menggunakan Interquartile Range (IQR)
# Outlier values of Amount as reported by boxplot.stats(), followed by the
# row positions in dt_coklat whose Amount equals any of those values.
order.out <- boxplot.stats(dt_coklat$Amount)$out
order.out
## [1] 16793 19453 17318 19929 18991 16569 16982 18340 22050 19327 17626 19481
## [13] 18032 16702 17465 18697
pencicilan <- which(is.element(dt_coklat$Amount, order.out))
pencicilan
## [1] 58 67 120 136 213 258 373 435 544 590 610 752 807 892 950
## [16] 1009
boxplot(
  x = dt_coklat$Amount,
  xlab = "amount",
  ylab = "Value",
  main = "Boxplot amount"
)
# Write the outlier values into the plot margin.
mtext(text = paste("Pencilan: ", paste(order.out, collapse = ", ")))
# Outlier values of Boxes.Shipped as reported by boxplot.stats(), followed by
# the row positions in dt_coklat whose Boxes.Shipped equals any of them.
order.out <- boxplot.stats(dt_coklat$Boxes.Shipped)$out
order.out
## [1] 581 482 520 547 477 520 475 554 543 469 479 488 512 508 708 524 597 495 614
## [20] 468 518 518 539 709 591
pencicilan <- which(is.element(dt_coklat$Boxes.Shipped, order.out))
pencicilan
## [1] 40 98 110 122 151 172 189 341 459 464 585 666 673 693 731
## [16] 766 770 824 853 863 866 876 979 1029 1071
boxplot(
  x = dt_coklat$Boxes.Shipped,
  xlab = "boxes shipped",
  ylab = "Value",
  main = "Boxplot boxes shipped"
)
# Write the outlier values into the plot margin.
mtext(text = paste("Pencilan: ", paste(order.out, collapse = ", ")))
Berdasarkan pendeteksian pencilan dengan metode IQR dan bantuan boxplot,
terlihat bahwa baik data ‘amount’ maupun ‘boxes shipped’ memiliki banyak
pencilan atas.
Digunakan rataan terpangkas (trimmed mean) karena jumlah data yang cukup besar dan terdapat banyak pencilan ekstrem. Rataan dihitung dengan memangkas 10% data di kedua ujung distribusi.
# Ordinary mean versus mean with trim = 0.1, first for Amount.
rataan.biasa <- mean(dt_coklat$Amount)
rataan.terpangkas <- mean(dt_coklat$Amount, trim = 0.1) # alpha = 10%
cat("Rataan biasa : ", rataan.biasa, "\n")
## Rataan biasa : 5652.308
cat("Rataan Terpangkas : ", rataan.terpangkas)
## Rataan Terpangkas : 5221.576
# Same comparison for Boxes.Shipped; the two result variables are reused.
rataan.biasa <- mean(dt_coklat$Boxes.Shipped)
rataan.terpangkas <- mean(dt_coklat$Boxes.Shipped, trim = 0.1) # alpha = 10%
cat("Rataan biasa : ", rataan.biasa, "\n")
## Rataan biasa : 161.798
cat("Rataan Terpangkas : ", rataan.terpangkas)
## Rataan Terpangkas : 147.3037
Menghitung ukuran penyebaran (simpangan baku, yaitu akar dari ragam) dengan memangkas 10% data di kedua ujung distribusi.
library(chemometrics)
## Warning: package 'chemometrics' was built under R version 4.4.3
## Loading required package: rpart
# sd() and sd_trim() return standard deviations, not variances, so the labels
# read "Simpangan baku" (standard deviation) instead of "Ragam" (variance).
# The printed values themselves are unchanged.
cat("Simpangan baku tak terpangkas : ", sd(dt_coklat$Amount), "\n")
## Simpangan baku tak terpangkas : 4102.442
cat("Simpangan baku terpangkas : ", sd_trim(dt_coklat$Amount, trim = 0.1)) # alpha = 10%
## Simpangan baku terpangkas : 3857.022
library(chemometrics)
cat("Simpangan baku tak terpangkas : ", sd(dt_coklat$Boxes.Shipped), "\n")
## Simpangan baku tak terpangkas : 121.5441
cat("Simpangan baku terpangkas : ", sd_trim(dt_coklat$Boxes.Shipped, trim = 0.1)) # alpha = 10%
## Simpangan baku terpangkas : 109.445