Import Data

# Read the raw chocolate sales export. The path points at a local
# Downloads folder, so the file must exist on the machine running this.
data_chocolate <- read.csv(
  file = "C:/Users/FAQIH/Downloads/Chocolate Sales.csv"
)
# Show the first rows to check how the columns were parsed.
head(x = data_chocolate)
##     Sales.Person   Country             Product      Date   Amount Boxes.Shipped
## 1 Jehu Rudeforth        UK     Mint Chip Choco 04-Jan-22  $5,320            180
## 2    Van Tuxwell     India       85% Dark Bars 01-Aug-22  $7,896             94
## 3   Gigi Bohling     India Peanut Butter Cubes 07-Jul-22  $4,501             91
## 4   Jan Morforth Australia Peanut Butter Cubes 27-Apr-22 $12,726            342
## 5 Jehu Rudeforth        UK Peanut Butter Cubes 24-Feb-22 $13,685            184
## 6    Van Tuxwell     India  Smooth Sliky Salty 06-Jun-22  $5,376             38

Cleaning Data Dari Missing Value

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Drop every row that contains at least one NA before the analysis.
chocolate_clean <- na.omit(data_chocolate)

Membuat Data Frame Baru

# Keep only the two columns analysed below: Amount and Boxes.Shipped.
dt_coklat <- dplyr::select(chocolate_clean, Amount, Boxes.Shipped)
head(x = dt_coklat)
##     Amount Boxes.Shipped
## 1  $5,320            180
## 2  $7,896             94
## 3  $4,501             91
## 4 $12,726            342
## 5 $13,685            184
## 6  $5,376             38
# Amount is read as text such as "$5,320": make sure it is character,
# strip the dollar sign and thousands separators, then convert to numeric.
dt_coklat[["Amount"]] <- as.numeric(
  gsub("[$,]", "", as.character(dt_coklat[["Amount"]]))
)
head(x = dt_coklat)
##   Amount Boxes.Shipped
## 1   5320           180
## 2   7896            94
## 3   4501            91
## 4  12726           342
## 5  13685           184
## 6   5376            38

Ringkasan Data

# Per-column summary (min, quartiles, median, mean, max) of both variables.
summary(object = dt_coklat)
##      Amount      Boxes.Shipped  
##  Min.   :    7   Min.   :  1.0  
##  1st Qu.: 2390   1st Qu.: 70.0  
##  Median : 4868   Median :135.0  
##  Mean   : 5652   Mean   :161.8  
##  3rd Qu.: 8027   3rd Qu.:228.8  
##  Max.   :22050   Max.   :709.0

Eksplorasi data dengan scatter plot

# Scatter plot with Amount on the x-axis and Boxes.Shipped on the y-axis.
plot(
  x = dt_coklat$Amount,
  y = dt_coklat$Boxes.Shipped,
  main = "Scatter amount of revenue dan boxes shipped coklat",
  xlab = "amount of revenue",
  ylab = "box shipped"
)

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
# Pearson correlation matrix of the two columns in dt_coklat.
korelasi <- cor(x = dt_coklat, method = "pearson")
print(korelasi)
##                    Amount Boxes.Shipped
## Amount         1.00000000   -0.01882685
## Boxes.Shipped -0.01882685    1.00000000

Korelasi antara kedua peubah sebesar -0.0188 menunjukkan arah hubungan negatif, tetapi hubungan linear antara kedua peubah tersebut sangat lemah (hampir tidak ada). Hal ini juga terlihat pada scatterplot, di mana titik-titik data tidak membentuk pola linear.

Eksplorasi Data dengan Histogram

# Histogram of each variable; breaks = 25 requests about 25 bins.
hist(x = dt_coklat$Amount, breaks = 25)

hist(x = dt_coklat$Boxes.Shipped, breaks = 25)

# Eksplorasi Data dengan Boxplot

Menyusun struktur boxplot

# Manual box-plot structure for Amount: quartiles from the (n + 1) * p
# position rule with linear interpolation between neighbouring order
# statistics, then the interquartile range and the fences.
# quantile(x, type = 6) uses the same positions and can cross-check K1-K3.
data_amount_sorted <- sort(dt_coklat$Amount)
n <- length(data_amount_sorted)
pos_K1 <- 0.25 * (n + 1)
pos_K1
## [1] 273.75
pos_K2 <- 0.50 * (n + 1)
pos_K2
## [1] 547.5
pos_K3 <- 0.75 * (n + 1)
pos_K3
## [1] 821.25
# Derive the lower order-statistic index from each position instead of
# hard-coding 273 / 547 / 821, so the quartiles stay correct if the
# number of rows changes.
idx_K1 <- floor(pos_K1)
idx_K2 <- floor(pos_K2)
idx_K3 <- floor(pos_K3)
K1 <- data_amount_sorted[idx_K1] +
  (pos_K1 - idx_K1) *
    (data_amount_sorted[idx_K1 + 1] - data_amount_sorted[idx_K1])
K1
## [1] 2385.25
K2 <- data_amount_sorted[idx_K2] +
  (pos_K2 - idx_K2) *
    (data_amount_sorted[idx_K2 + 1] - data_amount_sorted[idx_K2])
K2
## [1] 4868.5
K3 <- data_amount_sorted[idx_K3] +
  (pos_K3 - idx_K3) *
    (data_amount_sorted[idx_K3 + 1] - data_amount_sorted[idx_K3])
K3
## [1] 8032.5
# JAK is the interquartile range, K3 - K1.
JAK <- K3 - K1
JAK
## [1] 5647.25
# Fences: PDB / PDA sit 1.5 * JAK below K1 / above K3,
# PLB / PLA sit 3 * JAK below K1 / above K3.
PDB <- K1 - 1.5 * JAK
PDA <- K3 + 1.5 * JAK
PLB <- K1 - 3 * JAK
PLA <- K3 + 3 * JAK

# Display the results
PDB
## [1] -6085.625
PDA
## [1] 16503.38
PLB
## [1] -14556.5
PLA
## [1] 24974.25

visualisasi boxplot

# Box plot of the Amount column.
boxplot(
  x = dt_coklat$Amount,
  col = "skyblue",
  main = "Boxplot Data amount of revenue",
  xlab = "amount ($)"
)

### Menyusun struktur boxplot

# Manual box-plot structure for Boxes.Shipped: quartiles from the
# (n + 1) * p position rule with linear interpolation between
# neighbouring order statistics, then the interquartile range and fences.
# quantile(x, type = 6) uses the same positions and can cross-check K1-K3.
data_box_sorted <- sort(dt_coklat$Boxes.Shipped)
n <- length(data_box_sorted)
pos_K1 <- 0.25 * (n + 1)
pos_K1
## [1] 273.75
pos_K2 <- 0.50 * (n + 1)
pos_K2
## [1] 547.5
pos_K3 <- 0.75 * (n + 1)
pos_K3
## [1] 821.25
# Derive the lower order-statistic index from each position instead of
# hard-coding 273 / 547 / 821, so the quartiles stay correct if the
# number of rows changes.
idx_K1 <- floor(pos_K1)
idx_K2 <- floor(pos_K2)
idx_K3 <- floor(pos_K3)
K1 <- data_box_sorted[idx_K1] +
  (pos_K1 - idx_K1) *
    (data_box_sorted[idx_K1 + 1] - data_box_sorted[idx_K1])
K1
## [1] 70
K2 <- data_box_sorted[idx_K2] +
  (pos_K2 - idx_K2) *
    (data_box_sorted[idx_K2 + 1] - data_box_sorted[idx_K2])
K2
## [1] 135
K3 <- data_box_sorted[idx_K3] +
  (pos_K3 - idx_K3) *
    (data_box_sorted[idx_K3 + 1] - data_box_sorted[idx_K3])
K3
## [1] 229
# JAK is the interquartile range, K3 - K1.
JAK <- K3 - K1
JAK
## [1] 159
# Fences: PDB / PDA sit 1.5 * JAK below K1 / above K3,
# PLB / PLA sit 3 * JAK below K1 / above K3.
PDB <- K1 - 1.5 * JAK
PDA <- K3 + 1.5 * JAK
PLB <- K1 - 3 * JAK
PLA <- K3 + 3 * JAK

# Display the results
PDB
## [1] -168.5
PDA
## [1] 467.5
PLB
## [1] -407
PLA
## [1] 706

visualisasi boxplot

# Box plot of the Boxes.Shipped column.
boxplot(
  x = dt_coklat$Boxes.Shipped,
  col = "skyblue",
  main = "Boxplot Data Box Shipped",
  xlab = "Box Shipped"
)

# Pemeriksaan Sebaran Data

Memeriksa data menyebar normal

# Normal Q-Q plot of Amount with its reference line.
qqnorm(y = dt_coklat$Amount, col = "blue")
qqline(y = dt_coklat$Amount)

# Normal Q-Q plot of Boxes.Shipped with its reference line.
qqnorm(y = dt_coklat$Boxes.Shipped, col = "blue")
qqline(y = dt_coklat$Boxes.Shipped)

### Uji Formal (Kolmogorov-Smirnov)

Hipotesis: H0 : Data mengikuti distribusi normal dengan rata-rata 5652 dan deviasi standar 4102.442. H1 : Data tidak mengikuti distribusi normal dengan rata-rata 5652 dan deviasi standar 4102.442.

# One-sample Kolmogorov-Smirnov test of Amount against a normal
# distribution whose mean (5652) and standard deviation (4102.442) are
# hard-coded in the call.
# NOTE(review): these constants look like the rounded sample mean and sd
# of the same column; if so, the reported p-value is only approximate.
# Confirm whether a Lilliefors-type correction is wanted.
ks.test(dt_coklat$Amount, "pnorm", 5652, 4102.442)
## Warning in ks.test.default(dt_coklat$Amount, "pnorm", 5652, 4102.442): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  dt_coklat$Amount
## D = 0.084409, p-value = 3.394e-07
## alternative hypothesis: two-sided

Kesimpulan: Karena p-value kurang dari 0.05, maka tolak H0. Ini berarti data amount of revenue dalam penjualan coklat tidak mengikuti distribusi normal dengan parameter mean = 5652, sd = 4102.442.

Hipotesis: H0 : Data mengikuti distribusi normal dengan rata-rata 161.8 dan deviasi standar 121.5441. H1 : Data tidak mengikuti distribusi normal dengan rata-rata 161.8 dan deviasi standar 121.5441.

# One-sample Kolmogorov-Smirnov test of Boxes.Shipped against a normal
# distribution whose mean (161.8) and standard deviation (121.5441) are
# hard-coded in the call.
# NOTE(review): these constants look like the rounded sample mean and sd
# of the same column; if so, the reported p-value is only approximate.
# Confirm whether a Lilliefors-type correction is wanted.
ks.test(dt_coklat$Boxes.Shipped, "pnorm", 161.8, 121.5441)
## Warning in ks.test.default(dt_coklat$Boxes.Shipped, "pnorm", 161.8, 121.5441):
## ties should not be present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  dt_coklat$Boxes.Shipped
## D = 0.10388, p-value = 1.115e-10
## alternative hypothesis: two-sided

Kesimpulan: Karena p-value kurang dari 0.05, maka tolak H0. Ini berarti data boxes shipped dalam penjualan coklat tidak mengikuti distribusi normal dengan parameter mean = 161.8, sd = 121.5441.

# Pendeteksian Pencilan

### Menggunakan Interquartile Range (IQR)

# Values of Amount that boxplot.stats() reports as outliers.
order.out <- boxplot.stats(dt_coklat$Amount)$out
order.out
##  [1] 16793 19453 17318 19929 18991 16569 16982 18340 22050 19327 17626 19481
## [13] 18032 16702 17465 18697
# Row positions whose Amount equals one of the reported outlier values.
pencicilan <- which(dt_coklat$Amount %in% order.out)
pencicilan
##  [1]   58   67  120  136  213  258  373  435  544  590  610  752  807  892  950
## [16] 1009
boxplot(
  x = dt_coklat$Amount,
  xlab = "amount",
  ylab = "Value",
  main = "Boxplot amount"
)
# Write the outlier values in the plot margin.
mtext(text = paste("Pencilan: ", paste(order.out, collapse = ", ")))

# Values of Boxes.Shipped that boxplot.stats() reports as outliers.
order.out <- boxplot.stats(dt_coklat$Boxes.Shipped)$out
order.out
##  [1] 581 482 520 547 477 520 475 554 543 469 479 488 512 508 708 524 597 495 614
## [20] 468 518 518 539 709 591
# Row positions whose Boxes.Shipped equals one of the reported values.
pencicilan <- which(dt_coklat$Boxes.Shipped %in% order.out)
pencicilan
##  [1]   40   98  110  122  151  172  189  341  459  464  585  666  673  693  731
## [16]  766  770  824  853  863  866  876  979 1029 1071
boxplot(
  x = dt_coklat$Boxes.Shipped,
  xlab = "boxes shipped",
  ylab = "Value",
  main = "Boxplot boxes shipped"
)
# Write the outlier values in the plot margin.
mtext(text = paste("Pencilan: ", paste(order.out, collapse = ", ")))

Berdasarkan pendeteksian pencilan dengan metode IQR dan bantuan boxplot, terlihat bahwa baik data ‘amount’ maupun ‘boxes shipped’ memiliki banyak pencilan atas.

Statistik Kekar untuk Ukuran Pemusatan

Trimmed Mean

Trimmed mean digunakan karena jumlah data cukup besar dan terdapat banyak pencilan ekstrem. Rataan dihitung dengan memangkas 10% data di kedua ujung distribusi.

# Ordinary mean of Amount versus the trimmed mean; trim = 0.1 drops 10%
# of the observations from each end before averaging.
rataan.biasa <- mean(x = dt_coklat$Amount)
rataan.terpangkas <- mean(x = dt_coklat$Amount, trim = 0.1)

cat("Rataan biasa : ", rataan.biasa, "\n")
## Rataan biasa :  5652.308
cat("Rataan Terpangkas : ", rataan.terpangkas)
## Rataan Terpangkas :  5221.576
# Same comparison for Boxes.Shipped; the two variables are reused.
rataan.biasa <- mean(x = dt_coklat$Boxes.Shipped)
rataan.terpangkas <- mean(x = dt_coklat$Boxes.Shipped, trim = 0.1)

cat("Rataan biasa : ", rataan.biasa, "\n")
## Rataan biasa :  161.798
cat("Rataan Terpangkas : ", rataan.terpangkas)
## Rataan Terpangkas :  147.3037

Statistik Kekar untuk Ukuran Penyebaran

Trimmed Standard Deviation

Menghitung simpangan baku (standard deviation) dengan memangkas 10% data di kedua ujung distribusi.

library(chemometrics)
## Warning: package 'chemometrics' was built under R version 4.4.3
## Loading required package: rpart
# sd() and sd_trim() return standard deviations, not variances, so the
# printed labels say "Simpangan baku" (standard deviation) instead of
# "Ragam" (variance). The numeric values are unchanged.
cat("Simpangan baku tak terpangkas : ", sd(dt_coklat$Amount), "\n")
## Simpangan baku tak terpangkas :  4102.442
# trim = 0.1 is the trimming fraction passed to sd_trim().
cat("Simpangan baku terpangkas : ", sd_trim(dt_coklat$Amount, trim = 0.1))
## Simpangan baku terpangkas :  3857.022
library(chemometrics)
cat("Simpangan baku tak terpangkas : ", sd(dt_coklat$Boxes.Shipped), "\n")
## Simpangan baku tak terpangkas :  121.5441
cat(
  "Simpangan baku terpangkas : ",
  sd_trim(dt_coklat$Boxes.Shipped, trim = 0.1)
)
## Simpangan baku terpangkas :  109.445