membaca dataset retail online dari Excel,

library(readxl)
online_retail <- read_xlsx("online_retail.xlsx")
head(online_retail)
## # A tibble: 6 × 8
##   InvoiceNo StockCode Description         Quantity InvoiceDate         UnitPrice
##   <chr>     <chr>     <chr>                  <dbl> <dttm>                  <dbl>
## 1 536365    85123A    WHITE HANGING HEAR…        6 2010-12-01 08:26:00      2.55
## 2 536365    71053     WHITE METAL LANTERN        6 2010-12-01 08:26:00      3.39
## 3 536365    84406B    CREAM CUPID HEARTS…        8 2010-12-01 08:26:00      2.75
## 4 536365    84029G    KNITTED UNION FLAG…        6 2010-12-01 08:26:00      3.39
## 5 536365    84029E    RED WOOLLY HOTTIE …        6 2010-12-01 08:26:00      3.39
## 6 536365    22752     SET 7 BABUSHKA NES…        2 2010-12-01 08:26:00      7.65
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

memuat data Excel dan cek berapa banyak baris awalnya.

data_raw <- read_xlsx("online_retail.xlsx")
cat("Data berhasil dimuat. Jumlah baris awal:", nrow(data_raw), "\n")
## Data berhasil dimuat. Jumlah baris awal: 541909

membersihkan dataset agar siap dianalisis dengan data yang lebih valid dan konsisten.

online_retail_clean <- data_raw %>%
  mutate(TotalPrice = Quantity * UnitPrice) %>%
  filter(!is.na(CustomerID)) %>%
  filter(!grepl("^C", InvoiceNo)) %>%
  filter(Quantity > 0, UnitPrice > 0) %>%
  filter(TotalPrice < 1000)
cat("Data setelah cleaning dan engineering. Jumlah baris akhir:", nrow(online_retail_clean), "\n")
## Data setelah cleaning dan engineering. Jumlah baris akhir: 397568

mempersiapkan data untuk analisis hubungan antara jumlah barang dan total harga.

online_retail_clean <- online_retail_clean %>%
  mutate(TotalPrice = Quantity * UnitPrice)
data_model <- online_retail_clean %>%
  select(X = Quantity, Y = TotalPrice)

cat("\nVariabel X (Independen): Quantity\n")
## 
## Variabel X (Independen): Quantity
cat("Variabel Y (Dependen): TotalPrice\n")
## Variabel Y (Dependen): TotalPrice

membuat model regresi linear sederhana dan menampilkan ringkasannya untuk evaluasi.

model_regresi <- lm(Y ~ X, data = data_model)

cat("\n--- RINGKASAN MODEL REGRESI LINEAR ---\n")
## 
## --- RINGKASAN MODEL REGRESI LINEAR ---
print(summary(model_regresi))
## 
## Call:
## lm(formula = Y ~ X, data = data_model)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2225.85   -10.08    -5.13     2.10   896.34 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.064902   0.057192   193.5   <2e-16 ***
## X            0.755162   0.001566   482.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.02 on 397566 degrees of freedom
## Multiple R-squared:  0.369,  Adjusted R-squared:  0.369 
## F-statistic: 2.325e+05 on 1 and 397566 DF,  p-value: < 2.2e-16

menunjukkan hubungan antara jumlah barang dan total harga dalam bentuk grafik regresi linear.

plot_regresi <- ggplot(data_model, aes(x = X, y = Y)) +
  geom_point(alpha = 0.3, color = "#007BFF") +
  geom_smooth(method = "lm", col = "#DC3545", se = TRUE) +
  labs(title = "Regresi Linear: Total Harga vs. Kuantitas Barang",
       x = "Kuantitas Barang (Quantity) - Variabel X",
       y = "Total Harga (TotalPrice) - Variabel Y",
       caption = "Data dibatasi TotalPrice < 1000 GBP untuk visualisasi") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))

print(plot_regresi)
## `geom_smooth()` using formula = 'y ~ x'

mengekstrak hasil model regresi dan menuliskannya sebagai persamaan matematis yang jelas.

koefisien <- coef(model_regresi)
cat("\nPersamaan Regresi (Y = intercept + slope * X):\n")
## 
## Persamaan Regresi (Y = intercept + slope * X):
cat(sprintf("TotalPrice = %.2f + %.2f * Quantity\n", koefisien[1], koefisien[2]))
## TotalPrice = 11.06 + 0.76 * Quantity

menghapus data pelanggan yang tidak lengkap (CustomerID kosong) dan melaporkan hasilnya.

online_retail_clean <- data_raw %>%
  filter(!is.na(CustomerID))

cat("   - Aksi 1: Hapus NA di CustomerID. Baris terhapus:", nrow(data_raw) - nrow(online_retail_clean), "\n")
##    - Aksi 1: Hapus NA di CustomerID. Baris terhapus: 135080
cat("   - Sisa baris setelah Aksi 1:", nrow(online_retail_clean), "\n\n")
##    - Sisa baris setelah Aksi 1: 406829

memberi informasi tentang tipe data hasil perbaikan struktur dataset.

cat("3. Perbaikan Struktur Data (Tipe Data):\n")
## 3. Perbaikan Struktur Data (Tipe Data):
cat("   - InvoiceDate diubah menjadi Tipe Data: ", class(online_retail_clean$InvoiceDate), "\n")
##    - InvoiceDate diubah menjadi Tipe Data:  POSIXct POSIXt
cat("   - CustomerID diubah menjadi Tipe Data: ", class(online_retail_clean$CustomerID), "\n\n")
##    - CustomerID diubah menjadi Tipe Data:  numeric

menjaga kualitas data dengan hanya menyimpan transaksi yang benar-benar sahih.

online_retail_clean <- online_retail_clean %>%
  filter(Quantity > 0, UnitPrice > 0)

cat("   - Aksi 4: Hapus Quantity <= 0 dan UnitPrice <= 0. Sisa baris:", nrow(online_retail_clean), "\n\n")
##    - Aksi 4: Hapus Quantity <= 0 dan UnitPrice <= 0. Sisa baris: 397884

memberikan ringkasan hasil proses cleaning dan memastikan dataset akhir siap digunakan untuk analisis.

online_retail_final <- online_retail_clean %>%
  mutate(TotalPrice = Quantity * UnitPrice)
cat("5. Ringkasan Proses Cleaning:\n")
## 5. Ringkasan Proses Cleaning:
cat("   - Total baris awal:", nrow(data_raw), "\n")
##    - Total baris awal: 541909
cat("   - Total baris akhir:", nrow(online_retail_final), "\n")
##    - Total baris akhir: 397884
cat("   - Struktur Data Akhir:\n")
##    - Struktur Data Akhir:
print(glimpse(online_retail))
## Rows: 541,909
## Columns: 8
## $ InvoiceNo   <chr> "536365", "536365", "536365", "536365", "536365", "536365"…
## $ StockCode   <chr> "85123A", "71053", "84406B", "84029G", "84029E", "22752", …
## $ Description <chr> "WHITE HANGING HEART T-LIGHT HOLDER", "WHITE METAL LANTERN…
## $ Quantity    <dbl> 6, 6, 8, 6, 6, 2, 6, 6, 6, 32, 6, 6, 8, 6, 6, 3, 2, 3, 3, …
## $ InvoiceDate <dttm> 2010-12-01 08:26:00, 2010-12-01 08:26:00, 2010-12-01 08:2…
## $ UnitPrice   <dbl> 2.55, 3.39, 2.75, 3.39, 3.39, 7.65, 4.25, 1.85, 1.85, 1.69…
## $ CustomerID  <dbl> 17850, 17850, 17850, 17850, 17850, 17850, 17850, 17850, 17…
## $ Country     <chr> "United Kingdom", "United Kingdom", "United Kingdom", "Uni…
## # A tibble: 541,909 × 8
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536365    85123A    WHITE HANGING HEA…        6 2010-12-01 08:26:00      2.55
##  2 536365    71053     WHITE METAL LANTE…        6 2010-12-01 08:26:00      3.39
##  3 536365    84406B    CREAM CUPID HEART…        8 2010-12-01 08:26:00      2.75
##  4 536365    84029G    KNITTED UNION FLA…        6 2010-12-01 08:26:00      3.39
##  5 536365    84029E    RED WOOLLY HOTTIE…        6 2010-12-01 08:26:00      3.39
##  6 536365    22752     SET 7 BABUSHKA NE…        2 2010-12-01 08:26:00      7.65
##  7 536365    21730     GLASS STAR FROSTE…        6 2010-12-01 08:26:00      4.25
##  8 536366    22633     HAND WARMER UNION…        6 2010-12-01 08:28:00      1.85
##  9 536366    22632     HAND WARMER RED P…        6 2010-12-01 08:28:00      1.85
## 10 536367    84879     ASSORTED COLOUR B…       32 2010-12-01 08:34:00      1.69
## # ℹ 541,899 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.2.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
## Warning: package 'psych' was built under R version 4.5.2
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
data <- read_excel("online_retail.xlsx")

Overview cepat semua data dan Analisis mendalam khusus angka untuk pemodelan statistik

summary(data)
##   InvoiceNo          StockCode         Description           Quantity         
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.000  
##  Class :character   Class :character   Class :character   1st Qu.:     1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.000  
##                                                           Mean   :     9.552  
##                                                           3rd Qu.:    10.000  
##                                                           Max.   : 80995.000  
##                                                                               
##   InvoiceDate                    UnitPrice            CustomerID    
##  Min.   :2010-12-01 08:26:00   Min.   :-11062.060   Min.   :12346   
##  1st Qu.:2011-03-28 11:34:00   1st Qu.:     1.250   1st Qu.:13953   
##  Median :2011-07-19 17:17:00   Median :     2.080   Median :15152   
##  Mean   :2011-07-04 13:34:57   Mean   :     4.611   Mean   :15288   
##  3rd Qu.:2011-10-19 11:27:00   3rd Qu.:     4.130   3rd Qu.:16791   
##  Max.   :2011-12-09 12:50:00   Max.   : 38970.000   Max.   :18287   
##                                                     NA's   :135080  
##    Country         
##  Length:541909     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
describe(select(data, where(is.numeric)))
##            vars      n     mean      sd   median  trimmed     mad       min
## Quantity      1 541909     9.55  218.08     3.00     5.20    2.97 -80995.00
## UnitPrice     2 541909     4.61   96.76     2.08     2.60    1.82 -11062.06
## CustomerID    3 406829 15287.69 1713.60 15152.00 15288.25 2195.73  12346.00
##              max     range   skew  kurtosis   se
## Quantity   80995 161990.00  -0.26 119767.61 0.30
## UnitPrice  38970  50032.06 186.51  59004.96 0.13
## CustomerID 18287   5941.00   0.03     -1.18 2.69

memberikan statistik deskriptif untuk variabel numerik.

# Step-by-step debugging
# 1. Cek apakah dataset ada
if(exists("online_retail")) {
  cat("✅ Dataset 'online_retail' ditemukan\n")
} else {
  cat("❌ Dataset tidak ditemukan. Load data dulu:\n")
  library(readxl)
  online_retail <- read_excel("online_retail.xlsx")
}
## ✅ Dataset 'online_retail' ditemukan
# 2. Cek apakah kolom ada
cat("\nKolom dalam dataset:\n")
## 
## Kolom dalam dataset:
print(names(online_retail))
## [1] "InvoiceNo"   "StockCode"   "Description" "Quantity"    "InvoiceDate"
## [6] "UnitPrice"   "CustomerID"  "Country"
# 3. Cek tipe data
cat("\nTipe data masing-masing kolom:\n")
## 
## Tipe data masing-masing kolom:
print(sapply(online_retail, class))
## $InvoiceNo
## [1] "character"
## 
## $StockCode
## [1] "character"
## 
## $Description
## [1] "character"
## 
## $Quantity
## [1] "numeric"
## 
## $InvoiceDate
## [1] "POSIXct" "POSIXt" 
## 
## $UnitPrice
## [1] "numeric"
## 
## $CustomerID
## [1] "numeric"
## 
## $Country
## [1] "character"
# 4. Baru jalankan describe
library(dplyr)
library(psych)
numeric_data <- online_retail %>% select(where(is.numeric))
describe(numeric_data)
##            vars      n     mean      sd   median  trimmed     mad       min
## Quantity      1 541909     9.55  218.08     3.00     5.20    2.97 -80995.00
## UnitPrice     2 541909     4.61   96.76     2.08     2.60    1.82 -11062.06
## CustomerID    3 406829 15287.69 1713.60 15152.00 15288.25 2195.73  12346.00
##              max     range   skew  kurtosis   se
## Quantity   80995 161990.00  -0.26 119767.61 0.30
## UnitPrice  38970  50032.06 186.51  59004.96 0.13
## CustomerID 18287   5941.00   0.03     -1.18 2.69

Analisi penjualannya

# Load library
library(dplyr)
library(skimr)
## Warning: package 'skimr' was built under R version 4.5.2
# 1. BUAT KOLOM BARU: TotalSales di dataset online_retail
cat("=== MEMBUAT KOLOM TOTAL SALES ===\n")
## === MEMBUAT KOLOM TOTAL SALES ===
online_retail <- online_retail %>%
  mutate(TotalSales = Quantity * UnitPrice)

cat("Kolom baru telah dibuat\n")
## Kolom baru telah dibuat
cat("Kolom dalam dataset:", names(online_retail), "\n\n")
## Kolom dalam dataset: InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country TotalSales
# 2. RINGKASAN 3 VARIABEL UTAMA
cat("=== RINGKASAN VARIABEL TRANSAKSI ===\n")
## === RINGKASAN VARIABEL TRANSAKSI ===
skim(online_retail[, c("Quantity", "UnitPrice", "TotalSales")])
Data summary
Name online_retail[, c(“Quanti…
Number of rows 541909
Number of columns 3
_______________________
Column type frequency:
numeric 3
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Quantity 0 1 9.55 218.08 -80995.00 1.00 3.00 10.00 80995.0 ▁▁▇▁▁
UnitPrice 0 1 4.61 96.76 -11062.06 1.25 2.08 4.13 38970.0 ▁▇▁▁▁
TotalSales 0 1 17.99 378.81 -168469.60 3.40 9.75 17.40 168469.6 ▁▁▇▁▁
# 3. ANALISIS LEBIH DETAIL
cat("\n=== ANALISIS DETAIL TOTAL SALES ===\n")
## 
## === ANALISIS DETAIL TOTAL SALES ===
# Statistik dasar
cat("Statistik TotalSales:\n")
## Statistik TotalSales:
summary_stats <- online_retail %>%
  summarise(
    Rata_rata = mean(TotalSales, na.rm = TRUE),
    Median = median(TotalSales, na.rm = TRUE),
    Total_Penjualan = sum(TotalSales, na.rm = TRUE),
    Transaksi_Tertinggi = max(TotalSales, na.rm = TRUE),
    Jumlah_Transaksi = n(),
    Transaksi_Negatif = sum(TotalSales < 0, na.rm = TRUE)
  )
print(summary_stats)
## # A tibble: 1 × 6
##   Rata_rata Median Total_Penjualan Transaksi_Tertinggi Jumlah_Transaksi
##       <dbl>  <dbl>           <dbl>               <dbl>            <int>
## 1      18.0   9.75        9747748.             168470.           541909
## # ℹ 1 more variable: Transaksi_Negatif <int>
# 4. CEK DATA PROBLEMATIK
cat("\n=== CEK DATA ANOMALI ===\n")
## 
## === CEK DATA ANOMALI ===
# Cek quantity atau price negatif
anomali <- online_retail %>%
  filter(Quantity <= 0 | UnitPrice <= 0) %>%
  select(Quantity, UnitPrice, TotalSales)

if(nrow(anomali) > 0) {
  cat("Ditemukan", nrow(anomali), "transaksi anomali:\n")
  print(head(anomali, 5))
  if(nrow(anomali) > 5) cat("... dan", nrow(anomali) - 5, "lagi\n")
} else {
  cat("Tidak ada transaksi anomali (Quantity/UnitPrice <= 0)\n")
}
## Ditemukan 11805 transaksi anomali:
## # A tibble: 5 × 3
##   Quantity UnitPrice TotalSales
##      <dbl>     <dbl>      <dbl>
## 1       -1     27.5      -27.5 
## 2       -1      4.65      -4.65
## 3      -12      1.65     -19.8 
## 4      -24      0.29      -6.96
## 5      -24      0.29      -6.96
## ... dan 11800 lagi
# 5. VISUALISASI CEPAT (opsional)
cat("\n=== DISTRIBUSI TOTAL SALES ===\n")
## 
## === DISTRIBUSI TOTAL SALES ===
# Filter untuk visualisasi yang lebih jelas
sales_for_plot <- online_retail$TotalSales[online_retail$TotalSales > 0 & 
                                          online_retail$TotalSales < 1000]

hist(sales_for_plot, 
     main = "Distribusi Total Sales (< 1000 unit)", 
     xlab = "Total Sales", 
     col = "lightblue", 
     breaks = 30)

Membuat PETA WARNA untuk LIHAT HUBUNGAN ANGKA-ANGKA dalam data.

# Panggil library yang diperlukan
library(dplyr)
library(corrplot)

# Pilih hanya kolom numerik dari data
numeric_data <- select(data, where(is.numeric))

# Hitung matriks korelasi
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Visualisasi korelasi dengan corrplot
corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

ringkasan statistik (mean, median, min, max, quartil)

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example: BUAT PLOT/GRAFIK SEDERHANA Program ini analisis statistik sederhana untuk memahami perilaku pembeli dari data transaksi retail online.

library(ggplot2)
library(readxl)

# 1. Baca file Excel
online_retail <- read_excel("online_retail.xlsx")

# 2. Buat histogram Quantity
ggplot(online_retail, aes(x = Quantity)) +
  geom_histogram(binwidth = 3, fill = "skyblue", color = "black") +
  theme_minimal() +
  labs(
    title = "Distribusi Quantity",
    x = "Quantity",
    y = "Frekuensi"
  )

library(tidyverse)
library(readxl)

menampilkan distribusi jumlah barang per transaksi dalam bentuk histogram berwarna dengan keterangan angka frekuensi langsung di grafik.

# Install & load packages jika belum
# install.packages("ggplot2")
# install.packages("readxl")
# install.packages("dplyr")
library(ggplot2)
library(readxl)
library(dplyr)

# 1. Baca file Excel
online_retail <- read_excel("online_retail.xlsx")

# 2. Filter data agar hanya Quantity positif
online_retail_clean <- online_retail %>%
  filter(Quantity > 0)

# 3. Buat histogram Quantity dengan label frekuensi
ggplot(online_retail_clean, aes(x = Quantity)) +
  geom_histogram(binwidth = 10, fill = "orange", color = "darkblue") +
  geom_text(
    stat = "bin",
    binwidth = 10,
    aes(label = ..count..),
    vjust = -0.5,
    color = "darkred",
    size = 3
  ) +
  theme_light(base_size = 12) +
  theme(
    panel.background = element_rect(fill = "lightgray"),
    plot.background  = element_rect(fill = "lightgray"),
    panel.grid.major = element_line(color = "white"),
    panel.grid.minor = element_line(color = "white")
  ) +
  labs(
    title = "Distribusi Quantity (Transaksi Positif)",
    subtitle = "Setiap batang menunjukkan jumlah transaksi per interval Quantity",
    x = "Quantity",
    y = "Frekuensi"
  )
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

menampilkan distribusi total penjualan per transaksi dalam bentuk histogram berwarna gradasi agar lebih informatif dan mudah dibaca.

# Install & load packages jika belum
# install.packages("ggplot2")
# install.packages("readxl")
# install.packages("dplyr")
library(ggplot2)
library(readxl)
library(dplyr)

# 1. Baca file Excel
online_retail <- read_excel("online_retail.xlsx")

# 2. Buat kolom TotalSales (Quantity * UnitPrice)
online_retail <- online_retail %>%
  mutate(TotalSales = Quantity * UnitPrice)

# 3. Buat histogram TotalSales dengan warna gradasi
ggplot(online_retail, aes(x = TotalSales)) +
  geom_histogram(aes(fill = ..count..), binwidth = 100, color = "black") +
  scale_fill_gradient(low = "lightgreen", high = "darkgreen") +
  theme_classic() +
  labs(
    title = "Distribusi Total Penjualan (TotalSales)",
    x = "Total Penjualan per Transaksi",
    y = "Frekuensi",
    fill = "Jumlah"
  )

menganalisis produk terlaris dan memvisualisasikan hasilnya dalam grafik yang informatif.

# Install & load packages jika belum
# install.packages("tidyverse")
# install.packages("readxl")
library(tidyverse)
library(readxl)

# 1. Baca file Excel
online_retail <- read_excel("online_retail.xlsx")

# 2. Buat kolom TotalSales (Quantity * UnitPrice)
online_retail <- online_retail %>%
  mutate(TotalSales = Quantity * UnitPrice)

# 3. Hitung 5 produk terlaris berdasarkan StockCode
produk_terlaris <- online_retail %>%
  group_by(StockCode, Description) %>%   # gunakan juga nama produk agar lebih informatif
  summarise(TotalSales = sum(TotalSales), .groups = "drop") %>%
  arrange(desc(TotalSales)) %>%
  slice_head(n = 5)

# 4. Buat grafik batang horizontal
ggplot(produk_terlaris, aes(x = reorder(Description, TotalSales), y = TotalSales)) +
  geom_bar(stat = "identity", fill = "maroon") +
  coord_flip() +
  labs(
    title = "5 Produk Terlaris Berdasarkan Penjualan",
    x = "Nama Produk",
    y = "Total Penjualan"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10))

menampilkan daftar nama kolom dalam data.

colnames(online_retail)
## [1] "InvoiceNo"   "StockCode"   "Description" "Quantity"    "InvoiceDate"
## [6] "UnitPrice"   "CustomerID"  "Country"     "TotalSales"

menganalisis performa transaksi bulanan, membandingkan dengan target, dan memberi status pencapaian tiap bulan.

# Pastikan package yang dipakai
library(dplyr)
library(tidyr)

# 1) Buat kolom bulan (singkatan Jan, Feb, ... Dec) sebagai faktor berlevel Jan–Dec
online_retail <- online_retail %>%
  mutate(
    bulan_chr = format(InvoiceDate, "%b"),
    bulan = factor(bulan_chr, levels = month.abb)
  )

# 2) Hitung jumlah transaksi per bulan, sekaligus lengkapi bulan yang tidak muncul (isi 0)
tab <- online_retail %>%
  count(bulan, name = "realisasi") %>%
  complete(bulan, fill = list(realisasi = 0)) %>%
  arrange(bulan)

# 3) Tambah target (10% di atas rata-rata realisasi bulanan), selisih, dan status
avg_realisasi <- mean(tab$realisasi)
target_val    <- round(avg_realisasi * 1.10)

tab <- tab %>%
  mutate(
    target  = target_val,
    selisih = realisasi - target,
    status  = ifelse(realisasi >= target, "Tercapai", "Belum Tercapai")
  )

# Lihat hasil
print(tab)
## # A tibble: 12 × 5
##    bulan realisasi target selisih status        
##    <fct>     <int>  <dbl>   <dbl> <chr>         
##  1 Jan       35147  49675  -14528 Belum Tercapai
##  2 Feb       27707  49675  -21968 Belum Tercapai
##  3 Mar       36748  49675  -12927 Belum Tercapai
##  4 Apr       29916  49675  -19759 Belum Tercapai
##  5 May       37030  49675  -12645 Belum Tercapai
##  6 Jun       36874  49675  -12801 Belum Tercapai
##  7 Jul       39518  49675  -10157 Belum Tercapai
##  8 Aug       35284  49675  -14391 Belum Tercapai
##  9 Sep       50226  49675     551 Tercapai      
## 10 Oct       60742  49675   11067 Tercapai      
## 11 Nov       84711  49675   35036 Tercapai      
## 12 Dec       68006  49675   18331 Tercapai

menganalisis performa transaksi bulanan dan memvisualisasikan pencapaian target dengan grafik.

library(readxl)
library(ggplot2)

# Baca file Excel
online_retail <- read_excel("online_retail.xlsx")

# Buat kolom bulan
online_retail$bulan <- format(online_retail$InvoiceDate, "%b")

# Hitung jumlah transaksi per bulan
tab <- as.data.frame(table(online_retail$bulan))
names(tab) <- c("bulan", "realisasi")

# Urutkan bulan Jan–Dec
tab$bulan <- factor(tab$bulan, levels = month.abb)
tab <- tab[order(tab$bulan), ]

# Tambah target, selisih, status
tab$target  <- round(mean(tab$realisasi) * 1.10)
tab$selisih <- tab$realisasi - tab$target
tab$status  <- ifelse(tab$realisasi >= tab$target, "Tercapai", "Belum Tercapai")

# Grafik
ggplot(tab, aes(x = bulan)) +
  geom_col(aes(y = target), fill = "lightblue", alpha = 0.6) +
  geom_line(aes(y = realisasi, group = 1), color = "darkblue", linewidth = 1) +
  geom_point(aes(y = realisasi, color = status), size = 3) +
  scale_color_manual(values = c("Tercapai" = "forestgreen", "Belum Tercapai" = "red")) +
  theme_minimal()