Central Tendency

Exercises ~ Week 6

Logo


1 Pendahuluan

Analisis central tendency merupakan teknik statistik dasar yang digunakan untuk mengidentifikasi nilai tengah atau titik pusat dari suatu distribusi data. Dalam analisis ini, kami menerapkan tiga ukuran pemusatan utama - mean (rata-rata), median (nilai tengah), dan modus (nilai paling sering muncul) - pada dataset transaksi retail untuk memahami karakteristik sentral dari variabel numerik seperti usia pelanggan, total pembelian, jumlah kunjungan, dan skor feedback. Ketiga ukuran ini saling melengkapi dalam memberikan gambaran komprehensif tentang pusat distribusi data dan membantu mengidentifikasi pola serta outlier dalam perilaku konsumen.

2 Import Data

# Import the customer purchase data from the CSV file that sits next to this
# document. A relative path keeps the report reproducible on other machines
# and matches the path used when the same file is read for the plots.
data <- read.csv("Central Tendency.csv", header = TRUE, sep = ",")

# Render the complete data set as a table.
knitr::kable(data, caption = "Customer Purchase Data")
Customer Purchase Data
X CustomerID Age Gender StoreLocation ProductCategory TotalPurchase NumberOfVisits FeedbackScore
1 1 32 M West Electronics 528 4 1
2 2 37 F South Books 72 4 5
3 3 63 M West Electronics 327 4 2
4 4 41 M North Sports 391 7 1
5 5 42 F East Electronics 514 7 5
6 6 66 F East Sports 381 6 3
7 7 47 M East Sports 510 5 1
8 8 21 F South Clothing 102 4 2
9 9 30 F North Sports 559 2 2
10 10 33 M South Books 27 5 2
11 11 58 F East Clothing 40 3 5
12 12 45 M North Electronics 217 6 5
13 13 46 F South Home 118 4 4
14 14 42 F North Sports 532 6 3
15 15 32 F South Books 25 3 3
16 16 67 F South Home 87 4 1
17 17 47 M West Home 77 7 3
18 18 18 F East Books 80 7 3
19 19 51 F South Electronics 209 2 1
20 20 33 F South Electronics 232 7 4
21 21 24 M South Books 23 3 4
22 22 37 M South Sports 444 5 3
23 23 25 M South Home 127 5 3
24 24 29 F West Clothing 90 5 3
25 25 31 M West Electronics 165 6 1
26 26 18 M East Books 77 5 5
27 27 53 F North Books 52 10 4
28 28 42 F North Books 91 10 3
29 29 23 F East Sports 390 4 5
30 30 59 M East Clothing 127 5 1
31 31 46 M East Clothing 81 4 5
32 32 36 M West Sports 514 5 1
33 33 53 F East Home 101 3 1
34 34 53 M South Books 68 1 3
35 35 52 F North Sports 471 2 5
36 36 50 M North Sports 621 2 5
37 37 48 F East Electronics 327 6 3
38 38 39 F South Home 107 5 5
39 39 35 F West Home 132 7 4
40 40 34 F West Electronics 1128 5 4
41 41 30 F South Electronics 247 3 4
42 42 37 M West Electronics 382 5 3
43 43 21 M East Home 117 8 4
44 44 70 F West Home 81 7 5
45 45 58 M West Books 97 5 4
46 46 23 M North Clothing 66 4 2
47 47 34 M East Clothing 158 6 4
48 48 33 F West Books 27 7 3
49 49 52 F East Clothing 80 8 1
50 50 39 M West Electronics 104 3 3
51 51 44 M North Sports 554 4 5
52 52 40 F North Home 33 3 5
53 53 39 F East Sports 532 7 2
54 54 61 F East Electronics 374 3 3
55 55 37 F West Clothing 77 7 3
56 56 63 M South Books 33 6 1
57 57 18 M South Books 82 8 2
58 58 49 M West Electronics 144 5 4
59 59 42 F East Home 37 8 3
60 60 43 F South Electronics 270 7 4
61 61 46 M East Books 61 7 2
62 62 32 F East Sports 553 4 4
63 63 35 F South Books 95 3 4
64 64 25 F East Home 101 4 2
65 65 24 M East Home 82 6 1
66 66 45 F North Books 86 4 1
67 67 47 F East Sports 451 4 5
68 68 41 M North Sports 417 2 1
69 69 54 F South Clothing 83 5 5
70 70 70 M West Home 74 3 4
71 71 33 F West Home 76 7 1
72 72 18 M North Electronics 384 2 1
73 73 55 F East Sports 417 6 3
74 74 29 M East Clothing 51 7 5
75 75 30 M South Sports 574 2 5
76 76 55 F West Books 82 4 1
77 77 36 M West Clothing 65 6 1
78 78 22 F South Home 139 6 3
79 79 43 F North Sports 355 4 1
80 80 38 M South Sports 548 8 1
81 81 40 F East Clothing 34 4 3
82 82 46 M East Clothing 151 4 3
83 83 34 F East Books 68 4 2
84 84 50 M South Clothing 39 5 1
85 85 37 M North Electronics 98 5 5
86 86 45 F South Books 37 2 4
87 87 56 F West Electronics 654 6 5
88 88 47 M East Home 80 3 2
89 89 35 F South Clothing 89 6 5
90 90 57 M East Home 41 5 1
91 91 55 M West Clothing 59 6 3
92 92 48 F East Clothing 105 5 1
93 93 44 F North Sports 601 4 1
94 94 31 M South Clothing 44 3 5
95 95 60 F South Sports 471 5 2
96 96 31 M South Sports 540 11 1
97 97 70 F West Electronics 186 4 2
98 98 63 F West Electronics 256 4 3
99 99 36 M South Home 27 2 1
100 100 25 M West Electronics 92 5 5
101 101 29 F South Home 71 5 2
102 102 44 F North Electronics 173 6 2
103 103 36 F North Clothing 91 9 1
104 104 35 F East Electronics 400 5 3
105 105 26 F West Sports 427 2 1
106 106 39 F North Sports 400 4 4
107 107 28 M East Books 25 7 1
108 108 18 M East Clothing 32 7 4
109 109 34 M South Books 85 7 2
110 110 54 F West Sports 468 8 3
111 111 31 F East Electronics 87 6 3
112 112 49 F South Sports 491 5 5
113 113 18 M North Books 66 3 1
114 114 39 F West Clothing 33 6 3
115 115 48 M North Home 107 2 1
116 116 45 F West Clothing 476 5 4
117 117 42 F West Electronics 226 8 3
118 118 30 M South Clothing 62 7 1
119 119 27 F North Sports 542 6 3
120 120 25 M East Sports 339 3 1
121 121 42 F South Clothing 37 5 3
122 122 26 F West Clothing 19 10 4
123 123 33 F West Books 34 5 1
124 124 36 F West Home 110 4 1
125 125 68 M South Clothing 107 2 2
126 126 30 M East Sports 529 2 5
127 127 44 M North Electronics 156 7 2
128 128 41 F West Home 75 4 5
129 129 26 M North Sports 458 8 3
130 130 39 F South Clothing 78 5 2
131 131 62 M East Electronics 517 5 4
132 132 47 M North Books 30 8 5
133 133 41 F North Books 78 3 1
134 134 34 M East Sports 448 7 5
135 135 18 F East Electronics 373 4 1
136 136 57 M South Clothing 52 5 1
137 137 18 F West Sports 609 4 2
138 138 51 M North Electronics 250 2 2
139 139 69 M West Electronics 282 6 1
140 140 18 F East Clothing 66 3 3
141 141 51 M East Clothing 116 6 1
142 142 36 F East Books 30 5 2
143 143 18 M West Sports 525 3 3
144 144 18 F North Clothing 105 11 1
145 145 18 M East Clothing 78 5 1
146 146 32 M East Home 43 2 2
147 147 18 F North Electronics 136 8 4
148 148 50 M South Sports 567 7 5
149 149 70 M North Clothing 33 5 1
150 150 21 F South Clothing 160 4 5
151 151 52 F West Books 30 7 2
152 152 52 F South Books 65 9 1
153 153 45 M North Books 30 7 4
154 154 25 M East Home 113 3 4
155 155 38 M North Home 141 9 3
156 156 36 M South Books 89 5 2
157 157 48 M South Books 33 1 4
158 158 34 M North Clothing 79 3 3
159 159 55 M South Home 29 1 3
160 160 34 F North Clothing 119 7 2
161 161 56 F West Clothing 135 6 1
162 162 24 M North Books 32 8 4
163 163 21 M West Home 164 4 5
164 164 70 F South Sports 426 9 5
165 165 34 M South Sports 514 7 5
166 166 44 M West Clothing 58 7 5
167 167 50 F South Clothing 81 5 2
168 168 33 M West Electronics 576 9 2
169 169 48 M South Sports 424 5 5
170 170 46 M West Home 60 8 3
171 171 37 M East Sports 480 4 1
172 172 41 M North Home 97 5 2
173 173 39 M West Electronics 225 8 2
174 174 70 F North Clothing 11 6 5
175 175 29 M North Books 32 1 5
176 176 24 M West Sports 597 3 1
177 177 41 F North Home 127 5 1
178 178 45 F East Home 38 4 2
179 179 47 M West Home 129 11 2
180 180 33 F South Books 76 7 2
181 181 24 M North Sports 546 4 3
182 182 59 M North Sports 338 6 3
183 183 35 M East Clothing 83 8 5
184 184 27 F East Clothing 300 6 1
185 185 36 M North Clothing 66 6 3
186 186 37 F North Clothing 53 3 2
187 187 57 F West Clothing 98 5 5
188 188 41 M South Sports 472 4 1
189 189 51 M West Electronics 367 5 3
190 190 33 M West Electronics 385 3 2
191 191 43 M East Electronics 245 4 4
192 192 35 F West Electronics 136 6 1
193 193 41 M North Sports 368 5 1
194 194 27 F West Electronics 182 3 3
195 195 20 F North Clothing 126 4 2
196 196 70 M East Sports 304 6 1
197 197 49 M North Books 29 5 2
198 198 21 F West Books 32 3 3
199 199 31 M South Sports 409 2 2
200 200 22 F South Books 83 9 4

3 Mean (Rata-rata)

3.1 Definisi

Mean (rata-rata aritmetika) adalah nilai tunggal yang berfungsi mewakili keseluruhan kumpulan data dengan cara menyeimbangkan total nilai yang ada. Rata-rata diperoleh dengan membagi jumlah semua nilai data dengan jumlah total observasi.

Rata-rata dihitung dengan rumus:

\[\bar{X} = \frac{\sum_{i=1}^{n} X_i}{n}\]

Di mana:

  • \(\bar{X}\): rata-rata
  • \(X_i\): setiap nilai data
  • \(n\): jumlah pengamatan

3.2 Aturan & Karakteristik:

  • Dipengaruhi oleh semua nilai dalam dataset
  • Sensitif terhadap outlier (nilai ekstrem)
  • Cocok untuk data yang berdistribusi normal
  • Dapat berupa bilangan desimal meskipun data berupa bilangan bulat

3.3 Langkah Perhitungan:

  • Jumlahkan semua nilai data
  • Hitung banyaknya data (n)
  • Bagi total jumlah dengan n

3.4 Contoh:

Data: 10, 20, 30, 40, 50

Rumus mean:

\[\bar{X} = \frac{\sum_{i=1}^{n} X_i}{n}\]

Perhitungan:

\[\bar{X} = \frac{10 + 20 + 30 + 40 + 50}{5} = 30\]

Nilai mean datanya adalah: 30

4 Median (Nilai Tengah)

4.1 Definisi:

Median adalah nilai yang membagi suatu set data menjadi dua bagian yang sama besar. Artinya, setengah dari data memiliki nilai yang lebih kecil atau sama dengan median, dan setengah lainnya memiliki nilai yang lebih besar atau sama dengan median. Median cocok untuk data ordinal, interval dan rasio.

4.2 Aturan & Karakteristik:

  • Tidak dipengaruhi oleh outlier (robust)
  • Cocok untuk data yang tidak berdistribusi normal
  • Lebih representatif ketika ada nilai ekstrem

4.3 Langkah Perhitungan:

Untuk data ganjil:

  1. Urutkan data dari terkecil ke terbesar
  2. Median = nilai tepat di tengah

Untuk data genap:

  1. Urutkan data dari terkecil ke terbesar
  2. Median = rata-rata dari dua nilai tengah

4.4 Contoh:

Data Ganjil: 5, 7, 8, 12, 15, 18, 20

\(n = 7 \Rightarrow Median = X_{(4)} = 12\)

Karena terdapat 7 titik data (angka ganjil), median terletak pada posisi (n + 1) / 2 = ke-4 ketika data diurutkan secara menaik. Oleh karena itu, nilai ke-4, yaitu 12, menjadi median - nilai pusat yang membagi kumpulan data menjadi dua bagian yang sama besar:

  • Setengah bagian bawah: 5, 7, 8
  • Setengah bagian atas: 15, 18, 20

Data Genap: 5, 7, 8, 12, 15, 18

\(n = 6 \Rightarrow Median = \frac{X_{(3)} + X_{(4)}}{2} = \frac{8 + 12}{2} = 10\)

Karena terdapat 6 titik data (angka genap), median adalah rata-rata dari nilai ke-3 dan ke-4 ketika data diurutkan secara menaik. Oleh karena itu, nilai ke-3 yaitu 8, dan nilai ke-4 yaitu 12, sehingga median adalah (8 + 12) / 2 = 10:

  • Setengah bagian bawah: 5, 7, 8
  • Setengah bagian atas: 12, 15, 18

5 Modus (Nilai Paling Sering Muncul)

5.1 Definisi:

Modus adalah nilai yang paling sering muncul dalam suatu dataset. Dengan kata lain, modus adalah nilai dengan frekuensi tertinggi.

5.2 Aturan & Karakteristik:

  1. Dapat digunakan untuk data kategorikal dan numerik
  2. Satu dataset bisa memiliki:
  • Unimodal: satu modus
  • Bimodal: dua modus
  • Multimodal: lebih dari dua modus
  • Tidak ada modus: semua nilai frekuensi sama
  3. Tidak dipengaruhi oleh nilai ekstrem

5.3 Langkah Perhitungan:

  1. Hitung frekuensi kemunculan setiap nilai
  2. Cari nilai dengan frekuensi tertinggi
  3. Jika ada beberapa nilai dengan frekuensi sama tertinggi, semuanya adalah modus

5.4 Contoh:

Data: 5, 7, 7, 8, 8, 8, 10

Frekuensi:

  • 5 = 1 (muncul 1 kali)
  • 7 = 2 (muncul 2 kali)
  • 8 = 3 (muncul 3 kali)
  • 10 = 1 (muncul 1 kali)

Modus = 8 (muncul 3 kali)

Data: 5, 5, 7, 7, 8

Frekuensi:

  • 5 = 2 (muncul 2 kali)
  • 7 = 2 (muncul 2 kali)
  • 8 = 1 (muncul 1 kali)

Bimodal = 5 dan 7 (masing-masing muncul 2 kali)

Data: 1, 1, 4, 6, 6, 3, 9, 9, 5, 7, 7

Frekuensi:

  • 1 = 2 (muncul 2 kali)
  • 3 = 1 (muncul 1 kali)
  • 4 = 1 (muncul 1 kali)
  • 5 = 1 (muncul 1 kali)
  • 6 = 2 (muncul 2 kali)
  • 7 = 2 (muncul 2 kali)
  • 9 = 2 (muncul 2 kali)

Multimodal = 1, 6, 7, 9 (muncul 2 kali)

Data: 2, 2, 2, 3, 3, 3

Frekuensi:

  • 2 = 3 (muncul 3 kali)
  • 3 = 3 (muncul 3 kali)

Tanpa Modus = Semua nilai muncul dengan frekuensi yang sama (3 kali)

6 Pemilihan Berdasarkan Jenis Data

6.1 Gunakan Mean ketika:

  • Data berdistribusi normal
  • Tidak ada outlier yang signifikan
  • Membutuhkan semua nilai diperhitungkan

6.2 Gunakan Median ketika:

  • Ada outlier (nilai ekstrem)
  • Data skewed (miring)
  • Data ordinal (peringkat)

6.3 Gunakan Modus ketika:

  • Data kategorikal (jenis kelamin, kategori produk)
  • Mengetahui nilai paling populer
  • Data nominal (nama, label)

6.4 Tips

  • Selalu hitung ketiganya untuk memahami karakteristik data
  • Visualisasikan dengan histogram untuk melihat distribusi
  • Pertimbangkan konteks bisnis saat memilih mana yang digunakan
  • Waspada outlier - bisa mempengaruhi mean secara signifikan

7 Visualisasi

  1. Persiapan Data dan Library

# Set the default figure size (width 10, height 7) for the knitr chunks.
knitr::opts_chunk$set(fig.width=10, fig.height=7)

# Load required libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats)
library(scales)

# Read the customer purchase data.
df <- read.csv("Central Tendency.csv")

# Drop the exported row-index column ("X") only when it is actually present,
# so a CSV saved without row names does not lose its first real column.
df <- df[, names(df) != "X", drop = FALSE]

# Categorical variables are stored as factors for counting and plotting.
factor_cols <- c("Gender", "StoreLocation", "ProductCategory", "FeedbackScore")
df[factor_cols] <- lapply(df[factor_cols], as.factor)

# Bin ages into labelled groups. The intervals are right-closed:
# (17, 25], (25, 35], (35, 45], (45, 55], (55, 70].
df$AgeGroup <- cut(
  df$Age,
  breaks = c(17, 25, 35, 45, 55, 70),
  labels = c("18-25", "26-35", "36-45", "46-55", "56-70")
)
  1. Gender Distribution (Dengan sumbu Y yang lengkap)
# Count customers per gender and express each count as a share of the total.
gender_summary <- mutate(
  count(df, Gender),
  Percentage = prop.table(n) * 100
)

# Bars ordered from the largest to the smallest group, each labelled with
# "count (percent%)" just above the bar.
gender_plot <- ggplot(
  gender_summary,
  aes(x = reorder(Gender, -n), y = n, fill = Gender)
) +
  geom_col(alpha = 0.8) +
  geom_text(
    aes(label = paste0(n, " (", round(Percentage, 1), "%)")),
    vjust = -0.5,
    size = 4,
    fontface = "bold"
  )

# Fixed colours per gender; the y axis starts at 0 and leaves 15% headroom
# above the tallest bar so the labels are not clipped.
gender_plot <- gender_plot +
  scale_fill_manual(values = c("M" = "#1f77b4", "F" = "#ff7f0e")) +
  scale_y_continuous(
    limits = c(0, max(gender_summary$n) * 1.15),
    expand = expansion(mult = c(0, 0.1))
  ) +
  labs(
    title = "DISTRIBUSI JENIS KELAMIN PELANGGAN",
    subtitle = "Persebaran pelanggan berdasarkan gender",
    x = "Jenis Kelamin",
    y = "Jumlah Pelanggan"
  )

# Minimal theme with centred titles and no legend.
gender_plot <- gender_plot +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 10),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 11),
    legend.position = "none"
  )

print(gender_plot)

  1. Store Location Distribution (Sumbu Y lengkap)
# Customers per store location, largest first, with each location's share.
location_counts <- arrange(count(df, StoreLocation), desc(n))
location_summary <- mutate(location_counts, Percentage = n / sum(n) * 100)

# One bar per location, ordered by descending count and labelled with
# "count (percent%)".
location_plot <- ggplot(
  location_summary,
  aes(x = reorder(StoreLocation, -n), y = n, fill = StoreLocation)
) +
  geom_col(alpha = 0.8) +
  geom_text(
    aes(label = paste0(n, " (", round(Percentage, 1), "%)")),
    vjust = -0.5,
    size = 4,
    fontface = "bold"
  ) +
  scale_fill_brewer(palette = "Set2") +
  # Start at 0 and keep 15% headroom above the tallest bar for the labels.
  scale_y_continuous(
    limits = c(0, max(location_summary$n) * 1.15),
    expand = expansion(mult = c(0, 0.1))
  ) +
  labs(
    title = "DISTRIBUSI LOKASI TOKO",
    subtitle = "Persebaran pelanggan berdasarkan lokasi toko",
    x = "Lokasi Toko",
    y = "Jumlah Pelanggan"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 10),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 11)
  )

print(location_plot)

  1. Product Category Distribution (Sumbu Y lengkap)
# Purchases per product category, largest first, with each category's share.
category_summary <- df %>%
  count(ProductCategory) %>%
  arrange(desc(n)) %>%
  mutate(Percentage = prop.table(n) * 100)

# Upper y limit: 20% above the most frequent category, so labels fit.
category_plot <- ggplot(
  category_summary,
  aes(x = reorder(ProductCategory, -n), y = n, fill = ProductCategory)
) +
  geom_col(alpha = 0.8) +
  geom_text(
    aes(label = paste0(n, " (", round(Percentage, 1), "%)")),
    vjust = -0.5,
    size = 4,
    fontface = "bold"
  ) +
  scale_fill_brewer(palette = "Set3") +
  scale_y_continuous(
    limits = c(0, max(category_summary$n) * 1.2),
    expand = expansion(mult = c(0, 0.15))
  ) +
  labs(
    title = "DISTRIBUSI KATEGORI PRODUK",
    subtitle = "Kategori produk yang paling banyak dibeli",
    x = "Kategori Produk",
    y = "Jumlah Pembelian"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 10),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 10),
    # Tilt the category names so they do not overlap.
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(category_plot)

  1. Feedback Score Distribution (Sumbu Y lengkap)
# Customers per feedback score; the score is re-levelled as 1..5 so the bars
# keep the natural score order on the x axis.
feedback_summary <- mutate(
  count(df, FeedbackScore),
  FeedbackScore = factor(FeedbackScore, levels = 1:5),
  Percentage = n / sum(n) * 100
)

# Bars in score order, each labelled with "count (percent%)".
feedback_plot <- ggplot(
  feedback_summary,
  aes(x = FeedbackScore, y = n, fill = FeedbackScore)
) +
  geom_col(alpha = 0.8) +
  geom_text(
    aes(label = paste0(n, " (", round(Percentage, 1), "%)")),
    vjust = -0.5,
    size = 4,
    fontface = "bold"
  )

# Reversed "RdYlGn" palette; y axis from 0 with 15% headroom for the labels.
feedback_plot <- feedback_plot +
  scale_fill_brewer(palette = "RdYlGn", direction = -1) +
  scale_y_continuous(
    limits = c(0, max(feedback_summary$n) * 1.15),
    expand = expansion(mult = c(0, 0.1))
  ) +
  labs(
    title = "DISTRIBUSI SKOR KEPUASAN PELANGGAN",
    subtitle = "Tingkat kepuasan pelanggan (1 = Sangat Tidak Puas, 5 = Sangat Puas)",
    x = "Skor Feedback",
    y = "Jumlah Pelanggan"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 10),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 11)
  )

print(feedback_plot)

  1. Average Total Purchase by Category (Sumbu Y lengkap)
# Mean total purchase per product category, highest average first.
purchase_by_category <- group_by(df, ProductCategory)
avg_purchase <- arrange(
  summarise(purchase_by_category, AvgPurchase = mean(TotalPurchase)),
  desc(AvgPurchase)
)

# Bars ordered by descending average, labelled with the rounded dollar value.
avg_purchase_plot <- ggplot(
  avg_purchase,
  aes(
    x = reorder(ProductCategory, -AvgPurchase),
    y = AvgPurchase,
    fill = ProductCategory
  )
) +
  geom_col(alpha = 0.8) +
  geom_text(
    aes(label = paste0("$", round(AvgPurchase, 1))),
    vjust = -0.5,
    size = 4,
    fontface = "bold"
  ) +
  scale_fill_brewer(palette = "Set1") +
  # Dollar-formatted y axis from 0 with 15% headroom for the bar labels.
  scale_y_continuous(
    limits = c(0, max(avg_purchase$AvgPurchase) * 1.15),
    expand = expansion(mult = c(0, 0.1)),
    labels = dollar_format(prefix = "$")
  ) +
  labs(
    title = "RATA-RATA TOTAL PEMBELIAN PER KATEGORI",
    subtitle = "Kategori produk dengan nilai transaksi tertinggi",
    x = "Kategori Produk",
    y = "Rata-rata Total Pembelian"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 10),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(avg_purchase_plot)

  1. Product Category by Gender (Stacked dengan sumbu Y lengkap)
# Purchases per (category, gender) pair, plus the category total and the
# gender share within each category; categories with most purchases first.
category_gender <- df %>%
  count(ProductCategory, Gender) %>%
  group_by(ProductCategory) %>%
  mutate(
    Total = sum(n),
    Percentage = n / sum(n) * 100
  ) %>%
  ungroup() %>%
  arrange(desc(Total))

# Stacked bars per category; each segment is labelled at its centre with the
# count and the within-category percentage.
category_gender_plot <- ggplot(
  category_gender,
  aes(x = reorder(ProductCategory, -Total), y = n, fill = Gender)
) +
  geom_col(position = "stack", alpha = 0.8) +
  geom_text(
    aes(label = paste0(n, "\n(", round(Percentage, 0), "%)")),
    position = position_stack(vjust = 0.5),
    size = 3.2,
    color = "white",
    fontface = "bold"
  ) +
  scale_fill_manual(values = c("M" = "#1f77b4", "F" = "#ff7f0e")) +
  # The tallest stack equals the largest category total; keep 10% headroom.
  scale_y_continuous(
    limits = c(0, max(category_gender$Total) * 1.1),
    expand = expansion(mult = c(0, 0.05))
  ) +
  labs(
    title = "DISTRIBUSI KATEGORI PRODUK BERDASARKAN GENDER",
    subtitle = "Preferensi pembelian berdasarkan jenis kelamin",
    x = "Kategori Produk",
    y = "Jumlah Pembelian",
    fill = "Jenis Kelamin"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 10),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(category_gender_plot)

  1. Store Performance Analysis (Dual axis dengan sumbu Y lengkap)
# Per-store summary: number of customers plus average purchase, visits and
# feedback score, ordered by descending customer count.
store_performance <- df %>%
  group_by(StoreLocation) %>%
  summarise(
    TotalCustomers = n(),
    AvgPurchase = mean(TotalPurchase),
    AvgVisits = mean(NumberOfVisits),
    # Go through character so the score values, not level codes, are averaged.
    AvgFeedback = mean(as.numeric(as.character(FeedbackScore)))
  ) %>%
  arrange(desc(TotalCustomers))

# Ratio used to draw the average purchase on the customer-count axis and to
# map that axis back to dollars on the secondary axis.
scale_factor <- max(store_performance$AvgPurchase) /
  max(store_performance$TotalCustomers)

# Bars show the customer count (left axis); the line and points show the
# average purchase, rescaled by `scale_factor` (right axis).
performance_plot <- ggplot(
  store_performance,
  aes(x = reorder(StoreLocation, -TotalCustomers))
) +
  geom_bar(
    aes(y = TotalCustomers, fill = "Total Pelanggan"),
    stat = "identity", alpha = 0.7, width = 0.6
  ) +
  # `linewidth` replaces the `size` aesthetic for lines, which is deprecated
  # since ggplot2 3.4.0 and produced a warning on every render.
  geom_line(
    aes(y = AvgPurchase / scale_factor, group = 1,
        color = "Rata-rata Pembelian"),
    linewidth = 1.5
  ) +
  geom_point(
    aes(y = AvgPurchase / scale_factor, color = "Rata-rata Pembelian"),
    size = 3.5
  ) +
  # Customer-count label, placed above the bar.
  geom_text(
    aes(y = TotalCustomers, label = TotalCustomers),
    vjust = -0.8, size = 4, fontface = "bold", color = "darkblue"
  ) +
  # Average-purchase label, placed below the point.
  geom_text(
    aes(y = AvgPurchase / scale_factor,
        label = paste0("$", round(AvgPurchase, 0))),
    vjust = 1.5, size = 3.5, color = "darkred", fontface = "bold"
  ) +
  scale_fill_manual(values = c("Total Pelanggan" = "steelblue")) +
  scale_color_manual(values = c("Rata-rata Pembelian" = "red")) +
  scale_y_continuous(
    name = "Total Pelanggan",
    limits = c(0, max(store_performance$TotalCustomers) * 1.2),
    sec.axis = sec_axis(~ . * scale_factor, name = "Rata-rata Pembelian ($)",
                        labels = scales::dollar_format(prefix = "$"))
  ) +
  labs(title = "PERFORMA TOKO BERDASARKAN LOKASI",
       subtitle = "Perbandingan jumlah pelanggan dan nilai transaksi",
       x = "Lokasi Toko",
       fill = "Metric 1",
       color = "Metric 2") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    plot.subtitle = element_text(hjust = 0.5, size = 12, margin = margin(b = 15)),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12, face = "bold"),
    axis.title.y.left = element_text(color = "darkblue", margin = margin(r = 10)),
    axis.title.y.right = element_text(color = "darkred", margin = margin(l = 10)),
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.margin = margin(t = 10),
    legend.text = element_text(size = 11),
    legend.title = element_text(face = "bold"),
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_blank()
  ) +
  # Show the bar legend first, then the line legend, titles on top.
  guides(
    fill = guide_legend(order = 1, title.position = "top"),
    color = guide_legend(order = 2, title.position = "top")
  )
print(performance_plot)

  1. Export dengan Setting yang Optimal
# Write every plot to one multi-page PDF, one plot per page.
# The plots are collected in a list so the reported total is derived from
# what is actually exported instead of being hard-coded.
plots_to_export <- list(
  gender_plot,
  location_plot,
  category_plot,
  feedback_plot,
  avg_purchase_plot,
  category_gender_plot,
  performance_plot
)

pdf("Customer_Analysis_Complete_Plots.pdf", width = 14, height = 10)

for (plot_obj in plots_to_export) {
  print(plot_obj)
}

dev.off()
## png 
##   2
cat("semua plot telah disimpan dalam file: Customer_Analysis_Complete_Plots.pdf\n")
## semua plot telah disimpan dalam file: Customer_Analysis_Complete_Plots.pdf
# No grid layout is produced, so only the individual plots are reported.
cat("Total plot yang dihasilkan: ", length(plots_to_export),
    " individual plots\n", sep = "")
## Total plot yang dihasilkan: 7 individual plots

7.1 Analisis Statistik Deskriptif

# Ringkasan Statistik Numerik dengan Tabel yang Lebih Baik
library(knitr)
library(kableExtra)  # Pastikan package ini diinstall
## Warning: package 'kableExtra' was built under R version 4.5.2
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Descriptive statistics (mean, median, mode, sd, min, max) for the four
# numeric variables, one row per variable. Built inside local() so the
# helpers do not leak into the global environment.
summary_stats <- local({
  # Columns in the same order as the row labels below.
  numeric_cols <- data[c("Age", "TotalPurchase", "NumberOfVisits",
                         "FeedbackScore")]

  # Apply `stat` to every column and return a plain, unnamed vector.
  per_column <- function(stat) {
    unlist(lapply(numeric_cols, stat), use.names = FALSE)
  }

  # Most frequent value; which.max() keeps the first one when counts tie.
  first_mode <- function(x) {
    as.numeric(names(which.max(table(x))))
  }

  data.frame(
    Variabel = c("Usia (Tahun)", "Total Pembelian ($)", "Jumlah Kunjungan", "Skor Feedback (1-5)"),
    Mean = round(per_column(mean), 2),
    Median = per_column(median),
    Modus = per_column(first_mode),
    SD = round(per_column(sd), 2),
    Min = per_column(min),
    Max = per_column(max)
  )
})

# Print a heading, then render the summary as a styled table: dark header
# row, bold first column and alternating row backgrounds.
cat("=== STATISTIK DESKRIPTIF UTAMA ===\n\n")
## === STATISTIK DESKRIPTIF UTAMA ===
kable(
  summary_stats,
  caption = "Ringkasan Statistik Deskriptif Variabel Numerik",
  col.names = c("Variabel", "Rata-rata", "Median", "Modus", "Std. Dev", "Minimum", "Maximum"),
  # First column left-aligned, the six statistic columns centred.
  align = c("l", rep("c", 6))
) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "center",
    font_size = 14
  ) %>%
  column_spec(1, bold = TRUE, width = "3cm") %>%
  column_spec(2:7, width = "2cm") %>%
  row_spec(0, bold = TRUE, color = "white", background = "#2C3E50") %>%
  # Odd and even data rows get alternating backgrounds.
  row_spec(c(1, 3), background = "#ECF0F1") %>%
  row_spec(c(2, 4), background = "#F8F9F9")
Ringkasan Statistik Deskriptif Variabel Numerik
Variabel Rata-rata Median Modus Std. Dev Minimum Maximum
Usia (Tahun) 39.99 39.0 18 13.38 18 70
Total Pembelian ($) 211.79 108.5 33 196.06 11 1128
Jumlah Kunjungan 5.16 5.0 5 2.10 1 11
Skor Feedback (1-5) 2.80 3.0 1 1.46 1 5

8 Interpretasi Statistik

Usia: Rata-rata 39.99 tahun dengan median 39 tahun; usia yang paling sering muncul adalah 18 tahun (modus), sehingga kelompok pelanggan termuda cukup menonjol

Total Pembelian: Rata-rata $211.79, namun median hanya $108.5; selisih ini menunjukkan ada pembelian besar yang menarik mean ke atas

Jumlah Kunjungan: Rata-rata 5 kali, cukup konsisten

Skor Feedback: Rata-rata 2.80 (cenderung rendah), perlu perbaikan layanan

8.1 Visualisasi 1: Distribusi Usia

# Histogram of customer age with the mean and median marked.
library(ggplot2)

# Compute the reference values once so the lines and the subtitle always
# agree with the data instead of relying on hard-coded numbers.
age_mean <- mean(data$Age)
age_median <- median(data$Age)

p1 <- ggplot(data, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black", alpha = 0.7) +
  # Constant intercepts draw a single line each; `linewidth` replaces the
  # `size` aesthetic, which is deprecated for lines since ggplot2 3.4.0.
  geom_vline(xintercept = age_mean, color = "red", linetype = "dashed",
             linewidth = 1) +
  geom_vline(xintercept = age_median, color = "blue", linetype = "dashed",
             linewidth = 1) +
  labs(title = "DISTRIBUSI USIA PELANGGAN",
       subtitle = paste0("Garis merah: Rata-rata (", round(age_mean, 2),
                         " tahun) | Garis biru: Median (", age_median,
                         " tahun)"),
       x = "Usia (Tahun)", y = "Jumlah Pelanggan") +
  theme_minimal() +
  annotate("text", x = 50, y = 25,
           label = "Puncak di usia 18-25 tahun\n(Generasi Z/Milenial Muda)",
           color = "darkblue", size = 3.5)

print(p1)

8.2 Interpretasi Visual 1

Distribusi: Multimodal dengan beberapa puncak

Segmentasi Usia Dominan:

18-25 tahun: Generasi muda (puncak tertinggi)

40-45 tahun: Middle-age professionals

55-65 tahun: Baby boomers

Implikasi Bisnis: Diperlukan strategi marketing yang berbeda untuk tiap generasi

8.3 Visualisasi 2: Distribusi Total Pembelian

# Histogram of total purchase with the mean and median marked.

# Compute the reference values once so the lines and the subtitle always
# agree with the data instead of relying on hard-coded numbers.
purchase_mean <- mean(data$TotalPurchase)
purchase_median <- median(data$TotalPurchase)

p2 <- ggplot(data, aes(x = TotalPurchase)) +
  geom_histogram(bins = 30, fill = "lightgreen", color = "black", alpha = 0.7) +
  # Constant intercepts draw a single line each; `linewidth` replaces the
  # `size` aesthetic, which is deprecated for lines since ggplot2 3.4.0.
  geom_vline(xintercept = purchase_mean, color = "red", linetype = "dashed",
             linewidth = 1) +
  geom_vline(xintercept = purchase_median, color = "blue", linetype = "dashed",
             linewidth = 1) +
  labs(title = "DISTRIBUSI TOTAL PEMBELIAN",
       subtitle = paste0("Garis merah: Rata-rata ($", round(purchase_mean, 2),
                         ") | Garis biru: Median ($", purchase_median, ")"),
       x = "Total Pembelian ($)", y = "Frekuensi") +
  theme_minimal() +
  annotate("text", x = 800, y = 15,
           label = "Mayoritas pembelian: $0-400\nBeberapa pembelian sangat besar (>$800)",
           color = "darkgreen", size = 3.5)

print(p2)

8.4 Interpretasi Visual 2

Pola Spending:

75% pelanggan: Belanja tidak lebih dari sekitar $381 (kuartil ketiga)

Big Spenders: Beberapa pelanggan belanja >$600 (outlier positif)

Segmentasi Customer:

Small Spenders: <$100 (banyak pelanggan baru?)

Medium Spenders: $100-400 (pelanggan reguler)

Big Spenders: >$400 (pelanggan VIP)

8.5 Visual 2: Segmentasi Spending Pelanggan

# Spending segments, in display order. Defined once so the case_when labels
# and the factor levels cannot drift apart.
segment_levels <- c("Small (<$100)", "Medium ($100-300)",
                    "Large ($300-500)", "VIP (>$500)")

# Assign every customer to a segment by total purchase, then count customers
# per segment and build the "count (percent%)" bar labels.
spending_segments <- data %>%
  mutate(SpendingSegment = case_when(
    TotalPurchase < 100 ~ segment_levels[1],
    TotalPurchase < 300 ~ segment_levels[2],
    TotalPurchase < 500 ~ segment_levels[3],
    TRUE ~ segment_levels[4]
  )) %>%
  count(SpendingSegment) %>%
  mutate(Percentage = n / sum(n) * 100,
         Label = paste0(n, " (", round(Percentage, 1), "%)"),
         # Ordered factor so the bars run from Small to VIP.
         Order = factor(SpendingSegment, levels = segment_levels))

# Upper y limit with 25% headroom so the labels above the bars fit.
y_max <- max(spending_segments$n) * 1.25

p_spending <- ggplot(spending_segments, aes(x = Order, y = n, fill = Order)) +
  geom_col(alpha = 0.9, width = 0.7) +
  geom_text(aes(label = Label), vjust = -0.8, size = 4.5, fontface = "bold",
            color = "black") +
  scale_fill_manual(values = c("#E74C3C", "#F39C12", "#2ECC71", "#3498DB")) +
  labs(title = "SEGMENTASI TOTAL PEMBELIAN",
       subtitle = "Klasifikasi Pelanggan Berdasarkan Spending Power",
       x = "Segment Spending", y = "Jumlah Pelanggan",
       # paste0() avoids the stray spaces paste() puts after "$"; the median
       # keeps one decimal so a value such as 108.5 is not shown as 108.
       caption = paste0("Rata-rata: $", round(mean(data$TotalPurchase), 0),
                        " | Median: $", round(median(data$TotalPurchase), 1))) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 16, hjust = 0.5,
                                  margin = margin(b = 10)),
        plot.subtitle = element_text(hjust = 0.5, size = 12,
                                     margin = margin(b = 15)),
        axis.text = element_text(size = 11),
        plot.margin = margin(20, 20, 20, 20),
        panel.grid.minor = element_blank()) +
  ylim(0, y_max)  # y limit derived from the data

print(p_spending)

INSIGHT:

Small Spenders: 45.5% (91 pelanggan) - Focus on upsell

Medium Spenders: 23.5% (47 pelanggan) - Main revenue stream

Large Spenders: 18% (36 pelanggan) - Loyal customers

VIP Spenders: 13% (26 pelanggan) - High value, perlu special treatment

library(ggplot2)
library(dplyr)

# --- Age values from the CSV data set, as a one-column data frame ---
data_sym <- data.frame(value = data$Age)

# --- Mean, median and mode of the age values ---
mean_val <- mean(data_sym[["value"]])
median_val <- median(data_sym[["value"]])

# Mode: tabulate the ages rounded to whole numbers, sort the counts in
# decreasing order and take the value with the highest count.
mode_val <- local({
  age_counts <- table(round(data_sym[["value"]], 0))
  ranked_counts <- sort(age_counts, decreasing = TRUE)
  as.numeric(names(ranked_counts)[1])
})

# --- Visualization (Histogram + Density) ---
ggplot(data_sym, aes(x = value)) +
  geom_histogram(aes(y = after_stat(density)), 
                 binwidth = 5, 
                 fill = "#4ECDC4", 
                 color = "white", 
                 alpha = 0.8) +
  geom_density(color = "#2C3E50", linewidth = 1.5, alpha = 0.7) +
  geom_vline(aes(xintercept = mean_val, color = "Mean"), linewidth = 2) +
  geom_vline(aes(xintercept = median_val, color = "Median"), 
             linewidth = 2, linetype = "dashed") +
  geom_vline(aes(xintercept = mode_val, color = "Mode"), 
             linewidth = 2, linetype = "dotdash") +
  scale_color_manual(
    name = "UKURAN PEMUSATAN:",
    values = c("Mean" = "#E74C3C", 
               "Median" = "#F39C12", 
               "Mode" = "#3498DB")
  ) +
  labs(
    title = "DISTRIBUSI USIA PELANGGAN",
    subtitle = "Analisis Central Tendency - Mean, Median, dan Mode",
    x = "USIA (TAHUN)",
    y = "KEPADATAN DISTRIBUSI",
    caption = paste(
      " STATISTIK:\n",
      "• Mean (Rata-rata):", round(mean_val,1), "tahun\n",
      "• Median (Nilai Tengah):", median_val, "tahun\n", 
      "• Mode (Nilai Paling Sering):", mode_val, "tahun"
    )
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 18, color = "#2C3E50"),
    plot.subtitle = element_text(hjust = 0.5, size = 14, color = "#7F8C8D", margin = margin(b = 15)),
    plot.caption = element_text(size = 11, color = "#34495E", lineheight = 1.4, hjust = 0),
    axis.title = element_text(face = "bold", size = 12),
    axis.text = element_text(size = 11),
    legend.position = "top",
    legend.title = element_text(face = "bold", size = 12),
    legend.text = element_text(size = 11),
    legend.box = "horizontal",
    legend.key.size = unit(1, "cm"),
    plot.margin = margin(20, 30, 30, 20),
    panel.grid.major = element_line(color = "#ECF0F1"),
    panel.grid.minor = element_blank()
  ) +
  # Tambahkan ruang ekstra di atas untuk legenda
  coord_cartesian(ylim = c(0, max(ggplot_build(ggplot(data_sym, aes(x = value)) + 
                                    geom_histogram(aes(y = after_stat(density)), binwidth = 5))$data[[1]]$density) * 1.2))

library(ggplot2)
library(dplyr)

# --- Right-skewed data: Total Purchase from the CSV dataset ---
data_skew <- data.frame(value = data$TotalPurchase)

# --- Compute Mean, Median, Mode ---
mean_val <- mean(data_skew$value)
median_val <- median(data_skew$value)
# Mode = most frequent purchase amount after rounding to a whole number.
# When several amounts tie for the highest count, only the first entry of
# the sorted frequency table is reported.
mode_val <- as.numeric(names(sort(table(round(data_skew$value, 0)),
                                  decreasing = TRUE)[1]))

# --- Visualization (Histogram + Density) ---
# NOTE(review): the title, subtitle and annotation text below assert a right
# skew and "Mean > Median" as fixed strings; they are not derived from
# mean_val / median_val, so confirm they still hold if the data changes.
ggplot(data_skew, aes(x = value)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 30,
                 fill = "#E74C3C",
                 color = "white",
                 alpha = 0.8) +
  geom_density(color = "#2C3E50", linewidth = 1.5, alpha = 0.7) +
  geom_vline(aes(xintercept = mean_val, color = "Mean"), linewidth = 2) +
  geom_vline(aes(xintercept = median_val, color = "Median"),
             linewidth = 2, linetype = "dashed") +
  geom_vline(aes(xintercept = mode_val, color = "Mode"),
             linewidth = 2, linetype = "dotdash") +
  scale_color_manual(
    name = "UKURAN PEMUSATAN:",
    values = c("Mean" = "#F39C12",
               "Median" = "#3498DB",
               "Mode" = "#9B59B6")
  ) +
  labs(
    title = "DISTRIBUSI TOTAL PEMBELIAN - RIGHT SKEWED",
    subtitle = "Mean tertarik ke kanan karena pengaruh nilai ekstrem (pembelian besar)",
    x = "TOTAL PEMBELIAN ($)",
    y = "KEPADATAN DISTRIBUSI",
    # paste0() with explicit spacing: paste() separates its arguments with a
    # space, which put a gap between "$" and each amount and left a trailing
    # space before every line break.
    caption = paste0(
      " PERBANDINGAN UKURAN PEMUSATAN:\n",
      " • Mean (Rata-rata): $", round(mean_val, 0), "\n",
      " • Median (Nilai Tengah): $", median_val, "\n",
      " • Mode (Nilai Paling Sering): $", mode_val, "\n",
      " • Selisih Mean-Median: $", round(mean_val - median_val, 0),
      " (indikasi skewness ke kanan)"
    )
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 18, color = "#2C3E50"),
    plot.subtitle = element_text(hjust = 0.5, size = 14, color = "#7F8C8D", margin = margin(b = 15)),
    plot.caption = element_text(size = 11, color = "#34495E", lineheight = 1.4, hjust = 0),
    axis.title = element_text(face = "bold", size = 12),
    axis.text = element_text(size = 11),
    legend.position = "top",
    legend.title = element_text(face = "bold", size = 12),
    legend.text = element_text(size = 11),
    plot.margin = margin(20, 30, 30, 20),
    panel.grid.major = element_line(color = "#ECF0F1"),
    panel.grid.minor = element_blank()
  ) +
  # Annotation explaining the skewness, placed just right of the mean line
  annotate("text", x = mean_val + 50, y = 0.002,
           label = "Mean > Median\nDistribusi Miring Kanan",
           color = "#F39C12", fontface = "bold", size = 4) +
  # Show the x-axis from 0 to 5% past the largest purchase. The range is set
  # on the coordinate system rather than as scale limits: scale limits
  # discard histogram bars whose edges cross them (the "Removed 2 rows ...
  # (`geom_bar()`)" warning), whereas coord_cartesian() only zooms the view.
  coord_cartesian(xlim = c(0, max(data_skew$value) * 1.05))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

