Bab 1 VISUALISASI DESKRIPTIF

## C:\Users\HP\AppData\Local\Programs\Python\PYTHON~1\python.exe
library(DT)

# Read the dummy transaction data and show it as an interactive table
# with ten rows per page.
data <- read.csv(file = "data/data-dummy.csv")
DT::datatable(data = data, options = list(pageLength = 10))

1.1 Numerical Data

1.1.1 Boxplot (Python)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the data set
data = pd.read_csv("data/data-dummy.csv")

# Work only with the non-missing values of the 'Quantity' column
quantity = data['Quantity'].dropna()

# Summary statistics used for the annotations
Q1 = quantity.quantile(0.25)
Q3 = quantity.quantile(0.75)
IQR = Q3 - Q1
mean = quantity.mean()
min_val = quantity.min()
max_val = quantity.max()

# Values further than 1.5 * IQR from the quartiles are flagged as outliers
is_outlier = (quantity < Q1 - 1.5 * IQR) | (quantity > Q3 + 1.5 * IQR)
outliers = quantity[is_outlier]
inliers = quantity[~is_outlier]

# Plot
plt.figure(figsize=(10, 6))
# fliersize=0 hides the boxplot's own outlier markers; the strip plots draw the points
sns.boxplot(y=quantity, color="skyblue", width=0.4, fliersize=0)
# Regular points in black and outliers in red, so each observation is drawn exactly once
sns.stripplot(y=inliers, color='black', size=3, alpha=0.5, jitter=True)
if not outliers.empty:
    sns.stripplot(y=outliers, color='red', size=5, jitter=True)

# Horizontal dashed line marking the mean
plt.axhline(mean, color='blue', linestyle='--', linewidth=1)

# Annotations
def annotate_text(y, text, color, bold=False, italic=False):
    """Write `text` at height `y` (data coordinates), at x = 1.2.

    Parameters:
        y: vertical position of the label, in data coordinates.
        text: label to draw.
        color: matplotlib colour of the label.
        bold: draw the label in bold when True.
        italic: draw the label in italics when True.
    """
    plt.text(
        1.2, y, text,
        fontsize=10,
        color=color,
        ha='left',
        fontstyle='italic' if italic else 'normal',
        fontweight='bold' if bold else 'normal'
    )

annotate_text(Q1, f"Q1: {Q1:.1f}", 'green')
annotate_text(Q3, f"Q3: {Q3:.1f}", 'green')
annotate_text(mean, f"Mean: {mean:.2f}", 'blue', bold=True)
annotate_text(min_val, f"Min: {min_val}", 'orange')
annotate_text(max_val, f"Max: {max_val}", 'orange')

# Outlier count: anchor it at the largest outlier, or at the maximum when there
# are no outliers (outliers.max() would be NaN). The label is shifted up by a
# few points so it does not sit on top of the "Max"/"Min" label when the
# anchor coincides with one of them.
outlier_anchor = outliers.max() if not outliers.empty else max_val
plt.annotate(
    f"Outliers: {len(outliers)}",
    xy=(1.2, outlier_anchor),
    xytext=(0, 14),
    textcoords='offset points',
    fontsize=10,
    color='red',
    ha='left',
    fontweight='bold',
    annotation_clip=False  # the anchor lies outside the x-limits; draw it anyway
)

# Styling
plt.title("Boxplot of Quantity with Jitter", fontsize=16, weight='bold')
plt.ylabel("Quantity")
plt.xticks([])  # Hide the x-axis ticks: there is only one variable
## ([], [])
plt.tight_layout()
plt.show()

1.1.2 Violin Plot

# ==============================
# 1. Import Library
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------
# Load and prepare the data
# ------------------------------
# Replace with the path to your own CSV file
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Force Quantity to numeric; values that cannot be parsed become NaN and are dropped
data_bisnis["Quantity"] = pd.to_numeric(data_bisnis["Quantity"], errors="coerce")
data_bisnis = data_bisnis.dropna(subset=["Quantity"])

# Statistics shown in the annotations
qty = data_bisnis["Quantity"]
Q1 = qty.quantile(0.25)
Q3 = qty.quantile(0.75)
IQR = Q3 - Q1
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR
mean_val = qty.mean()
median_val = qty.median()
min_val = qty.min()
max_val = qty.max()

# Flag every row as "Outlier" or "Normal" using the 1.5 * IQR fences
outlier_mask = (qty > upper_whisker) | (qty < lower_whisker)
outliers_count = outlier_mask.sum()
data_bisnis["is_outlier"] = outlier_mask.map({True: "Outlier", False: "Normal"})

# ------------------------------
# Violin + box + jittered points + annotations
# ------------------------------
# A constant "dummy" column gives the three layers a shared single category on x
plot_df = data_bisnis.assign(dummy=1)

plt.figure(figsize=(16, 10))
sns.violinplot(
    data=plot_df, x="dummy", y="Quantity",
    inner=None, color="skyblue"
)
sns.boxplot(
    data=plot_df, x="dummy", y="Quantity",
    width=0.2, showcaps=True, boxprops={'facecolor': 'none'},
    showfliers=False
)
sns.stripplot(
    data=plot_df, x="dummy", y="Quantity",
    hue="is_outlier", palette={"Normal": "black", "Outlier": "red"},
    jitter=0.15, size=4, alpha=0.7, dodge=False
)

# Statistic labels: (height, text, colour, font size, font weight)
stat_labels = [
    (mean_val, f"Mean: {mean_val:.2f}", "blue", 14, "bold"),
    (Q1, f"Q1: {Q1:.2f}", "green", 13, "normal"),
    (median_val, f"Median: {median_val:.2f}", "purple", 13, "normal"),
    (Q3, f"Q3: {Q3:.2f}", "green", 13, "normal"),
    (min_val, f"Min: {min_val:.2f}", "orange", 13, "normal"),
    (max_val, f"Max: {max_val:.2f}", "orange", 13, "normal"),
]
for height, label, label_color, label_size, label_weight in stat_labels:
    plt.text(1.1, height, label, color=label_color, fontsize=label_size, fontweight=label_weight)
plt.text(1, max_val + 0.3, f"Outliers: {outliers_count}", color="red", fontsize=14, style="italic", ha="center")

# Title and appearance
plt.title("Violin Plot of Quantity with Outlier Highlighted", fontsize=24, weight="bold")
plt.ylabel("Quantity", fontsize=18)
plt.xticks([], [])  # hide the x-axis: it only carries the dummy category
## ([], [])
plt.yticks(fontsize=14)
## (array([-2.,  0.,  2.,  4.,  6.,  8., 10., 12.]), [Text(0, -2.0, '−2'), Text(0, 0.0, '0'), Text(0, 2.0, '2'), Text(0, 4.0, '4'), Text(0, 6.0, '6'), Text(0, 8.0, '8'), Text(0, 10.0, '10'), Text(0, 12.0, '12')])
plt.legend(title="Point Type", title_fontsize=15, fontsize=13, loc="upper right")
sns.despine()
plt.tight_layout()
plt.show()

1.2 Combo

1.2.1 Grouped Bar Chart

# ==============================
# 1. Import Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------
# Load the data
# ------------------------------
data_bisnis = pd.read_csv("data/data-dummy.csv")

# ------------------------------
# Total sales per (product category, region) pair
# ------------------------------
by_category_region = data_bisnis.groupby(['Product_Category', 'Region'], as_index=False)
sales_summary = by_category_region.agg(Total_Sales=('Total_Price', 'sum'))

# ------------------------------
# Grouped bar chart: one bar per region inside each category
# ------------------------------
plt.figure(figsize=(12, 6))
sns.barplot(
    x="Product_Category",
    y="Total_Sales",
    hue="Region",
    data=sales_summary,
    dodge=True
)

plt.title("Total Sales by Product Category and Region", fontsize=16, fontweight="bold")
plt.xlabel("Product Category", fontsize=14)
plt.ylabel("Total Sales (USD)", fontsize=14)
plt.xticks(rotation=45, ha='right')
## ([0, 1, 2, 3, 4], [Text(0, 0, 'Books'), Text(1, 0, 'Clothing'), Text(2, 0, 'Electronics'), Text(3, 0, 'Groceries'), Text(4, 0, 'Home')])
plt.legend(title="Region", title_fontsize=12)
sns.despine()
plt.tight_layout()
plt.show()

1.2.2 Ridgeline Plot

# ==============================
# 1. Load Libraries
# ==============================
import pandas as pd
import numpy as np  # needed for np.isfinite below; do not rely on an earlier chunk's import
import matplotlib.pyplot as plt
from joypy import joyplot
import seaborn as sns
import matplotlib.ticker as ticker

# ==============================
# Load and filter the data
# ==============================
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Convert Price_per_Unit to numeric (unparseable values become NaN) and keep only
# finite values (no NaN, Inf or -Inf). The converted column is stored in the
# filtered frame so that the plot always receives numeric data.
price = pd.to_numeric(data_bisnis['Price_per_Unit'], errors='coerce')
data_bisnis_filtered = data_bisnis.assign(Price_per_Unit=price).loc[np.isfinite(price)]

# ==============================
# Ridgeline plot
# ==============================
# joyplot() creates its own figure of the requested size, so no plt.figure()
# call is made beforehand (it would only leave an extra, empty figure behind).
fig, axes = joyplot(
    data_bisnis_filtered,
    by="Region",
    column="Price_per_Unit",
    figsize=(16, 10),
    fade=True,
    alpha=0.7,
    colormap=plt.cm.tab10,
    linewidth=1
)

plt.title("Distribution of Price per Unit by Region", fontsize=24, weight="bold")
plt.xlabel("Price per Unit", fontsize=18)
plt.ylabel("Region", fontsize=18)

# Rupiah tick labels (e.g. Rp1.000)
def rupiah_format(x, pos):
    """Format tick value `x` as whole Rupiah with '.' as the thousands separator.

    `pos` is the tick position supplied by matplotlib's FuncFormatter; it is unused.
    """
    return f"Rp{int(x):,}".replace(",", ".")

plt.gca().xaxis.set_major_formatter(ticker.FuncFormatter(rupiah_format))
plt.xticks(fontsize=14)
## (array([-5.,  0.,  5., 10., 15., 20., 25., 30.]), [Text(-5.0, 0, 'Rp-5'), Text(0.0, 0, 'Rp0'), Text(5.0, 0, 'Rp5'), Text(10.0, 0, 'Rp10'), Text(15.0, 0, 'Rp15'), Text(20.0, 0, 'Rp20'), Text(25.0, 0, 'Rp25'), Text(30.0, 0, 'Rp30')])
plt.yticks(fontsize=14)
## (array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]), [Text(0, 0.0, '0.0'), Text(0, 0.2, '0.2'), Text(0, 0.4, '0.4'), Text(0, 0.6000000000000001, '0.6'), Text(0, 0.8, '0.8'), Text(0, 1.0, '1.0')])
#plt.tight_layout()
plt.show()

1.2.3 Boxplot by Category

# ==============================
# 1. Import Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------
# Prepare the data
# ------------------------------
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Force Quantity to numeric and drop the rows where it is missing
data_bisnis['Quantity'] = pd.to_numeric(data_bisnis['Quantity'], errors='coerce')
data_bisnis = data_bisnis.dropna(subset=['Quantity'])

# ------------------------------
# One box per product category
# ------------------------------
plt.figure(figsize=(18, 10))  # large canvas so the big fonts stay readable

# Outliers are drawn as red circles
outlier_style = dict(marker='o', markerfacecolor='red', markersize=8)
sns.boxplot(
    x="Product_Category",
    y="Quantity",
    data=data_bisnis,
    palette="Set2",
    showfliers=True,
    fliersize=6,
    flierprops=outlier_style
)

plt.title("Boxplot of Quantity by Product Category", fontsize=30, weight='bold')
plt.xlabel("Product Category", fontsize=25)
plt.ylabel("Quantity", fontsize=25)
plt.xticks(fontsize=20, rotation=0)
## ([0, 1, 2, 3, 4], [Text(0, 0, 'Clothing'), Text(1, 0, 'Electronics'), Text(2, 0, 'Home'), Text(3, 0, 'Groceries'), Text(4, 0, 'Books')])
plt.yticks(fontsize=20)
## (array([-2.,  0.,  2.,  4.,  6.,  8., 10.]), [Text(0, -2.0, '−2'), Text(0, 0.0, '0'), Text(0, 2.0, '2'), Text(0, 4.0, '4'), Text(0, 6.0, '6'), Text(0, 8.0, '8'), Text(0, 10.0, '10')])
plt.tight_layout()
plt.show()

1.2.4 Lollipop Chart

# ==============================
# 1. Import Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# Prepare the data
# ==============================
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Total sales per (product category, region) pair
sales_grouped = data_bisnis.groupby(['Product_Category', 'Region'], as_index=False)['Total_Price'].sum()
sales_grouped.rename(columns={'Total_Price': 'Total_Sales'}, inplace=True)

# ==============================
# Grouped lollipop chart
# ==============================
plt.figure(figsize=(18, 10))

# One colour per region, taken from the "Set1" palette
regions = sales_grouped['Region'].unique()
colors = sns.color_palette("Set1", n_colors=len(regions))

# Categories ordered from the smallest to the largest overall total
# (similar to reorder() in R); each category gets an integer y position.
category_order = sales_grouped.groupby("Product_Category")["Total_Sales"].sum().sort_values().index.tolist()
category_position = {category: position for position, category in enumerate(category_order)}

# Spread the regions vertically inside each category band so the stems do not
# hide each other. The band is 0.6 wide, leaving a gap between categories.
band_height = 0.6
if len(regions) > 1:
    offsets = [band_height * (i / (len(regions) - 1) - 0.5) for i in range(len(regions))]
else:
    offsets = [0.0]

# One set of stems and markers per region. Only the categories that a region
# actually has are drawn, so a missing (category, region) pair is simply
# skipped instead of raising a KeyError.
for i, region in enumerate(regions):
    subset = sales_grouped[sales_grouped['Region'] == region]
    y_positions = subset['Product_Category'].map(category_position) + offsets[i]
    plt.hlines(y=y_positions, xmin=0, xmax=subset['Total_Sales'],
               color=colors[i], linewidth=5, label=region)
    plt.plot(subset['Total_Sales'], y_positions, "o",
             color=colors[i], markersize=10)

# ==============================
# Styling
# ==============================
plt.xlabel("Total Sales", fontsize=30)
plt.ylabel("Product Category", fontsize=30)
plt.title("Grouped Lollipop Chart", fontsize=20, weight='bold')
plt.xticks(fontsize=20)
## (array([-200.,    0.,  200.,  400.,  600.,  800., 1000., 1200., 1400.,
##        1600.]), [Text(-200.0, 0, '−200'), Text(0.0, 0, '0'), Text(200.0, 0, '200'), Text(400.0, 0, '400'), Text(600.0, 0, '600'), Text(800.0, 0, '800'), Text(1000.0, 0, '1000'), Text(1200.0, 0, '1200'), Text(1400.0, 0, '1400'), Text(1600.0, 0, '1600')])
# Label the integer y positions with the category names
plt.yticks(range(len(category_order)), category_order, fontsize=20)
plt.legend(title="Region", fontsize=18, title_fontsize=20)
plt.tight_layout()
plt.show()

1.2.5 Heatmap

Python Code

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the data
df = pd.read_csv("data/data-dummy.csv")

# Product categories as rows, regions as columns, summed Quantity in the cells;
# combinations that do not occur are filled with 0.
pivot_qty = pd.pivot_table(
    df,
    index="Product_Category",
    columns="Region",
    values="Quantity",
    aggfunc="sum",
    fill_value=0
)

# Annotated heatmap of the pivot table
plt.figure(figsize=(12, 8))
heatmap_style = dict(annot=True, fmt=".0f", cmap="Blues", linewidths=0.5)
sns.heatmap(pivot_qty, **heatmap_style)
plt.title("Heatmap: Quantity Sold by Product Category and Region", fontsize=16, weight="bold")
plt.xlabel("Region")
plt.ylabel("Product Category")
plt.tight_layout()
plt.show()

R Code

library(ggplot2)
library(dplyr)

# Read the data
data_bisnis <- read.csv("data/data-dummy.csv", stringsAsFactors = FALSE)

# Total Quantity for every product category / region combination
heatmap_data <- summarise(
  group_by(data_bisnis, Product_Category, Region),
  Total_Quantity = sum(Quantity, na.rm = TRUE),
  .groups = "drop"
)

# Tiles coloured by total quantity, with the value printed on each tile
heatmap_tiles <- ggplot(
  heatmap_data,
  aes(x = Region, y = Product_Category, fill = Total_Quantity)
) +
  geom_tile(color = "white") +
  geom_text(aes(label = Total_Quantity), color = "white", size = 5)

heatmap_tiles +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(
    title = "Heatmap: Total Quantity by Product Category and Region",
    x = "Region",
    y = "Product Category",
    fill = "Quantity"
  ) +
  theme_minimal(base_size = 16)

1.3 Relationship

1.3.1 Scatter Plot

Python Code

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the data
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Total sales and mean discount for every product category / region pair
by_category_region = data_bisnis.groupby(['Product_Category', 'Region'], as_index=False)
scatter_data = by_category_region.agg(
    Total_Sales=('Total_Price', 'sum'),
    Avg_Discount=('Discount', 'mean')
)

# Seaborn theme with a white grid (this is a global setting)
sns.set(style="whitegrid")

# One point per category / region pair, coloured by region
plt.figure(figsize=(12, 8))
point_style = dict(palette="deep", s=150, alpha=0.85, edgecolor="black")
sns.scatterplot(
    x="Avg_Discount",
    y="Total_Sales",
    hue="Region",
    data=scatter_data,
    **point_style
)

plt.title("Scatter Plot: Average Discount vs Total Sales", fontsize=20, weight='bold')
plt.xlabel("Average Discount", fontsize=14)
plt.ylabel("Total Sales", fontsize=14)
plt.xticks(fontsize=12)
## (array([0.12 , 0.125, 0.13 , 0.135, 0.14 , 0.145, 0.15 , 0.155, 0.16 ,
##        0.165]), [Text(0.12, 0, '0.120'), Text(0.125, 0, '0.125'), Text(0.13, 0, '0.130'), Text(0.135, 0, '0.135'), Text(0.13999999999999999, 0, '0.140'), Text(0.145, 0, '0.145'), Text(0.15, 0, '0.150'), Text(0.155, 0, '0.155'), Text(0.16, 0, '0.160'), Text(0.16499999999999998, 0, '0.165')])
plt.yticks(fontsize=12)
## (array([ 200.,  400.,  600.,  800., 1000., 1200., 1400., 1600.]), [Text(0, 200.0, '200'), Text(0, 400.0, '400'), Text(0, 600.0, '600'), Text(0, 800.0, '800'), Text(0, 1000.0, '1000'), Text(0, 1200.0, '1200'), Text(0, 1400.0, '1400'), Text(0, 1600.0, '1600')])
plt.legend(title="Region", fontsize=11, title_fontsize=13)
plt.tight_layout()
plt.show()

R Code

# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)

# Load the data ----
data_bisnis <- read.csv("data/data-dummy.csv", stringsAsFactors = FALSE)

# Total sales and mean discount per product category / region pair ----
scatter_data <- summarise(
  group_by(data_bisnis, Product_Category, Region),
  Total_Sales = sum(Total_Price, na.rm = TRUE),
  Avg_Discount = mean(Discount, na.rm = TRUE),
  .groups = "drop"
)

# Scatter plot ----
# Text styling applied on top of theme_minimal(): bold centred title and
# explicit legend font sizes.
scatter_text_theme <- theme(
  plot.title = element_text(face = "bold", hjust = 0.5),
  legend.title = element_text(size = 14),
  legend.text = element_text(size = 12)
)

ggplot(
  scatter_data,
  aes(x = Avg_Discount, y = Total_Sales, color = Region)
) +
  geom_point(size = 5, alpha = 0.85) +
  labs(
    title = "Scatter Plot: Average Discount vs Total Sales",
    x = "Average Discount",
    y = "Total Sales",
    color = "Region"
  ) +
  theme_minimal(base_size = 16) +
  scatter_text_theme

1.3.2 Bubble Chart

Python Code

import pandas as pd
import matplotlib.pyplot as plt

# Read the data set
data = pd.read_csv("data/data-dummy.csv")

# Keep only the rows where both Quantity and Total_Price are positive
has_positive_values = (data['Quantity'] > 0) & (data['Total_Price'] > 0)
data_filtered = data[has_positive_values]

# Fixed colour for each region
colors = {
    'North': '#1b9e77',  # dark green
    'South': '#d95f02',  # orange
    'East': '#7570b3',   # purple
    'West': '#e7298a',   # pink
}

# Bubble chart: one scatter call per region so every region gets a legend entry
plt.figure(figsize=(10, 6))
for region in data_filtered['Region'].unique():
    subset = data_filtered[data_filtered['Region'] == region]
    bubble_sizes = subset['Discount'] * 1000  # scale the discount into a marker area
    plt.scatter(
        subset['Quantity'],
        subset['Total_Price'],
        s=bubble_sizes,
        c=colors[region],
        label=region,
        alpha=0.6
    )

# Title and axis labels
plt.title('Bubble Chart: Kuantitas vs Total Harga (Ukuran: Diskon, Warna: Wilayah)',
          fontsize=14, pad=10)
plt.xlabel('Kuantitas', fontsize=12)
plt.ylabel('Total Harga', fontsize=12)

# Legend and dashed grid
plt.legend(title='Wilayah')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()

plt.show()

R Code

# Memuat library yang diperlukan
library(ggplot2)
library(dplyr)

# Read the data set
data <- read.csv("data/data-dummy.csv")

# Keep only the rows where both Quantity and Total_Price are positive
# (rows where either value is NA are dropped by filter() as well)
data_filtered <- filter(data, Quantity > 0, Total_Price > 0)

# Fixed colour for each region
region_colors <- c(
  North = "#1b9e77", # dark green
  South = "#d95f02", # orange
  East = "#7570b3", # purple
  West = "#e7298a" # pink
)

# Bubble chart: bubble size encodes the discount, colour encodes the region
ggplot(
  data_filtered,
  aes(x = Quantity, y = Total_Price, size = Discount, color = Region)
) +
  geom_point(alpha = 0.6) + # transparency keeps overlapping bubbles visible
  scale_size_continuous(range = c(3, 15)) + # smallest and largest bubble size
  scale_color_manual(values = region_colors) +
  labs(
    title = "Bubble Chart: Kuantitas vs Total Harga (Ukuran: Diskon, Warna: Wilayah)",
    x = "Kuantitas",
    y = "Total Harga",
    size = "Diskon",
    color = "Wilayah"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 8, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.position = "right"
  )

1.3.3 Correlation Matrix

Python Code

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the data set
data = pd.read_csv("data/data-dummy.csv")

# Columns included in the correlation analysis; rows with a missing value in
# any of them are dropped. (The list is deliberately not called `vars`, which
# would shadow Python's builtin vars() for the rest of the session.)
corr_columns = ['Unit_Price', 'Delivery_Time', 'Efficiency', 'Feature_Interaction']
data_selected = data[corr_columns].dropna()

# Pearson correlation matrix
cor_pearson = data_selected.corr(method='pearson')

# Spearman correlation matrix (computed for comparison; not plotted below)
cor_spearman = data_selected.corr(method='spearman')

# Heatmap of the Pearson correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cor_pearson,
            annot=True,  # print the correlation value in each cell
            cmap='RdYlGn',  # red (negative), yellow (neutral), green (positive)
            vmin=-1, vmax=1,  # full range of a correlation coefficient
            square=True,
            fmt='.2f')
plt.title('Matriks Korelasi (Pearson)', fontsize=14, pad=10)
plt.show()

R Code

# Memuat library yang diperlukan
library(corrplot)
library(dplyr)

# Read the data set
data <- read.csv("data/data-dummy.csv")

# Columns included in the correlation analysis
vars <- c("Unit_Price", "Delivery_Time", "Efficiency", "Feature_Interaction")

# Keep only those columns, then only the rows without any NA
selected_columns <- select(data, all_of(vars))
data_selected <- selected_columns %>%
  filter(complete.cases(.))

# Pearson and Spearman correlation matrices
cor_pearson <- cor(data_selected, method = "pearson")
cor_spearman <- cor(data_selected, method = "spearman")

# Plot the Pearson matrix: coloured upper triangle, variables ordered by
# hierarchical clustering, coefficients printed in black
red_white_green <- colorRampPalette(c("#d73027", "#f7f7f7", "#1a9850"))(100)
corrplot(
  cor_pearson,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45,
  col = red_white_green,
  title = "Matriks Korelasi (Pearson)",
  mar = c(0, 0, 2, 0)
)

1.4 Time Series

1.4.1 Line Chart

Python Code

import pandas as pd
import matplotlib.pyplot as plt

# Read the data set
data = pd.read_csv("data/data-dummy.csv")

# Parse Transaction_Date and map every date to the first day of its month
data['Date'] = pd.to_datetime(data['Transaction_Date'])
month_period = data['Date'].dt.to_period('M')
data['YearMonth'] = month_period.dt.to_timestamp()

# Keep only the rows with a positive, non-missing Total_Price
positive_price = data['Total_Price'] > 0
data = data[positive_price].dropna(subset=['Total_Price'])

# Mean Total_Price for every (month, region) pair
data_agg = data.groupby(['YearMonth', 'Region'], as_index=False)['Total_Price'].mean()

# Fixed colour for each region
colors = {
    'North': '#1b9e77',  # dark green
    'South': '#d95f02',  # orange
    'East': '#7570b3',   # purple
    'West': '#e7298a',   # pink
}

# One line with round markers per region
line_style = dict(marker='o', linewidth=2, markersize=5)
plt.figure(figsize=(10, 6))
for region in data_agg['Region'].unique():
    subset = data_agg[data_agg['Region'] == region]
    plt.plot(
        subset['YearMonth'],
        subset['Total_Price'],
        color=colors[region],
        label=region,
        **line_style
    )

# Title and axis labels
plt.title('Tren Rata-rata Total Harga per Bulan berdasarkan Wilayah', fontsize=14, pad=10)
plt.xlabel('Bulan', fontsize=12)
plt.ylabel('Rata-rata Total Harga', fontsize=12)

# Dashed grid and legend
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(title='Wilayah')
plt.tight_layout()

plt.show()

R Code

# Memuat library yang diperlukan
library(ggplot2)
library(dplyr)
library(lubridate)

# Read the data set
data <- read.csv("data/data-dummy.csv")

# Parse Transaction_Date, derive the first day of each month, and keep only
# the rows with a positive, non-missing Total_Price
data <- data %>%
  mutate(
    Date = as.Date(Transaction_Date, format = "%Y-%m-%d"),
    YearMonth = floor_date(Date, "month")
  ) %>%
  filter(!is.na(Total_Price), Total_Price > 0)

# Mean Total_Price for every (month, region) pair
data_agg <- summarise(
  group_by(data, YearMonth, Region),
  Avg_Total_Price = mean(Total_Price, na.rm = TRUE),
  .groups = "drop"
)

# Fixed colour for each region
region_colors <- c(
  North = "#1b9e77", # dark green
  South = "#d95f02", # orange
  East = "#7570b3", # purple
  West = "#e7298a" # pink
)

# Line chart with a point on every observation
ggplot(data_agg, aes(x = YearMonth, y = Avg_Total_Price, color = Region)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  scale_color_manual(values = region_colors) +
  labs(
    title = "Tren Rata-rata Total Harga per Bulan berdasarkan Wilayah",
    x = "Bulan",
    y = "Rata-rata Total Harga",
    color = "Wilayah"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.position = "right"
  )

1.4.2 Area Chart

Python Code

import pandas as pd
import matplotlib.pyplot as plt

# Read the data set
data = pd.read_csv("data/data-dummy.csv")

# Parse Transaction_Date and extract the calendar year
data['Date'] = pd.to_datetime(data['Transaction_Date'])
data['Year'] = data['Date'].dt.year

# Keep only the rows with a positive, non-missing Unit_Price
data = data[(data['Unit_Price'] > 0) & (data['Unit_Price'].notna())]

# Mean Unit_Price per year (rows) and sales channel (columns); a missing
# year/channel combination is shown as 0
data_agg = data.groupby(['Year', 'Sales_Channel'])['Unit_Price'].mean().unstack().fillna(0)
data_agg = data_agg.loc[2020:2024]  # restrict to the years 2020-2024

# Fixed colour for each sales channel
colors = {'Online': '#1f77b4',  # dark blue
          'Offline': '#ff7f0e'}  # orange

# Stacked area chart
plt.figure(figsize=(10, 6))
plt.stackplot(data_agg.index,
              data_agg.T,
              labels=data_agg.columns,
              colors=[colors[cat] for cat in data_agg.columns],
              alpha=0.8)

# Put one tick on every year, labelled as a whole number; the automatic
# locator would otherwise place fractional ticks such as 2020.5 on the axis.
years = [int(year) for year in data_agg.index]
plt.xticks(years, [str(year) for year in years])

# Title and axis labels
plt.title('Rata-rata Harga per Unit per Tahun berdasarkan Saluran Penjualan', fontsize=14, pad=10)
plt.xlabel('Tahun', fontsize=12)
plt.ylabel('Rata-rata Harga per Unit', fontsize=12)

# Dashed grid and legend
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(title='Saluran Penjualan', loc='upper center')
plt.tight_layout()

# Show the figure
plt.show()

R Code

# Memuat library yang diperlukan
library(ggplot2)
library(dplyr)
library(lubridate)

# Read the data set
data <- read.csv("data/data-dummy.csv")

# Parse Transaction_Date, extract the year, and keep only the rows with a
# positive, non-missing Unit_Price
data <- data %>%
  mutate(
    Date = as.Date(Transaction_Date, format = "%Y-%m-%d"),
    Year = year(Date)
  ) %>%
  filter(!is.na(Unit_Price), Unit_Price > 0)

# Mean Unit_Price per year and sales channel, restricted to 2020-2024
yearly_channel_mean <- summarise(
  group_by(data, Year, Sales_Channel),
  Avg_Unit_Price = mean(Unit_Price, na.rm = TRUE),
  .groups = "drop"
)
data_agg <- filter(yearly_channel_mean, between(Year, 2020, 2024))

# Fixed colour for each sales channel
channel_colors <- c(
  Online = "#1f77b4", # dark blue
  Offline = "#ff7f0e" # orange
)

# Stacked, semi-transparent area chart
ggplot(data_agg, aes(x = Year, y = Avg_Unit_Price, fill = Sales_Channel)) +
  geom_area(position = "stack", alpha = 0.8) +
  scale_fill_manual(values = channel_colors) +
  labs(
    title = "Rata-rata Harga per Unit per Tahun berdasarkan Saluran Penjualan",
    x = "Tahun",
    y = "Rata-rata Harga per Unit",
    fill = "Saluran Penjualan"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.position = "top"
  )