Bab 1 VISUALISASI DESKRIPTIF

## C:\Users\HP\AppData\Local\Programs\Python\PYTHON~1\python.exe

library(DT)

# Load the dummy dataset and render it as an interactive table that shows
# 10 rows per page.
data <- read.csv(file = "data/data-dummy.csv")
table_options <- list(pageLength = 10)
datatable(data, options = table_options)

1.1 Numerical Data

1.1.1 Python

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
data = pd.read_csv("data/data-dummy.csv")

# Work on the 'Quantity' column with missing values removed
quantity = data['Quantity'].dropna()

# Summary statistics; outliers follow the 1.5 * IQR rule
Q1 = quantity.quantile(0.25)
Q3 = quantity.quantile(0.75)
IQR = Q3 - Q1
mean = quantity.mean()
min_val = quantity.min()
max_val = quantity.max()
outliers = quantity[(quantity < Q1 - 1.5 * IQR) | (quantity > Q3 + 1.5 * IQR)]

# Plot: box without fliers, every observation as a jittered black dot, and the
# outliers drawn once more in red on top
plt.figure(figsize=(10, 6))
sns.boxplot(y=quantity, color="skyblue", width=0.4, fliersize=0)
sns.stripplot(y=quantity, color='black', size=3, alpha=0.5, jitter=True)
if not outliers.empty:
    sns.stripplot(y=outliers, color='red', size=5, jitter=True)

# Horizontal dashed line at the mean
plt.axhline(mean, color='blue', linestyle='--', linewidth=1)

# Annotations
# seaborn draws a single box at x = 0 (the axis spans -0.5..0.5), so the labels
# are placed just right of the box edge (width 0.4 -> edge at 0.2). The former
# x = 1.2 lies outside the axes.
LABEL_X = 0.25


def annotate_text(y, text, color, bold=False, italic=False, x=LABEL_X):
    """Write `text` left-aligned at data coordinates (`x`, `y`).

    `bold` / `italic` switch the font weight / style; `x` defaults to the
    label column right of the box.
    """
    plt.text(
        x, y, text,
        fontsize=10,
        color=color,
        ha='left',
        fontstyle='italic' if italic else 'normal',
        fontweight='bold' if bold else 'normal'
    )


annotate_text(Q1, f"Q1: {Q1:.1f}", 'green')
annotate_text(Q3, f"Q3: {Q3:.1f}", 'green')
annotate_text(mean, f"Mean: {mean:.2f}", 'blue', bold=True)
annotate_text(min_val, f"Min: {min_val}", 'orange')
annotate_text(max_val, f"Max: {max_val}", 'orange')

# Outlier count goes centred above the data. Anchoring it at `outliers.max()`
# gave NaN when there were no outliers and otherwise collided with the "Max"
# label, because the largest outlier usually is the maximum.
plt.annotate(
    f"Outliers: {len(outliers)}",
    xy=(0, max_val),
    xytext=(0, 8),
    textcoords='offset points',
    ha='center',
    va='bottom',
    fontsize=10,
    color='red',
    fontweight='bold'
)

# Styling
plt.title("Boxplot of Quantity with Jitter", fontsize=16, weight='bold')
plt.ylabel("Quantity")
plt.xlim(-0.5, 0.9)  # extra room on the right for the labels
plt.xticks([])  # hide the x-axis: there is only one variable

## ([], [])

plt.tight_layout()
plt.show()

1.1.2 Violin Plot

# ==============================
# 1. Import Library
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Load and Prepare Data
# ==============================
# Replace with the path to your CSV
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Make sure Quantity is numeric; rows that fail to parse are dropped
data_bisnis["Quantity"] = pd.to_numeric(data_bisnis["Quantity"], errors="coerce")
data_bisnis = data_bisnis.dropna(subset=["Quantity"])

# Statistics for the annotations (outliers follow the 1.5 * IQR rule)
qty_col = data_bisnis["Quantity"]
Q1 = qty_col.quantile(0.25)
Q3 = qty_col.quantile(0.75)
IQR = Q3 - Q1
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR
mean_val = qty_col.mean()
median_val = qty_col.median()
min_val = qty_col.min()
max_val = qty_col.max()

# Flag outliers with one vectorised mask instead of a row-wise apply
outlier_mask = (qty_col > upper_whisker) | (qty_col < lower_whisker)
outliers_count = outlier_mask.sum()
data_bisnis["is_outlier"] = outlier_mask.map({True: "Outlier", False: "Normal"})

# ==============================
# 3. Plot Violin + Box + Jitter + Annotations
# ==============================
# All three layers share one constant "dummy" category, built once
plot_data = data_bisnis.assign(dummy=1)

plt.figure(figsize=(16, 10))
sns.violinplot(
    x="dummy", y="Quantity", data=plot_data,
    inner=None, color="skyblue"
)
sns.boxplot(
    x="dummy", y="Quantity", data=plot_data,
    width=0.2, showcaps=True, boxprops={'facecolor': 'none'},
    showfliers=False
)
sns.stripplot(
    x="dummy", y="Quantity", data=plot_data,
    hue="is_outlier", palette={"Normal": "black", "Outlier": "red"},
    jitter=0.15, size=4, alpha=0.7, dodge=False
)

# Statistic labels.
# seaborn draws the single category at x = 0 (the axis spans -0.5..0.5), so the
# labels go just right of the violin (default width 0.8 -> edge at 0.4). The
# former x = 1.1 / x = 1 positions lie outside the axes.
LABEL_X = 0.45
plt.text(LABEL_X, mean_val, f"Mean: {mean_val:.2f}", color="blue", fontsize=14, fontweight="bold")
plt.text(LABEL_X, Q1, f"Q1: {Q1:.2f}", color="green", fontsize=13)
plt.text(LABEL_X, median_val, f"Median: {median_val:.2f}", color="purple", fontsize=13)
plt.text(LABEL_X, Q3, f"Q3: {Q3:.2f}", color="green", fontsize=13)
plt.text(LABEL_X, min_val, f"Min: {min_val:.2f}", color="orange", fontsize=13)
plt.text(LABEL_X, max_val, f"Max: {max_val:.2f}", color="orange", fontsize=13)
# Outlier count centred on the violin, slightly above the largest value
plt.text(0, max_val + 0.3, f"Outliers: {outliers_count}", color="red", fontsize=14, style="italic", ha="center")

# Title and appearance
plt.title("Violin Plot of Quantity with Outlier Highlighted", fontsize=24, weight="bold")
plt.ylabel("Quantity", fontsize=18)
plt.xlim(-0.6, 1.0)  # extra room on the right for the labels
plt.xticks([], [])  # hide the x-axis: it only carries the dummy category

## ([], [])

plt.yticks(fontsize=14)

## (array([-2.,  0.,  2.,  4.,  6.,  8., 10., 12.]), [Text(0, -2.0, '−2'), Text(0, 0.0, '0'), Text(0, 2.0, '2'), Text(0, 4.0, '4'), Text(0, 6.0, '6'), Text(0, 8.0, '8'), Text(0, 10.0, '10'), Text(0, 12.0, '12')])

# Legend on the left so it cannot cover the statistic labels on the right
plt.legend(title="Point Type", title_fontsize=15, fontsize=13, loc="upper left")
sns.despine()
plt.tight_layout()
plt.show()

1.2 Combo

1.2.1 Grouped Bar Chart

# ==============================
# 1. Import Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Load Data
# ==============================
data_bisnis = pd.read_csv("data/data-dummy.csv")

# ==============================
# 3. Data Summarization
# ==============================
# Total revenue for every (Product_Category, Region) pair
group_keys = ['Product_Category', 'Region']
sales_summary = (
    data_bisnis
    .groupby(group_keys)['Total_Price']
    .sum()
    .rename('Total_Sales')
    .reset_index()
)

# ==============================
# 4. Plot Grouped Bar Chart
# ==============================
# One cluster of bars per product category, one bar per region inside it
bar_mapping = dict(x="Product_Category", y="Total_Sales", hue="Region")

plt.figure(figsize=(12, 6))
sns.barplot(data=sales_summary, dodge=True, **bar_mapping)

# Titles and axis labels
plt.title("Total Sales by Product Category and Region", fontsize=16, fontweight="bold")
plt.ylabel("Total Sales (USD)", fontsize=14)
plt.xlabel("Product Category", fontsize=14)

# Slant the category names so long labels do not collide
plt.xticks(rotation=45, ha='right')

## ([0, 1, 2, 3, 4], [Text(0, 0, 'Books'), Text(1, 0, 'Clothing'), Text(2, 0, 'Electronics'), Text(3, 0, 'Groceries'), Text(4, 0, 'Home')])

plt.legend(title="Region", title_fontsize=12)
sns.despine()
plt.tight_layout()
plt.show()

1.2.2 Ridgeline Plot

# ==============================
# 1. Load Libraries
# ==============================
import numpy as np  # needed for np.isfinite below; was not imported in this chunk
import pandas as pd
import matplotlib.pyplot as plt
from joypy import joyplot
import seaborn as sns
import matplotlib.ticker as ticker

# ==============================
# 2. Filter Valid Data
# ==============================
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Keep only rows whose Price_per_Unit is a finite number (not NaN, Inf, -Inf).
# The column is coerced to numeric once and the finiteness test runs on the
# coerced values, so text entries become NaN and are dropped instead of making
# np.isfinite fail on a non-numeric column.
price_per_unit = pd.to_numeric(data_bisnis['Price_per_Unit'], errors='coerce')
data_bisnis_filtered = data_bisnis.assign(Price_per_Unit=price_per_unit)
data_bisnis_filtered = data_bisnis_filtered[np.isfinite(price_per_unit)]

# ==============================
# 3. Create Ridgeline Plot
# ==============================
# joyplot builds its own figure from `figsize`; no plt.figure() call is made
# beforehand because that would only leave a stray empty figure behind.
joyplot(
    data_bisnis_filtered,
    by="Region",
    column="Price_per_Unit",
    figsize=(16, 10),
    fade=True,
    alpha=0.7,
    colormap=plt.cm.tab10,
    linewidth=1
)

plt.title("Distribution of Price per Unit by Region", fontsize=24, weight="bold")
plt.xlabel("Price per Unit", fontsize=18)
plt.ylabel("Region", fontsize=18)


# Rupiah tick format (e.g. Rp1.000)
def rupiah_format(x, pos):
    """Format tick value `x` as whole Rupiah with '.' as thousands separator.

    `pos` is the tick position supplied by matplotlib's FuncFormatter and is
    not used.
    """
    return f"Rp{int(x):,}".replace(",", ".")


plt.gca().xaxis.set_major_formatter(ticker.FuncFormatter(rupiah_format))
plt.xticks(fontsize=14)

## (array([-5.,  0.,  5., 10., 15., 20., 25., 30.]), [Text(-5.0, 0, 'Rp-5'), Text(0.0, 0, 'Rp0'), Text(5.0, 0, 'Rp5'), Text(10.0, 0, 'Rp10'), Text(15.0, 0, 'Rp15'), Text(20.0, 0, 'Rp20'), Text(25.0, 0, 'Rp25'), Text(30.0, 0, 'Rp30')])

plt.yticks(fontsize=14)

## (array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]), [Text(0, 0.0, '0.0'), Text(0, 0.2, '0.2'), Text(0, 0.4, '0.4'), Text(0, 0.6000000000000001, '0.6'), Text(0, 0.8, '0.8'), Text(0, 1.0, '1.0')])

#plt.tight_layout()
plt.show()

1.2.3 Boxplot by Category

# ==============================
# 1. Import Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Prepare Data
# ==============================
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Coerce Quantity to numbers and discard the rows where that yields NaN
numeric_quantity = pd.to_numeric(data_bisnis['Quantity'], errors='coerce')
data_bisnis['Quantity'] = numeric_quantity
data_bisnis = data_bisnis.dropna(subset=['Quantity'])

# ==============================
# 3. Create Boxplot
# ==============================
# Large canvas so the big fonts below stay readable
plt.figure(figsize=(18, 10))

# Outliers are shown as red circles
red_circle_fliers = dict(marker='o', markerfacecolor='red', markersize=8)

sns.boxplot(
    x="Product_Category",
    y="Quantity",
    data=data_bisnis,
    palette="Set2",
    showfliers=True,
    fliersize=6,
    flierprops=red_circle_fliers
)

# Title and axis labels
plt.title("Boxplot of Quantity by Product Category", fontsize=30, weight='bold')
plt.ylabel("Quantity", fontsize=25)
plt.xlabel("Product Category", fontsize=25)

# Tick label sizes
plt.xticks(fontsize=20, rotation=0)

## ([0, 1, 2, 3, 4], [Text(0, 0, 'Clothing'), Text(1, 0, 'Electronics'), Text(2, 0, 'Home'), Text(3, 0, 'Groceries'), Text(4, 0, 'Books')])

plt.yticks(fontsize=20)

## (array([-2.,  0.,  2.,  4.,  6.,  8., 10.]), [Text(0, -2.0, '−2'), Text(0, 0.0, '0'), Text(0, 2.0, '2'), Text(0, 4.0, '4'), Text(0, 6.0, '6'), Text(0, 8.0, '8'), Text(0, 10.0, '10')])

plt.tight_layout()
plt.show()

1.2.4 Lollipop Chart

# ==============================
# 1. Import Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Prepare Data
# ==============================
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Total sales per Product_Category and Region
sales_grouped = data_bisnis.groupby(['Product_Category', 'Region'], as_index=False)['Total_Price'].sum()
sales_grouped.rename(columns={'Total_Price': 'Total_Sales'}, inplace=True)

# ==============================
# 3. Grouped Lollipop Chart
# ==============================
plt.figure(figsize=(18, 10))

# One colour per region, taken from the "Set1" palette
colors = sns.color_palette("Set1", n_colors=sales_grouped['Region'].nunique())

# Order the categories by their overall total, smallest first
# (the counterpart of reorder() in R)
category_order = sales_grouped.groupby("Product_Category")["Total_Sales"].sum().sort_values().index.tolist()

# Draw one set of stems and markers per region
for i, region in enumerate(sales_grouped['Region'].unique()):
    subset = sales_grouped[sales_grouped['Region'] == region]
    # Align every region to the same category order. reindex (instead of .loc)
    # tolerates a region that has no rows for some category: that category is
    # kept with Total_Sales = 0 rather than raising KeyError.
    subset = (
        subset
        .set_index("Product_Category")
        .reindex(category_order)
        .rename_axis("Product_Category")
        .fillna({"Total_Sales": 0})
        .reset_index()
    )
    plt.hlines(y=subset['Product_Category'], xmin=0, xmax=subset['Total_Sales'],
               color=colors[i], linewidth=5, label=region)
    plt.plot(subset['Total_Sales'], subset['Product_Category'], "o",
             color=colors[i], markersize=10)

# ==============================
# 4. Styling
# ==============================
plt.xlabel("Total Sales", fontsize=30)
plt.ylabel("Product Category", fontsize=30)
plt.title("Grouped Lollipop Chart", fontsize=20, weight='bold')
plt.xticks(fontsize=20)

## (array([-200.,    0.,  200.,  400.,  600.,  800., 1000., 1200., 1400.,
##        1600.]), [Text(-200.0, 0, '−200'), Text(0.0, 0, '0'), Text(200.0, 0, '200'), Text(400.0, 0, '400'), Text(600.0, 0, '600'), Text(800.0, 0, '800'), Text(1000.0, 0, '1000'), Text(1200.0, 0, '1200'), Text(1400.0, 0, '1400'), Text(1600.0, 0, '1600')])

plt.yticks(fontsize=20)

## ([0, 1, 2, 3, 4], [Text(0, 0, 'Electronics'), Text(0, 1, 'Groceries'), Text(0, 2, 'Books'), Text(0, 3, 'Home'), Text(0, 4, 'Clothing')])

plt.legend(title="Region", fontsize=18, title_fontsize=20)
plt.tight_layout()
plt.show()

1.2.5 Heatmap

Python Code

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the dataset
df = pd.read_csv("data/data-dummy.csv")

# Cross-tabulate summed Quantity: product categories as rows, regions as
# columns, combinations without data shown as 0
pivot_qty = df.pivot_table(
    index="Product_Category",
    columns="Region",
    values="Quantity",
    aggfunc="sum",
    fill_value=0
)

# Draw the annotated heatmap on a fresh figure
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(
    pivot_qty,
    annot=True,
    fmt=".0f",
    cmap="Blues",
    linewidths=0.5
)
heatmap_ax.set_title("Heatmap: Quantity Sold by Product Category and Region", fontsize=16, weight="bold")
heatmap_ax.set_xlabel("Region")
heatmap_ax.set_ylabel("Product Category")
plt.tight_layout()
plt.show()

R Code

library(ggplot2)
library(dplyr)

# Read the dataset
data_bisnis <- read.csv("data/data-dummy.csv", stringsAsFactors = FALSE)

# Total Quantity for every product category / region pair (NAs ignored)
grouped_sales <- group_by(data_bisnis, Product_Category, Region)
heatmap_data <- summarise(
  grouped_sales,
  Total_Quantity = sum(Quantity, na.rm = TRUE),
  .groups = "drop"
)

# Build the heatmap: white-bordered tiles, the total printed on each tile,
# and a light-to-dark blue fill scale
heatmap_labels <- labs(
  title = "Heatmap: Total Quantity by Product Category and Region",
  x = "Region",
  y = "Product Category",
  fill = "Quantity"
)

heatmap_plot <- ggplot(
  heatmap_data,
  aes(x = Region, y = Product_Category, fill = Total_Quantity)
) +
  geom_tile(color = "white") +
  geom_text(aes(label = Total_Quantity), color = "white", size = 5) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  heatmap_labels +
  theme_minimal(base_size = 16)

heatmap_plot

1.3 Relationship

1.3.1 Scatter Plot

Python Code

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Per (Product_Category, Region): summed Total_Price and mean Discount,
# named directly through named aggregation
scatter_data = (
    data_bisnis
    .groupby(['Product_Category', 'Region'], as_index=False)
    .agg(
        Total_Sales=('Total_Price', 'sum'),
        Avg_Discount=('Discount', 'mean')
    )
)

# Visual theme
sns.set(style="whitegrid")

# Scatter plot: one point per category/region pair, coloured by region
point_style = dict(palette="deep", s=150, alpha=0.85, edgecolor="black")

plt.figure(figsize=(12, 8))
sns.scatterplot(
    x="Avg_Discount",
    y="Total_Sales",
    hue="Region",
    data=scatter_data,
    **point_style
)

plt.title("Scatter Plot: Average Discount vs Total Sales", fontsize=20, weight='bold')
plt.ylabel("Total Sales", fontsize=14)
plt.xlabel("Average Discount", fontsize=14)
plt.xticks(fontsize=12)

## (array([0.12 , 0.125, 0.13 , 0.135, 0.14 , 0.145, 0.15 , 0.155, 0.16 ,
##        0.165]), [Text(0.12, 0, '0.120'), Text(0.125, 0, '0.125'), Text(0.13, 0, '0.130'), Text(0.135, 0, '0.135'), Text(0.13999999999999999, 0, '0.140'), Text(0.145, 0, '0.145'), Text(0.15, 0, '0.150'), Text(0.155, 0, '0.155'), Text(0.16, 0, '0.160'), Text(0.16499999999999998, 0, '0.165')])

plt.yticks(fontsize=12)

## (array([ 200.,  400.,  600.,  800., 1000., 1200., 1400., 1600.]), [Text(0, 200.0, '200'), Text(0, 400.0, '400'), Text(0, 600.0, '600'), Text(0, 800.0, '800'), Text(0, 1000.0, '1000'), Text(0, 1200.0, '1200'), Text(0, 1400.0, '1400'), Text(0, 1600.0, '1600')])

plt.legend(title="Region", fontsize=11, title_fontsize=13)
plt.tight_layout()
plt.show()

R Code

# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Load & Summarize Data
# ==============================
data_bisnis <- read.csv("data/data-dummy.csv", stringsAsFactors = FALSE)

# Per category/region pair: summed Total_Price and mean Discount (NAs ignored)
grouped_sales <- group_by(data_bisnis, Product_Category, Region)
scatter_data <- summarise(
  grouped_sales,
  Total_Sales = sum(Total_Price, na.rm = TRUE),
  Avg_Discount = mean(Discount, na.rm = TRUE),
  .groups = "drop"
)

# ==============================
# 3. Create Scatter Plot
# ==============================
# Text tweaks applied on top of theme_minimal: bold centred title, smaller
# legend fonts
scatter_text_theme <- theme(
  plot.title = element_text(face = "bold", hjust = 0.5),
  legend.title = element_text(size = 14),
  legend.text = element_text(size = 12)
)

scatter_plot <- ggplot(
  scatter_data,
  aes(x = Avg_Discount, y = Total_Sales, color = Region)
) +
  geom_point(size = 5, alpha = 0.85) +
  labs(
    title = "Scatter Plot: Average Discount vs Total Sales",
    x = "Average Discount",
    y = "Total Sales",
    color = "Region"
  ) +
  theme_minimal(base_size = 16) +
  scatter_text_theme

scatter_plot

1.3.2 Bubble Chart

Python Code

import pandas as pd
import matplotlib.pyplot as plt

# Read the dataset
data = pd.read_csv("data/data-dummy.csv")

# Keep only rows where both Quantity and Total_Price are positive
has_quantity = data['Quantity'] > 0
has_total_price = data['Total_Price'] > 0
data_filtered = data[has_quantity & has_total_price]

# Colour assigned to each Region
colors = {
    'North': '#1b9e77',  # dark green
    'South': '#d95f02',  # orange
    'East': '#7570b3',   # purple
    'West': '#e7298a',   # pink
}

# Bubble chart: one scatter layer per region so each gets its own legend entry
plt.figure(figsize=(10, 6))
for region in data_filtered['Region'].unique():
    subset = data_filtered.loc[data_filtered['Region'] == region]
    bubble_area = subset['Discount'] * 1000  # scale the discount up to a visible marker size
    plt.scatter(
        subset['Quantity'],
        subset['Total_Price'],
        s=bubble_area,
        c=colors[region],
        label=region,
        alpha=0.6
    )

# Title and axis labels
plt.title(
    'Bubble Chart: Kuantitas vs Total Harga (Ukuran: Diskon, Warna: Wilayah)',
    fontsize=14,
    pad=10
)
plt.xlabel('Kuantitas', fontsize=12)
plt.ylabel('Total Harga', fontsize=12)

# Legend and dashed grid
plt.legend(title='Wilayah')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()

plt.show()

R Code

# Memuat library yang diperlukan
library(ggplot2)
library(dplyr)

# Read the dataset
data <- read.csv("data/data-dummy.csv")

# Keep only rows where both Quantity and Total_Price are positive
# (rows where either comparison is NA are dropped by filter as well)
data_filtered <- filter(data, Quantity > 0, Total_Price > 0)

# Colour assigned to each Region
region_colors <- c(
  "North" = "#1b9e77", # dark green
  "South" = "#d95f02", # orange
  "East" = "#7570b3",  # purple
  "West" = "#e7298a"   # pink
)

# Text and legend settings applied on top of theme_minimal
bubble_theme <- theme(
  plot.title = element_text(hjust = 0.5, size = 8, face = "bold"),
  axis.title = element_text(size = 12),
  axis.text = element_text(size = 10),
  legend.position = "right"
)

# Bubble chart: bubble size encodes Discount, colour encodes Region
bubble_plot <- ggplot(
  data_filtered,
  aes(x = Quantity, y = Total_Price, size = Discount, color = Region)
) +
  geom_point(alpha = 0.6) +                 # transparency to cope with overlap
  scale_size_continuous(range = c(3, 15)) + # range of bubble sizes
  scale_color_manual(values = region_colors) +
  labs(
    title = "Bubble Chart: Kuantitas vs Total Harga (Ukuran: Diskon, Warna: Wilayah)",
    x = "Kuantitas",
    y = "Total Harga",
    size = "Diskon",
    color = "Wilayah"
  ) +
  theme_minimal() +
  bubble_theme

bubble_plot

1.3.3 Correlation Matrix

Python Code

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the dataset
data = pd.read_csv("data/data-dummy.csv")

# Variables to correlate. The list is named `corr_vars` because the previous
# name, `vars`, shadowed Python's built-in vars() for the rest of the session.
corr_vars = ['Unit_Price', 'Delivery_Time', 'Efficiency', 'Feature_Interaction']
# Only rows where all selected variables are present take part
data_selected = data[corr_vars].dropna()

# Pearson correlation matrix
cor_pearson = data_selected.corr(method='pearson')

# Spearman correlation matrix (computed for reference; not plotted below)
cor_spearman = data_selected.corr(method='spearman')

# Visualise the Pearson correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cor_pearson,
            annot=True,  # print the correlation value in each cell
            cmap='RdYlGn',  # red (negative), yellow (neutral), green (positive)
            vmin=-1, vmax=1,  # full range of a correlation coefficient
            square=True,
            fmt='.2f')
plt.title('Matriks Korelasi (Pearson)', fontsize=14, pad=10)
plt.show()

R Code

# Memuat library yang diperlukan
library(corrplot)
library(dplyr)

# Read the dataset
data <- read.csv("data/data-dummy.csv")

# Variables to correlate; keep only the rows with no NA in any of them
vars <- c("Unit_Price", "Delivery_Time", "Efficiency", "Feature_Interaction")
selected_columns <- select(data, all_of(vars))
data_selected <- filter(selected_columns, complete.cases(selected_columns))

# Pearson correlation matrix
cor_pearson <- cor(data_selected, method = "pearson")

# Spearman correlation matrix
cor_spearman <- cor(data_selected, method = "spearman")

# Red -> near-white -> green palette with 100 steps
correlation_palette <- colorRampPalette(c("#d73027", "#f7f7f7", "#1a9850"))(100)

# Visualise the Pearson matrix: upper triangle only, variables ordered by
# hierarchical clustering, coefficients printed in black
corrplot(
  cor_pearson,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45,
  col = correlation_palette,
  title = "Matriks Korelasi (Pearson)",
  mar = c(0, 0, 2, 0)
)

1.4 Time Series

1.4.1 Line Chart

Python Code

import pandas as pd
import matplotlib.pyplot as plt

# Read the dataset
data = pd.read_csv("data/data-dummy.csv")

# Parse Transaction_Date and snap every date to the first day of its month
data['Date'] = pd.to_datetime(data['Transaction_Date'])
month_period = data['Date'].dt.to_period('M')
data['YearMonth'] = month_period.dt.to_timestamp()

# Keep rows with a positive, non-missing Total_Price
has_positive_price = data['Total_Price'] > 0
data = data[has_positive_price].dropna(subset=['Total_Price'])

# Mean Total_Price for every (YearMonth, Region) pair
data_agg = (
    data
    .groupby(['YearMonth', 'Region'])['Total_Price']
    .mean()
    .reset_index()
)

# Colour assigned to each Region
colors = {
    'North': '#1b9e77',  # dark green
    'South': '#d95f02',  # orange
    'East': '#7570b3',   # purple
    'West': '#e7298a',   # pink
}

# Line chart: one line with round markers per region
marker_line = dict(marker='o', linewidth=2, markersize=5)

plt.figure(figsize=(10, 6))
for region in data_agg['Region'].unique():
    subset = data_agg.loc[data_agg['Region'] == region]
    plt.plot(
        subset['YearMonth'],
        subset['Total_Price'],
        color=colors[region],
        label=region,
        **marker_line
    )

# Title and axis labels
plt.title('Tren Rata-rata Total Harga per Bulan berdasarkan Wilayah', fontsize=14, pad=10)
plt.xlabel('Bulan', fontsize=12)
plt.ylabel('Rata-rata Total Harga', fontsize=12)

# Dashed grid and legend
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(title='Wilayah')
plt.tight_layout()

plt.show()

R Code

# Memuat library yang diperlukan
library(ggplot2)
library(dplyr)
library(lubridate)

# Read the dataset
data <- read.csv("data/data-dummy.csv")

# Parse Transaction_Date (expected as YYYY-MM-DD), derive the first day of the
# month, and keep only rows with a positive, non-missing Total_Price
data <- data %>%
  mutate(Date = as.Date(Transaction_Date, format = "%Y-%m-%d"),
         YearMonth = floor_date(Date, "month")) %>%
  filter(!is.na(Total_Price) & Total_Price > 0)

# Mean Total_Price for every (YearMonth, Region) pair
data_agg <- data %>%
  group_by(YearMonth, Region) %>%
  summarise(Avg_Total_Price = mean(Total_Price, na.rm = TRUE), .groups = "drop")

# Line chart: one line per region with a point on every observation
ggplot(data_agg, aes(x = YearMonth, y = Avg_Total_Price, color = Region)) +
  # `linewidth` sets line thickness; `size` for lines is deprecated since
  # ggplot2 3.4.0 and triggers a warning
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_color_manual(values = c("North" = "#1b9e77",  # dark green
                                "South" = "#d95f02",  # orange
                                "East" = "#7570b3",   # purple
                                "West" = "#e7298a")) + # pink
  labs(title = "Tren Rata-rata Total Harga per Bulan berdasarkan Wilayah",
       x = "Bulan",
       y = "Rata-rata Total Harga",
       color = "Wilayah") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.position = "right"
  )

1.4.2 Area Chart

Python Code

import pandas as pd
import matplotlib.pyplot as plt

# Read the dataset
data = pd.read_csv("data/data-dummy.csv")

# Parse Transaction_Date and pull out the calendar year
data['Date'] = pd.to_datetime(data['Transaction_Date'])
data['Year'] = data['Date'].dt.year

# Keep rows with a positive, non-missing Unit_Price
price_is_valid = (data['Unit_Price'] > 0) & (data['Unit_Price'].notna())
data = data[price_is_valid]

# Mean Unit_Price per year (rows) and sales channel (columns); combinations
# without data become 0
yearly_channel_mean = data.groupby(['Year', 'Sales_Channel'])['Unit_Price'].mean()
data_agg = yearly_channel_mean.unstack().fillna(0)
data_agg = data_agg.loc[2020:2024]  # restrict to the years 2020-2024

# Colour assigned to each Sales_Channel
colors = {
    'Online': '#1f77b4',   # dark blue
    'Offline': '#ff7f0e',  # orange
}
channel_colors = [colors[cat] for cat in data_agg.columns]

# Stacked area chart: one band per sales channel
plt.figure(figsize=(10, 6))
plt.stackplot(
    data_agg.index,
    data_agg.T,
    labels=data_agg.columns,
    colors=channel_colors,
    alpha=0.8
)

# Title and axis labels
plt.title('Rata-rata Harga per Unit per Tahun berdasarkan Saluran Penjualan', fontsize=14, pad=10)
plt.xlabel('Tahun', fontsize=12)
plt.ylabel('Rata-rata Harga per Unit', fontsize=12)

# Dashed grid and legend
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(title='Saluran Penjualan', loc='upper center')
plt.tight_layout()

# Show the figure
plt.show()

R Code

# Memuat library yang diperlukan
library(ggplot2)
library(dplyr)
library(lubridate)

# Read the dataset
data <- read.csv("data/data-dummy.csv")

# Parse Transaction_Date (expected as YYYY-MM-DD), extract the year, and keep
# only rows with a positive, non-missing Unit_Price
data <- mutate(
  data,
  Date = as.Date(Transaction_Date, format = "%Y-%m-%d"),
  Year = year(Date)
)
data <- filter(data, !is.na(Unit_Price) & Unit_Price > 0)

# Mean Unit_Price per Year and Sales_Channel, restricted to 2020-2024
yearly_channel_mean <- summarise(
  group_by(data, Year, Sales_Channel),
  Avg_Unit_Price = mean(Unit_Price, na.rm = TRUE),
  .groups = "drop"
)
data_agg <- filter(yearly_channel_mean, Year >= 2020 & Year <= 2024)

# Colour assigned to each Sales_Channel
channel_colors <- c(
  "Online" = "#1f77b4", # dark blue
  "Offline" = "#ff7f0e" # orange
)

# Text and legend settings applied on top of theme_minimal
area_theme <- theme(
  plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
  axis.title = element_text(size = 12),
  axis.text = element_text(size = 10),
  legend.position = "top"
)

# Stacked, semi-transparent area chart: one band per sales channel
area_plot <- ggplot(
  data_agg,
  aes(x = Year, y = Avg_Unit_Price, fill = Sales_Channel)
) +
  geom_area(position = "stack", alpha = 0.8) +
  scale_fill_manual(values = channel_colors) +
  labs(
    title = "Rata-rata Harga per Unit per Tahun berdasarkan Saluran Penjualan",
    x = "Tahun",
    y = "Rata-rata Harga per Unit",
    fill = "Saluran Penjualan"
  ) +
  theme_minimal() +
  area_theme

area_plot

Data Science Programming

Data Science Programming

Bab 1 VISUALISASI DESKRIPTIF

1.1 Numerical Data

1.1.1 Python

1.1.2 Violin Plot

1.2 Combo

1.2.1 Grouped Bar Chart

1.2.2 Ridgeline Plot

1.2.3 Boxplot by Category

1.2.4 Lollipop Chart

1.2.5 Heatmap

1.3 Relationship

1.3.1 Scatter Plot

1.3.2 Bubble Chart

1.3.3 Correlation Matrix

1.4 Time Series

1.4.1 Line Chart

1.4.2 Area Chart